Merge commit 'linux-2.6/master' (early part) into oprofile/core

author Robert Richter <robert.richter@amd.com>

Mon, 25 Oct 2010 14:28:14 +0000 (16:28 +0200)

committer Robert Richter <robert.richter@amd.com>

Mon, 25 Oct 2010 14:29:12 +0000 (16:29 +0200)
author Robert Richter <robert.richter@amd.com>
Mon, 25 Oct 2010 14:28:14 +0000 (16:28 +0200)
committer Robert Richter <robert.richter@amd.com>
Mon, 25 Oct 2010 14:29:12 +0000 (16:29 +0200)
diff --git a/Documentation/DocBook/genericirq.tmpl b/Documentation/DocBook/genericirq.tmpl

index 1448b33..fb10fd0 100644 (file)
--- a/Documentation/DocBook/genericirq.tmpl
+++ b/Documentation/DocBook/genericirq.tmpl
@@ -28,7 +28,7 @@
    </authorgroup>
  
    <copyright>
-   <year>2005-2006</year>
+   <year>2005-2010</year>
     <holder>Thomas Gleixner</holder>
    </copyright>
    <copyright>
@@ -100,6 +100,10 @@
           <listitem><para>Edge type</para></listitem>
           <listitem><para>Simple type</para></listitem>
         </itemizedlist>
+       During the implementation we identified another type:
+       <itemizedlist>
+         <listitem><para>Fast EOI type</para></listitem>
+       </itemizedlist>
         In the SMP world of the __do_IRQ() super-handler another type
         was identified:
         <itemizedlist>
@@ -153,6 +157,7 @@
         is still available. This leads to a kind of duality for the time
         being. Over time the new model should be used in more and more
         architectures, as it enables smaller and cleaner IRQ subsystems.
+       It has been deprecated for three years now and is about to be removed.
         </para>
    </chapter>
    <chapter id="bugs">
@@ -217,6 +222,7 @@
           <itemizedlist>
           <listitem><para>handle_level_irq</para></listitem>
           <listitem><para>handle_edge_irq</para></listitem>
+         <listitem><para>handle_fasteoi_irq</para></listitem>
           <listitem><para>handle_simple_irq</para></listitem>
           <listitem><para>handle_percpu_irq</para></listitem>
           </itemizedlist>
@@ -233,33 +239,33 @@
                 are used by the default flow implementations.
                 The following helper functions are implemented (simplified excerpt):
                 <programlisting>
-default_enable(irq)
+default_enable(struct irq_data *data)
  {
-       desc->chip->unmask(irq);
+       desc->chip->irq_unmask(data);
  }
  
-default_disable(irq)
+default_disable(struct irq_data *data)
  {
-       if (!delay_disable(irq))
-               desc->chip->mask(irq);
+       if (!delay_disable(data))
+               desc->chip->irq_mask(data);
  }
  
-default_ack(irq)
+default_ack(struct irq_data *data)
  {
-       chip->ack(irq);
+       chip->irq_ack(data);
  }
  
-default_mask_ack(irq)
+default_mask_ack(struct irq_data *data)
  {
-       if (chip->mask_ack) {
-               chip->mask_ack(irq);
+       if (chip->irq_mask_ack) {
+               chip->irq_mask_ack(data);
         } else {
-               chip->mask(irq);
-               chip->ack(irq);
+               chip->irq_mask(data);
+               chip->irq_ack(data);
         }
  }
  
-noop(irq)
+noop(struct irq_data *data)
  {
  }
  
@@ -278,12 +284,27 @@ noop(irq)
                 <para>
                 The following control flow is implemented (simplified excerpt):
                 <programlisting>
-desc->chip->start();
+desc->chip->irq_mask();
  handle_IRQ_event(desc->action);
-desc->chip->end();
+desc->chip->irq_unmask();
                 </programlisting>
                 </para>
-           </sect3>
+           </sect3>
+           <sect3 id="Default_FASTEOI_IRQ_flow_handler">
+               <title>Default Fast EOI IRQ flow handler</title>
+               <para>
+               handle_fasteoi_irq provides a generic implementation
+               for interrupts, which only need an EOI at the end of
+               the handler.
+               </para>
+               <para>
+               The following control flow is implemented (simplified excerpt):
+               <programlisting>
+handle_IRQ_event(desc->action);
+desc->chip->irq_eoi();
+               </programlisting>
+               </para>
+           </sect3>
             <sect3 id="Default_Edge_IRQ_flow_handler">
                 <title>Default Edge IRQ flow handler</title>
                 <para>
@@ -294,20 +315,19 @@ desc->chip->end();
                 The following control flow is implemented (simplified excerpt):
                 <programlisting>
  if (desc->status &amp; running) {
-       desc->chip->hold();
+       desc->chip->irq_mask();
         desc->status |= pending | masked;
         return;
  }
-desc->chip->start();
+desc->chip->irq_ack();
  desc->status |= running;
  do {
         if (desc->status &amp; masked)
-               desc->chip->enable();
+               desc->chip->irq_unmask();
         desc->status &amp;= ~pending;
         handle_IRQ_event(desc->action);
  } while (status &amp; pending);
  desc->status &amp;= ~running;
-desc->chip->end();
                 </programlisting>
                 </para>
             </sect3>
@@ -342,9 +362,9 @@ handle_IRQ_event(desc->action);
                 <para>
                 The following control flow is implemented (simplified excerpt):
                 <programlisting>
-desc->chip->start();
  handle_IRQ_event(desc->action);
-desc->chip->end();
+if (desc->chip->irq_eoi)
+        desc->chip->irq_eoi();
                 </programlisting>
                 </para>
             </sect3>
@@ -375,8 +395,7 @@ desc->chip->end();
         mechanism. (It's necessary to enable CONFIG_HARDIRQS_SW_RESEND when
         you want to use the delayed interrupt disable feature and your
         hardware is not capable of retriggering an interrupt.)
-       The delayed interrupt disable can be runtime enabled, per interrupt,
-       by setting the IRQ_DELAYED_DISABLE flag in the irq_desc status field.
+       The delayed interrupt disable is not configurable.
         </para>
         </sect2>
      </sect1>
@@ -387,13 +406,13 @@ desc->chip->end();
         contains all the direct chip relevant functions, which
         can be utilized by the irq flow implementations.
           <itemizedlist>
-         <listitem><para>ack()</para></listitem>
-         <listitem><para>mask_ack() - Optional, recommended for performance</para></listitem>
-         <listitem><para>mask()</para></listitem>
-         <listitem><para>unmask()</para></listitem>
-         <listitem><para>retrigger() - Optional</para></listitem>
-         <listitem><para>set_type() - Optional</para></listitem>
-         <listitem><para>set_wake() - Optional</para></listitem>
+         <listitem><para>irq_ack()</para></listitem>
+         <listitem><para>irq_mask_ack() - Optional, recommended for performance</para></listitem>
+         <listitem><para>irq_mask()</para></listitem>
+         <listitem><para>irq_unmask()</para></listitem>
+         <listitem><para>irq_retrigger() - Optional</para></listitem>
+         <listitem><para>irq_set_type() - Optional</para></listitem>
+         <listitem><para>irq_set_wake() - Optional</para></listitem>
           </itemizedlist>
         These primitives are strictly intended to mean what they say: ack means
         ACK, masking means masking of an IRQ line, etc. It is up to the flow
@@ -458,6 +477,7 @@ desc->chip->end();
       <para>
       This chapter contains the autogenerated documentation of the internal functions.
       </para>
+!Ikernel/irq/irqdesc.c
  !Ikernel/irq/handle.c
  !Ikernel/irq/chip.c
    </chapter>
diff --git a/Documentation/DocBook/kernel-locking.tmpl b/Documentation/DocBook/kernel-locking.tmpl

index a0d479d..f66f4df 100644 (file)
--- a/Documentation/DocBook/kernel-locking.tmpl
+++ b/Documentation/DocBook/kernel-locking.tmpl
@@ -1645,7 +1645,9 @@ the amount of locking which needs to be done.
        all the readers who were traversing the list when we deleted the
        element are finished.  We use <function>call_rcu()</function> to
        register a callback which will actually destroy the object once
-      the readers are finished.
+      all pre-existing readers are finished.  Alternatively,
+      <function>synchronize_rcu()</function> may be used to block until
+      all pre-existing readers are finished.
      </para>
      <para>
        But how does Read Copy Update know when the readers are
@@ -1714,7 +1716,7 @@ the amount of locking which needs to be done.
  -        object_put(obj);
  +        list_del_rcu(&amp;obj-&gt;list);
           cache_num--;
-+        call_rcu(&amp;obj-&gt;rcu, cache_delete_rcu, obj);
++        call_rcu(&amp;obj-&gt;rcu, cache_delete_rcu);
   }
  
   /* Must be holding cache_lock */
@@ -1725,14 +1727,6 @@ the amount of locking which needs to be done.
           if (++cache_num > MAX_CACHE_SIZE) {
                   struct object *i, *outcast = NULL;
                   list_for_each_entry(i, &amp;cache, list) {
-@@ -85,6 +94,7 @@
-         obj-&gt;popularity = 0;
-         atomic_set(&amp;obj-&gt;refcnt, 1); /* The cache holds a reference */
-         spin_lock_init(&amp;obj-&gt;lock);
-+        INIT_RCU_HEAD(&amp;obj-&gt;rcu);
-
-         spin_lock_irqsave(&amp;cache_lock, flags);
-         __cache_add(obj);
  @@ -104,12 +114,11 @@
   struct object *cache_find(int id)
   {
diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt

index 790d1a8..0c134f8 100644 (file)
--- a/Documentation/RCU/checklist.txt
+++ b/Documentation/RCU/checklist.txt
@@ -218,13 +218,22 @@ over a rather long period of time, but improvements are always welcome!
         include:
  
         a.      Keeping a count of the number of data-structure elements
-               used by the RCU-protected data structure, including those
-               waiting for a grace period to elapse.  Enforce a limit
-               on this number, stalling updates as needed to allow
-               previously deferred frees to complete.
-
-               Alternatively, limit only the number awaiting deferred
-               free rather than the total number of elements.
+               used by the RCU-protected data structure, including
+               those waiting for a grace period to elapse.  Enforce a
+               limit on this number, stalling updates as needed to allow
+               previously deferred frees to complete.  Alternatively,
+               limit only the number awaiting deferred free rather than
+               the total number of elements.
+
+               One way to stall the updates is to acquire the update-side
+               mutex.  (Don't try this with a spinlock -- other CPUs
+               spinning on the lock could prevent the grace period
+               from ever ending.)  Another way to stall the updates
+               is for the updates to use a wrapper function around
+               the memory allocator, so that this wrapper function
+               simulates OOM when there is too much memory awaiting an
+               RCU grace period.  There are of course many other
+               variations on this theme.
  
         b.      Limiting update rate.  For example, if updates occur only
                 once per hour, then no explicit rate limiting is required,
@@ -365,3 +374,26 @@ over a rather long period of time, but improvements are always welcome!
         and the compiler to freely reorder code into and out of RCU
         read-side critical sections.  It is the responsibility of the
         RCU update-side primitives to deal with this.
+
+17.    Use CONFIG_PROVE_RCU, CONFIG_DEBUG_OBJECTS_RCU_HEAD, and
+       the __rcu sparse checks to validate your RCU code.  These
+       can help find problems as follows:
+
+       CONFIG_PROVE_RCU: check that accesses to RCU-protected data
+               structures are carried out under the proper RCU
+               read-side critical section, while holding the right
+               combination of locks, or whatever other conditions
+               are appropriate.
+
+       CONFIG_DEBUG_OBJECTS_RCU_HEAD: check that you don't pass the
+               same object to call_rcu() (or friends) before an RCU
+               grace period has elapsed since the last time that you
+               passed that same object to call_rcu() (or friends).
+
+       __rcu sparse checks: tag the pointer to the RCU-protected data
+               structure with __rcu, and sparse will warn you if you
+               access that pointer without the services of one of the
+               variants of rcu_dereference().
+
+       These debugging aids can help you find problems that are
+       otherwise extremely difficult to spot.
diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt

index 44c6dcc..862c08e 100644 (file)
--- a/Documentation/RCU/stallwarn.txt
+++ b/Documentation/RCU/stallwarn.txt
@@ -80,6 +80,24 @@ o    A CPU looping with bottom halves disabled.  This condition can
  o      For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the kernel
         without invoking schedule().
  
+o      A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might
+       happen to preempt a low-priority task in the middle of an RCU
+       read-side critical section.   This is especially damaging if
+       that low-priority task is not permitted to run on any other CPU,
+       in which case the next RCU grace period can never complete, which
+       will eventually cause the system to run out of memory and hang.
+       While the system is in the process of running itself out of
+       memory, you might see stall-warning messages.
+
+o      A CPU-bound real-time task in a CONFIG_PREEMPT_RT kernel that
+       is running at a higher priority than the RCU softirq threads.
+       This will prevent RCU callbacks from ever being invoked,
+       and in a CONFIG_TREE_PREEMPT_RCU kernel will further prevent
+       RCU grace periods from ever completing.  Either way, the
+       system will eventually run out of memory and hang.  In the
+       CONFIG_TREE_PREEMPT_RCU case, you might see stall-warning
+       messages.
+
  o      A bug in the RCU implementation.
  
  o      A hardware failure.  This is quite unlikely, but has occurred
diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt

index efd8cc9..a851118 100644 (file)
--- a/Documentation/RCU/trace.txt
+++ b/Documentation/RCU/trace.txt
@@ -125,6 +125,17 @@ o  "b" is the batch limit for this CPU.  If more than this number
         of RCU callbacks is ready to invoke, then the remainder will
         be deferred.
  
+o      "ci" is the number of RCU callbacks that have been invoked for
+       this CPU.  Note that ci+ql is the number of callbacks that have
+       been registered in the absence of CPU-hotplug activity.
+
+o      "co" is the number of RCU callbacks that have been orphaned due to
+       this CPU going offline.
+
+o      "ca" is the number of RCU callbacks that have been adopted due to
+       other CPUs going offline.  Note that ci+co-ca+ql is the number of
+       RCU callbacks registered on this CPU.
+
  There is also an rcu/rcudata.csv file with the same information in
  comma-separated-variable spreadsheet format.
  
@@ -180,7 +191,7 @@ o   "s" is the "signaled" state that drives force_quiescent_state()'s
  
  o      "jfq" is the number of jiffies remaining for this grace period
         before force_quiescent_state() is invoked to help push things
-       along.  Note that CPUs in dyntick-idle mode thoughout the grace
+       along.  Note that CPUs in dyntick-idle mode throughout the grace
         period will not report on their own, but rather must be check by
         some other CPU via force_quiescent_state().
  
diff --git a/Documentation/cputopology.txt b/Documentation/cputopology.txt

index f1c5c4b..902d315 100644 (file)
--- a/Documentation/cputopology.txt
+++ b/Documentation/cputopology.txt
@@ -14,25 +14,39 @@ to /proc/cpuinfo.
         identifier (rather than the kernel's).  The actual value is
         architecture and platform dependent.
  
-3) /sys/devices/system/cpu/cpuX/topology/thread_siblings:
+3) /sys/devices/system/cpu/cpuX/topology/book_id:
+
+       the book ID of cpuX. Typically it is the hardware platform's
+       identifier (rather than the kernel's).  The actual value is
+       architecture and platform dependent.
+
+4) /sys/devices/system/cpu/cpuX/topology/thread_siblings:
  
         internel kernel map of cpuX's hardware threads within the same
         core as cpuX
  
-4) /sys/devices/system/cpu/cpuX/topology/core_siblings:
+5) /sys/devices/system/cpu/cpuX/topology/core_siblings:
  
         internal kernel map of cpuX's hardware threads within the same
         physical_package_id.
  
+6) /sys/devices/system/cpu/cpuX/topology/book_siblings:
+
+       internal kernel map of cpuX's hardware threads within the same
+       book_id.
+
  To implement it in an architecture-neutral way, a new source file,
-drivers/base/topology.c, is to export the 4 attributes.
+drivers/base/topology.c, is to export the 4 or 6 attributes. The two book
+related sysfs files will only be created if CONFIG_SCHED_BOOK is selected.
  
  For an architecture to support this feature, it must define some of
  these macros in include/asm-XXX/topology.h:
  #define topology_physical_package_id(cpu)
  #define topology_core_id(cpu)
+#define topology_book_id(cpu)
  #define topology_thread_cpumask(cpu)
  #define topology_core_cpumask(cpu)
+#define topology_book_cpumask(cpu)
  
  The type of **_id is int.
  The type of siblings is (const) struct cpumask *.
@@ -45,6 +59,9 @@ not defined by include/asm-XXX/topology.h:
  3) thread_siblings: just the given CPU
  4) core_siblings: just the given CPU
  
+For architectures that don't support books (CONFIG_SCHED_BOOK) there are no
+default definitions for topology_book_id() and topology_book_cpumask().
+
  Additionally, CPU topology information is provided under
  /sys/devices/system/cpu and includes these files.  The internal
  source for the output is in brackets ("[]").
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt

index 842aa9d..5e2bc4a 100644 (file)
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -386,34 +386,6 @@ Who:       Tejun Heo <tj@kernel.org>
  
  ----------------------------
  
-What:  Support for VMware's guest paravirtuliazation technique [VMI] will be
-       dropped.
-When:  2.6.37 or earlier.
-Why:   With the recent innovations in CPU hardware acceleration technologies
-       from Intel and AMD, VMware ran a few experiments to compare these
-       techniques to guest paravirtualization technique on VMware's platform.
-       These hardware assisted virtualization techniques have outperformed the
-       performance benefits provided by VMI in most of the workloads. VMware
-       expects that these hardware features will be ubiquitous in a couple of
-       years, as a result, VMware has started a phased retirement of this
-       feature from the hypervisor. We will be removing this feature from the
-       Kernel too. Right now we are targeting 2.6.37 but can retire earlier if
-       technical reasons (read opportunity to remove major chunk of pvops)
-       arise.
-
-       Please note that VMI has always been an optimization and non-VMI kernels
-       still work fine on VMware's platform.
-       Latest versions of VMware's product which support VMI are,
-       Workstation 7.0 and VSphere 4.0 on ESX side, future maintainence
-       releases for these products will continue supporting VMI.
-
-       For more details about VMI retirement take a look at this,
-       http://blogs.vmware.com/guestosguide/2009/09/vmi-retirement.html
-
-Who:   Alok N Kataria <akataria@vmware.com>
-
-----------------------------
-
  What:  Support for lcd_switch and display_get in asus-laptop driver
  When:  March 2010
  Why:   These two features use non-standard interfaces. There are the
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt

index 8dd7248..3a0009e 100644 (file)
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -455,7 +455,7 @@ and is between 256 and 4096 characters. It is defined in the file
                         [ARM] imx_timer1,OSTS,netx_timer,mpu_timer2,
                                 pxa_timer,timer3,32k_counter,timer0_1
                         [AVR32] avr32
-                       [X86-32] pit,hpet,tsc,vmi-timer;
+                       [X86-32] pit,hpet,tsc;
                                 scx200_hrt on Geode; cyclone on IBM x440
                         [MIPS] MIPS
                         [PARISC] cr16
@@ -2153,6 +2153,11 @@ and is between 256 and 4096 characters. It is defined in the file
                         Reserves a hole at the top of the kernel virtual
                         address space.
  
+       reservelow=     [X86]
+                       Format: nn[K]
+                       Set the amount of memory to reserve for BIOS at
+                       the bottom of the address space.
+
         reset_devices   [KNL] Force drivers to reset the underlying device
                         during initialization.
  
@@ -2435,6 +2440,10 @@ and is between 256 and 4096 characters. It is defined in the file
                         disables clocksource verification at runtime.
                         Used to enable high-resolution timer mode on older
                         hardware, and in virtualized environment.
+                       [x86] noirqtime: Do not use TSC to do irq accounting.
+                       Used to disable IRQ_TIME_ACCOUNTING at run time on
+                       any platforms where RDTSC is slow and this accounting
+                       can add overhead.
  
         turbografx.map[2|3]=    [HW,JOY]
                         TurboGraFX parallel port interface
diff --git a/MAINTAINERS b/MAINTAINERS

index f2a2b8e..6f5b5b2 100644 (file)
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1527,6 +1527,8 @@ T:        git git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git
  S:     Supported
  F:     Documentation/filesystems/ceph.txt
  F:     fs/ceph
+F:     net/ceph
+F:     include/linux/ceph
  
  CERTIFIED WIRELESS USB (WUSB) SUBSYSTEM:
  M:     David Vrabel <david.vrabel@csr.com>
@@ -3239,6 +3241,12 @@ F:       drivers/net/irda/
  F:     include/net/irda/
  F:     net/irda/
  
+IRQ SUBSYSTEM
+M:     Thomas Gleixner <tglx@linutronix.de>
+S:     Maintained
+T:     git git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git irq/core
+F:     kernel/irq/
+
  ISAPNP
  M:     Jaroslav Kysela <perex@perex.cz>
  S:     Maintained
@@ -4805,6 +4813,15 @@ F:       fs/qnx4/
  F:     include/linux/qnx4_fs.h
  F:     include/linux/qnxtypes.h
  
+RADOS BLOCK DEVICE (RBD)
+F:     include/linux/qnxtypes.h
+M:     Yehuda Sadeh <yehuda@hq.newdream.net>
+M:     Sage Weil <sage@newdream.net>
+L:     ceph-devel@vger.kernel.org
+S:     Supported
+F:     drivers/block/rbd.c
+F:     drivers/block/rbd_types.h
+
  RADEON FRAMEBUFFER DISPLAY DRIVER
  M:     Benjamin Herrenschmidt <benh@kernel.crashing.org>
  L:     linux-fbdev@vger.kernel.org
diff --git a/arch/arm/include/asm/hw_irq.h b/arch/arm/include/asm/hw_irq.h

index 90831f6..5586b7c 100644 (file)
--- a/arch/arm/include/asm/hw_irq.h
+++ b/arch/arm/include/asm/hw_irq.h
@@ -24,4 +24,6 @@ void set_irq_flags(unsigned int irq, unsigned int flags);
  #define IRQF_PROBE     (1 << 1)
  #define IRQF_NOAUTOEN  (1 << 2)
  
+#define ARCH_IRQ_INIT_FLAGS    (IRQ_NOREQUEST | IRQ_NOPROBE)
+
  #endif
diff --git a/arch/arm/kernel/irq.c b/arch/arm/kernel/irq.c

index c0d5c3b..36ad3be 100644 (file)
--- a/arch/arm/kernel/irq.c
+++ b/arch/arm/kernel/irq.c
@@ -154,14 +154,6 @@ void set_irq_flags(unsigned int irq, unsigned int iflags)
  
  void __init init_IRQ(void)
  {
-       struct irq_desc *desc;
-       int irq;
-
-       for (irq = 0; irq < nr_irqs; irq++) {
-               desc = irq_to_desc_alloc_node(irq, 0);
-               desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE;
-       }
-
         init_arch_irq();
  }
  
@@ -169,7 +161,7 @@ void __init init_IRQ(void)
  int __init arch_probe_nr_irqs(void)
  {
         nr_irqs = arch_nr_irqs ? arch_nr_irqs : NR_IRQS;
-       return 0;
+       return nr_irqs;
  }
  #endif
  
diff --git a/arch/arm/mach-bcmring/dma.c b/arch/arm/mach-bcmring/dma.c

index 29c0a91..77eb35c 100644 (file)
--- a/arch/arm/mach-bcmring/dma.c
+++ b/arch/arm/mach-bcmring/dma.c
@@ -691,7 +691,7 @@ int dma_init(void)
  
         memset(&gDMA, 0, sizeof(gDMA));
  
-       init_MUTEX_LOCKED(&gDMA.lock);
+       sema_init(&gDMA.lock, 0);
         init_waitqueue_head(&gDMA.freeChannelQ);
  
         /* Initialize the Hardware */
@@ -1574,7 +1574,7 @@ int dma_init_mem_map(DMA_MemMap_t *memMap)
  {
         memset(memMap, 0, sizeof(*memMap));
  
-       init_MUTEX(&memMap->lock);
+       sema_init(&memMap->lock, 1);
  
         return 0;
  }
diff --git a/arch/arm/mach-bcmring/irq.c b/arch/arm/mach-bcmring/irq.c

index dc1c493..e315263 100644 (file)
--- a/arch/arm/mach-bcmring/irq.c
+++ b/arch/arm/mach-bcmring/irq.c
@@ -67,21 +67,21 @@ static void bcmring_unmask_irq2(unsigned int irq)
  }
  
  static struct irq_chip bcmring_irq0_chip = {
-       .typename = "ARM-INTC0",
+       .name = "ARM-INTC0",
         .ack = bcmring_mask_irq0,
         .mask = bcmring_mask_irq0,      /* mask a specific interrupt, blocking its delivery. */
         .unmask = bcmring_unmask_irq0,  /* unmaks an interrupt */
  };
  
  static struct irq_chip bcmring_irq1_chip = {
-       .typename = "ARM-INTC1",
+       .name = "ARM-INTC1",
         .ack = bcmring_mask_irq1,
         .mask = bcmring_mask_irq1,
         .unmask = bcmring_unmask_irq1,
  };
  
  static struct irq_chip bcmring_irq2_chip = {
-       .typename = "ARM-SINTC",
+       .name = "ARM-SINTC",
         .ack = bcmring_mask_irq2,
         .mask = bcmring_mask_irq2,
         .unmask = bcmring_unmask_irq2,
diff --git a/arch/arm/mach-iop13xx/msi.c b/arch/arm/mach-iop13xx/msi.c

index f34b0ed..7149fcc 100644 (file)
--- a/arch/arm/mach-iop13xx/msi.c
+++ b/arch/arm/mach-iop13xx/msi.c
@@ -164,10 +164,10 @@ static void iop13xx_msi_nop(unsigned int irq)
  static struct irq_chip iop13xx_msi_chip = {
         .name = "PCI-MSI",
         .ack = iop13xx_msi_nop,
-       .enable = unmask_msi_irq,
-       .disable = mask_msi_irq,
-       .mask = mask_msi_irq,
-       .unmask = unmask_msi_irq,
+       .irq_enable = unmask_msi_irq,
+       .irq_disable = mask_msi_irq,
+       .irq_mask = mask_msi_irq,
+       .irq_unmask = unmask_msi_irq,
  };
  
  int arch_setup_msi_irq(struct pci_dev *pdev, struct msi_desc *desc)
diff --git a/arch/ia64/include/asm/hardirq.h b/arch/ia64/include/asm/hardirq.h

index d514cd9..8fb7d33 100644 (file)
--- a/arch/ia64/include/asm/hardirq.h
+++ b/arch/ia64/include/asm/hardirq.h
@@ -6,12 +6,6 @@
   *     David Mosberger-Tang <davidm@hpl.hp.com>
   */
  
-
-#include <linux/threads.h>
-#include <linux/irq.h>
-
-#include <asm/processor.h>
-
  /*
   * No irq_cpustat_t for IA-64.  The data is held in the per-CPU data structure.
   */
@@ -20,6 +14,11 @@
  
  #define local_softirq_pending()                (local_cpu_data->softirq_pending)
  
+#include <linux/threads.h>
+#include <linux/irq.h>
+
+#include <asm/processor.h>
+
  extern void __iomem *ipi_base_addr;
  
  void ack_bad_irq(unsigned int irq);
diff --git a/arch/ia64/include/asm/system.h b/arch/ia64/include/asm/system.h

index 9f342a5..dd028f2 100644 (file)
--- a/arch/ia64/include/asm/system.h
+++ b/arch/ia64/include/asm/system.h
@@ -272,10 +272,6 @@ void cpu_idle_wait(void);
  
  void default_idle(void);
  
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
-extern void account_system_vtime(struct task_struct *);
-#endif
-
  #endif /* __KERNEL__ */
  
  #endif /* __ASSEMBLY__ */
diff --git a/arch/ia64/kernel/msi_ia64.c b/arch/ia64/kernel/msi_ia64.c

index 4a746ea..00b19a4 100644 (file)
--- a/arch/ia64/kernel/msi_ia64.c
+++ b/arch/ia64/kernel/msi_ia64.c
@@ -104,8 +104,8 @@ static int ia64_msi_retrigger_irq(unsigned int irq)
   */
  static struct irq_chip ia64_msi_chip = {
         .name           = "PCI-MSI",
-       .mask           = mask_msi_irq,
-       .unmask         = unmask_msi_irq,
+       .irq_mask       = mask_msi_irq,
+       .irq_unmask     = unmask_msi_irq,
         .ack            = ia64_ack_msi_irq,
  #ifdef CONFIG_SMP
         .set_affinity   = ia64_set_msi_irq_affinity,
@@ -160,8 +160,8 @@ static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
  
  static struct irq_chip dmar_msi_type = {
         .name = "DMAR_MSI",
-       .unmask = dmar_msi_unmask,
-       .mask = dmar_msi_mask,
+       .irq_unmask = dmar_msi_unmask,
+       .irq_mask = dmar_msi_mask,
         .ack = ia64_ack_msi_irq,
  #ifdef CONFIG_SMP
         .set_affinity = dmar_msi_set_affinity,
diff --git a/arch/ia64/sn/kernel/msi_sn.c b/arch/ia64/sn/kernel/msi_sn.c

index 0c72dd4..a5e500f 100644 (file)
--- a/arch/ia64/sn/kernel/msi_sn.c
+++ b/arch/ia64/sn/kernel/msi_sn.c
@@ -228,8 +228,8 @@ static int sn_msi_retrigger_irq(unsigned int irq)
  
  static struct irq_chip sn_msi_chip = {
         .name           = "PCI-MSI",
-       .mask           = mask_msi_irq,
-       .unmask         = unmask_msi_irq,
+       .irq_mask       = mask_msi_irq,
+       .irq_unmask     = unmask_msi_irq,
         .ack            = sn_ack_msi_irq,
  #ifdef CONFIG_SMP
         .set_affinity   = sn_set_msi_irq_affinity,
diff --git a/arch/m32r/kernel/irq.c b/arch/m32r/kernel/irq.c

index 3c71f77..7db26f1 100644 (file)
--- a/arch/m32r/kernel/irq.c
+++ b/arch/m32r/kernel/irq.c
@@ -51,7 +51,7 @@ int show_interrupts(struct seq_file *p, void *v)
                 for_each_online_cpu(j)
                         seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
  #endif
-               seq_printf(p, " %14s", irq_desc[i].chip->typename);
+               seq_printf(p, " %14s", irq_desc[i].chip->name);
                 seq_printf(p, "  %s", action->name);
  
                 for (action=action->next; action; action = action->next)
diff --git a/arch/m32r/platforms/m32104ut/setup.c b/arch/m32r/platforms/m32104ut/setup.c

index 922fdfd..402a59d 100644 (file)
--- a/arch/m32r/platforms/m32104ut/setup.c
+++ b/arch/m32r/platforms/m32104ut/setup.c
@@ -65,7 +65,7 @@ static void shutdown_m32104ut_irq(unsigned int irq)
  
  static struct irq_chip m32104ut_irq_type =
  {
-       .typename = "M32104UT-IRQ",
+       .name = "M32104UT-IRQ",
         .startup = startup_m32104ut_irq,
         .shutdown = shutdown_m32104ut_irq,
         .enable = enable_m32104ut_irq,
diff --git a/arch/m32r/platforms/m32700ut/setup.c b/arch/m32r/platforms/m32700ut/setup.c

index 9c1bc74..80b1a02 100644 (file)
--- a/arch/m32r/platforms/m32700ut/setup.c
+++ b/arch/m32r/platforms/m32700ut/setup.c
@@ -71,7 +71,7 @@ static void shutdown_m32700ut_irq(unsigned int irq)
  
  static struct irq_chip m32700ut_irq_type =
  {
-       .typename = "M32700UT-IRQ",
+       .name = "M32700UT-IRQ",
         .startup = startup_m32700ut_irq,
         .shutdown = shutdown_m32700ut_irq,
         .enable = enable_m32700ut_irq,
@@ -148,7 +148,7 @@ static void shutdown_m32700ut_pld_irq(unsigned int irq)
  
  static struct irq_chip m32700ut_pld_irq_type =
  {
-       .typename = "M32700UT-PLD-IRQ",
+       .name = "M32700UT-PLD-IRQ",
         .startup = startup_m32700ut_pld_irq,
         .shutdown = shutdown_m32700ut_pld_irq,
         .enable = enable_m32700ut_pld_irq,
@@ -217,7 +217,7 @@ static void shutdown_m32700ut_lanpld_irq(unsigned int irq)
  
  static struct irq_chip m32700ut_lanpld_irq_type =
  {
-       .typename = "M32700UT-PLD-LAN-IRQ",
+       .name = "M32700UT-PLD-LAN-IRQ",
         .startup = startup_m32700ut_lanpld_irq,
         .shutdown = shutdown_m32700ut_lanpld_irq,
         .enable = enable_m32700ut_lanpld_irq,
@@ -286,7 +286,7 @@ static void shutdown_m32700ut_lcdpld_irq(unsigned int irq)
  
  static struct irq_chip m32700ut_lcdpld_irq_type =
  {
-       .typename = "M32700UT-PLD-LCD-IRQ",
+       .name = "M32700UT-PLD-LCD-IRQ",
         .startup = startup_m32700ut_lcdpld_irq,
         .shutdown = shutdown_m32700ut_lcdpld_irq,
         .enable = enable_m32700ut_lcdpld_irq,
diff --git a/arch/m32r/platforms/mappi/setup.c b/arch/m32r/platforms/mappi/setup.c

index fb4b177..ea00c84 100644 (file)
--- a/arch/m32r/platforms/mappi/setup.c
+++ b/arch/m32r/platforms/mappi/setup.c
@@ -65,7 +65,7 @@ static void shutdown_mappi_irq(unsigned int irq)
  
  static struct irq_chip mappi_irq_type =
  {
-       .typename = "MAPPI-IRQ",
+       .name = "MAPPI-IRQ",
         .startup = startup_mappi_irq,
         .shutdown = shutdown_mappi_irq,
         .enable = enable_mappi_irq,
diff --git a/arch/m32r/platforms/mappi2/setup.c b/arch/m32r/platforms/mappi2/setup.c

index 6a65eda..c049376 100644 (file)
--- a/arch/m32r/platforms/mappi2/setup.c
+++ b/arch/m32r/platforms/mappi2/setup.c
@@ -72,7 +72,7 @@ static void shutdown_mappi2_irq(unsigned int irq)
  
  static struct irq_chip mappi2_irq_type =
  {
-       .typename = "MAPPI2-IRQ",
+       .name = "MAPPI2-IRQ",
         .startup = startup_mappi2_irq,
         .shutdown = shutdown_mappi2_irq,
         .enable = enable_mappi2_irq,
diff --git a/arch/m32r/platforms/mappi3/setup.c b/arch/m32r/platforms/mappi3/setup.c

index 9c337ae..882de25 100644 (file)
--- a/arch/m32r/platforms/mappi3/setup.c
+++ b/arch/m32r/platforms/mappi3/setup.c
@@ -72,7 +72,7 @@ static void shutdown_mappi3_irq(unsigned int irq)
  
  static struct irq_chip mappi3_irq_type =
  {
-       .typename = "MAPPI3-IRQ",
+       .name = "MAPPI3-IRQ",
         .startup = startup_mappi3_irq,
         .shutdown = shutdown_mappi3_irq,
         .enable = enable_mappi3_irq,
diff --git a/arch/m32r/platforms/oaks32r/setup.c b/arch/m32r/platforms/oaks32r/setup.c

index ed86574..d11d93b 100644 (file)
--- a/arch/m32r/platforms/oaks32r/setup.c
+++ b/arch/m32r/platforms/oaks32r/setup.c
@@ -63,7 +63,7 @@ static void shutdown_oaks32r_irq(unsigned int irq)
  
  static struct irq_chip oaks32r_irq_type =
  {
-       .typename = "OAKS32R-IRQ",
+       .name = "OAKS32R-IRQ",
         .startup = startup_oaks32r_irq,
         .shutdown = shutdown_oaks32r_irq,
         .enable = enable_oaks32r_irq,
diff --git a/arch/m32r/platforms/opsput/setup.c b/arch/m32r/platforms/opsput/setup.c

index 80d6806..5f3402a 100644 (file)
--- a/arch/m32r/platforms/opsput/setup.c
+++ b/arch/m32r/platforms/opsput/setup.c
@@ -72,7 +72,7 @@ static void shutdown_opsput_irq(unsigned int irq)
  
  static struct irq_chip opsput_irq_type =
  {
-       .typename = "OPSPUT-IRQ",
+       .name = "OPSPUT-IRQ",
         .startup = startup_opsput_irq,
         .shutdown = shutdown_opsput_irq,
         .enable = enable_opsput_irq,
@@ -149,7 +149,7 @@ static void shutdown_opsput_pld_irq(unsigned int irq)
  
  static struct irq_chip opsput_pld_irq_type =
  {
-       .typename = "OPSPUT-PLD-IRQ",
+       .name = "OPSPUT-PLD-IRQ",
         .startup = startup_opsput_pld_irq,
         .shutdown = shutdown_opsput_pld_irq,
         .enable = enable_opsput_pld_irq,
@@ -218,7 +218,7 @@ static void shutdown_opsput_lanpld_irq(unsigned int irq)
  
  static struct irq_chip opsput_lanpld_irq_type =
  {
-       .typename = "OPSPUT-PLD-LAN-IRQ",
+       .name = "OPSPUT-PLD-LAN-IRQ",
         .startup = startup_opsput_lanpld_irq,
         .shutdown = shutdown_opsput_lanpld_irq,
         .enable = enable_opsput_lanpld_irq,
diff --git a/arch/m32r/platforms/usrv/setup.c b/arch/m32r/platforms/usrv/setup.c

index 7573026..1beac7a 100644 (file)
--- a/arch/m32r/platforms/usrv/setup.c
+++ b/arch/m32r/platforms/usrv/setup.c
@@ -63,7 +63,7 @@ static void shutdown_mappi_irq(unsigned int irq)
  
  static struct irq_chip mappi_irq_type =
  {
-       .typename = "M32700-IRQ",
+       .name = "M32700-IRQ",
         .startup = startup_mappi_irq,
         .shutdown = shutdown_mappi_irq,
         .enable = enable_mappi_irq,
@@ -136,7 +136,7 @@ static void shutdown_m32700ut_pld_irq(unsigned int irq)
  
  static struct irq_chip m32700ut_pld_irq_type =
  {
-       .typename = "USRV-PLD-IRQ",
+       .name = "USRV-PLD-IRQ",
         .startup = startup_m32700ut_pld_irq,
         .shutdown = shutdown_m32700ut_pld_irq,
         .enable = enable_m32700ut_pld_irq,
diff --git a/arch/mips/kernel/mips-mt-fpaff.c b/arch/mips/kernel/mips-mt-fpaff.c

index 2340f11..9a526ba 100644 (file)
--- a/arch/mips/kernel/mips-mt-fpaff.c
+++ b/arch/mips/kernel/mips-mt-fpaff.c
@@ -103,7 +103,7 @@ asmlinkage long mipsmt_sys_sched_setaffinity(pid_t pid, unsigned int len,
         if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
                 goto out_unlock;
  
-       retval = security_task_setscheduler(p, 0, NULL);
+       retval = security_task_setscheduler(p);
         if (retval)
                 goto out_unlock;
  
diff --git a/arch/powerpc/include/asm/system.h b/arch/powerpc/include/asm/system.h

index 6c294ac..9c3d160 100644 (file)
--- a/arch/powerpc/include/asm/system.h
+++ b/arch/powerpc/include/asm/system.h
@@ -542,10 +542,6 @@ extern void reloc_got2(unsigned long);
  
  #define PTRRELOC(x)    ((typeof(x)) add_reloc_offset((unsigned long)(x)))
  
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
-extern void account_system_vtime(struct task_struct *);
-#endif
-
  extern struct dentry *powerpc_debugfs_root;
  
  #endif /* __KERNEL__ */
diff --git a/arch/powerpc/platforms/cell/axon_msi.c b/arch/powerpc/platforms/cell/axon_msi.c

index 9708553..e3e379c 100644 (file)
--- a/arch/powerpc/platforms/cell/axon_msi.c
+++ b/arch/powerpc/platforms/cell/axon_msi.c
@@ -310,9 +310,9 @@ static void axon_msi_teardown_msi_irqs(struct pci_dev *dev)
  }
  
  static struct irq_chip msic_irq_chip = {
-       .mask           = mask_msi_irq,
-       .unmask         = unmask_msi_irq,
-       .shutdown       = unmask_msi_irq,
+       .irq_mask       = mask_msi_irq,
+       .irq_unmask     = unmask_msi_irq,
+       .irq_shutdown   = mask_msi_irq,
         .name           = "AXON-MSI",
  };
  
diff --git a/arch/powerpc/platforms/pseries/xics.c b/arch/powerpc/platforms/pseries/xics.c

index 93834b0..67e2c4b 100644 (file)
--- a/arch/powerpc/platforms/pseries/xics.c
+++ b/arch/powerpc/platforms/pseries/xics.c
@@ -243,7 +243,7 @@ static unsigned int xics_startup(unsigned int virq)
          * at that level, so we do it here by hand.
          */
         if (irq_to_desc(virq)->msi_desc)
-               unmask_msi_irq(virq);
+               unmask_msi_irq(irq_get_irq_data(virq));
  
         /* unmask it */
         xics_unmask_irq(virq);
diff --git a/arch/powerpc/sysdev/fsl_msi.c b/arch/powerpc/sysdev/fsl_msi.c

index 87991d3..bdbd896 100644 (file)
--- a/arch/powerpc/sysdev/fsl_msi.c
+++ b/arch/powerpc/sysdev/fsl_msi.c
@@ -51,8 +51,8 @@ static void fsl_msi_end_irq(unsigned int virq)
  }
  
  static struct irq_chip fsl_msi_chip = {
-       .mask           = mask_msi_irq,
-       .unmask         = unmask_msi_irq,
+       .irq_mask       = mask_msi_irq,
+       .irq_unmask     = unmask_msi_irq,
         .ack            = fsl_msi_end_irq,
         .name           = "FSL-MSI",
  };
diff --git a/arch/powerpc/sysdev/mpic_pasemi_msi.c b/arch/powerpc/sysdev/mpic_pasemi_msi.c

index 3b6a9a4..320ad5a 100644 (file)
--- a/arch/powerpc/sysdev/mpic_pasemi_msi.c
+++ b/arch/powerpc/sysdev/mpic_pasemi_msi.c
@@ -39,24 +39,24 @@
  static struct mpic *msi_mpic;
  
  
-static void mpic_pasemi_msi_mask_irq(unsigned int irq)
+static void mpic_pasemi_msi_mask_irq(struct irq_data *data)
  {
-       pr_debug("mpic_pasemi_msi_mask_irq %d\n", irq);
-       mask_msi_irq(irq);
-       mpic_mask_irq(irq);
+       pr_debug("mpic_pasemi_msi_mask_irq %d\n", data->irq);
+       mask_msi_irq(data);
+       mpic_mask_irq(data->irq);
  }
  
-static void mpic_pasemi_msi_unmask_irq(unsigned int irq)
+static void mpic_pasemi_msi_unmask_irq(struct irq_data *data)
  {
-       pr_debug("mpic_pasemi_msi_unmask_irq %d\n", irq);
-       mpic_unmask_irq(irq);
-       unmask_msi_irq(irq);
+       pr_debug("mpic_pasemi_msi_unmask_irq %d\n", data->irq);
+       mpic_unmask_irq(data->irq);
+       unmask_msi_irq(data);
  }
  
  static struct irq_chip mpic_pasemi_msi_chip = {
-       .shutdown       = mpic_pasemi_msi_mask_irq,
-       .mask           = mpic_pasemi_msi_mask_irq,
-       .unmask         = mpic_pasemi_msi_unmask_irq,
+       .irq_shutdown   = mpic_pasemi_msi_mask_irq,
+       .irq_mask       = mpic_pasemi_msi_mask_irq,
+       .irq_unmask     = mpic_pasemi_msi_unmask_irq,
         .eoi            = mpic_end_irq,
         .set_type       = mpic_set_irq_type,
         .set_affinity   = mpic_set_affinity,
diff --git a/arch/powerpc/sysdev/mpic_u3msi.c b/arch/powerpc/sysdev/mpic_u3msi.c

index bcbfe79..a2b028b 100644 (file)
--- a/arch/powerpc/sysdev/mpic_u3msi.c
+++ b/arch/powerpc/sysdev/mpic_u3msi.c
@@ -23,22 +23,22 @@
  /* A bit ugly, can we get this from the pci_dev somehow? */
  static struct mpic *msi_mpic;
  
-static void mpic_u3msi_mask_irq(unsigned int irq)
+static void mpic_u3msi_mask_irq(struct irq_data *data)
  {
-       mask_msi_irq(irq);
-       mpic_mask_irq(irq);
+       mask_msi_irq(data);
+       mpic_mask_irq(data->irq);
  }
  
-static void mpic_u3msi_unmask_irq(unsigned int irq)
+static void mpic_u3msi_unmask_irq(struct irq_data *data)
  {
-       mpic_unmask_irq(irq);
-       unmask_msi_irq(irq);
+       mpic_unmask_irq(data->irq);
+       unmask_msi_irq(data);
  }
  
  static struct irq_chip mpic_u3msi_chip = {
-       .shutdown       = mpic_u3msi_mask_irq,
-       .mask           = mpic_u3msi_mask_irq,
-       .unmask         = mpic_u3msi_unmask_irq,
+       .irq_shutdown   = mpic_u3msi_mask_irq,
+       .irq_mask       = mpic_u3msi_mask_irq,
+       .irq_unmask     = mpic_u3msi_unmask_irq,
         .eoi            = mpic_end_irq,
         .set_type       = mpic_set_irq_type,
         .set_affinity   = mpic_set_affinity,
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig

index 958f0da..75976a1 100644 (file)
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -199,6 +199,13 @@ config HOTPLUG_CPU
           can be controlled through /sys/devices/system/cpu/cpu#.
           Say N if you want to disable CPU hotplug.
  
+config SCHED_BOOK
+       bool "Book scheduler support"
+       depends on SMP
+       help
+         Book scheduler support improves the CPU scheduler's decision making
+         when dealing with machines that have several books.
+
  config MATHEMU
         bool "IEEE FPU emulation"
         depends on MARCH_G5
diff --git a/arch/s390/include/asm/hardirq.h b/arch/s390/include/asm/hardirq.h

index 498bc38..881d945 100644 (file)
--- a/arch/s390/include/asm/hardirq.h
+++ b/arch/s390/include/asm/hardirq.h
@@ -12,10 +12,6 @@
  #ifndef __ASM_HARDIRQ_H
  #define __ASM_HARDIRQ_H
  
-#include <linux/threads.h>
-#include <linux/sched.h>
-#include <linux/cache.h>
-#include <linux/interrupt.h>
  #include <asm/lowcore.h>
  
  #define local_softirq_pending() (S390_lowcore.softirq_pending)
diff --git a/arch/s390/include/asm/system.h b/arch/s390/include/asm/system.h

index cef6621..38ddd8a 100644 (file)
--- a/arch/s390/include/asm/system.h
+++ b/arch/s390/include/asm/system.h
@@ -97,7 +97,6 @@ static inline void restore_access_regs(unsigned int *acrs)
  
  extern void account_vtime(struct task_struct *, struct task_struct *);
  extern void account_tick_vtime(struct task_struct *);
-extern void account_system_vtime(struct task_struct *);
  
  #ifdef CONFIG_PFAULT
  extern void pfault_irq_init(void);
diff --git a/arch/s390/include/asm/topology.h b/arch/s390/include/asm/topology.h

index 831bd03..051107a 100644 (file)
--- a/arch/s390/include/asm/topology.h
+++ b/arch/s390/include/asm/topology.h
@@ -3,15 +3,32 @@
  
  #include <linux/cpumask.h>
  
-#define mc_capable()   (1)
-
-const struct cpumask *cpu_coregroup_mask(unsigned int cpu);
-
  extern unsigned char cpu_core_id[NR_CPUS];
  extern cpumask_t cpu_core_map[NR_CPUS];
  
+static inline const struct cpumask *cpu_coregroup_mask(unsigned int cpu)
+{
+       return &cpu_core_map[cpu];
+}
+
  #define topology_core_id(cpu)          (cpu_core_id[cpu])
  #define topology_core_cpumask(cpu)     (&cpu_core_map[cpu])
+#define mc_capable()                   (1)
+
+#ifdef CONFIG_SCHED_BOOK
+
+extern unsigned char cpu_book_id[NR_CPUS];
+extern cpumask_t cpu_book_map[NR_CPUS];
+
+static inline const struct cpumask *cpu_book_mask(unsigned int cpu)
+{
+       return &cpu_book_map[cpu];
+}
+
+#define topology_book_id(cpu)          (cpu_book_id[cpu])
+#define topology_book_cpumask(cpu)     (&cpu_book_map[cpu])
+
+#endif /* CONFIG_SCHED_BOOK */
  
  int topology_set_cpu_management(int fc);
  void topology_schedule_update(void);
@@ -30,6 +47,8 @@ static inline void s390_init_cpu_topology(void)
  };
  #endif
  
+#define SD_BOOK_INIT   SD_CPU_INIT
+
  #include <asm-generic/topology.h>
  
  #endif /* _ASM_S390_TOPOLOGY_H */
diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c

index bcef007..13559c9 100644 (file)
--- a/arch/s390/kernel/topology.c
+++ b/arch/s390/kernel/topology.c
@@ -57,8 +57,8 @@ struct tl_info {
         union tl_entry tle[0];
  };
  
-struct core_info {
-       struct core_info *next;
+struct mask_info {
+       struct mask_info *next;
         unsigned char id;
         cpumask_t mask;
  };
@@ -66,7 +66,6 @@ struct core_info {
  static int topology_enabled;
  static void topology_work_fn(struct work_struct *work);
  static struct tl_info *tl_info;
-static struct core_info core_info;
  static int machine_has_topology;
  static struct timer_list topology_timer;
  static void set_topology_timer(void);
@@ -74,38 +73,37 @@ static DECLARE_WORK(topology_work, topology_work_fn);
  /* topology_lock protects the core linked list */
  static DEFINE_SPINLOCK(topology_lock);
  
+static struct mask_info core_info;
  cpumask_t cpu_core_map[NR_CPUS];
  unsigned char cpu_core_id[NR_CPUS];
  
-static cpumask_t cpu_coregroup_map(unsigned int cpu)
+#ifdef CONFIG_SCHED_BOOK
+static struct mask_info book_info;
+cpumask_t cpu_book_map[NR_CPUS];
+unsigned char cpu_book_id[NR_CPUS];
+#endif
+
+static cpumask_t cpu_group_map(struct mask_info *info, unsigned int cpu)
  {
-       struct core_info *core = &core_info;
-       unsigned long flags;
         cpumask_t mask;
  
         cpus_clear(mask);
         if (!topology_enabled || !machine_has_topology)
                 return cpu_possible_map;
-       spin_lock_irqsave(&topology_lock, flags);
-       while (core) {
-               if (cpu_isset(cpu, core->mask)) {
-                       mask = core->mask;
+       while (info) {
+               if (cpu_isset(cpu, info->mask)) {
+                       mask = info->mask;
                         break;
                 }
-               core = core->next;
+               info = info->next;
         }
-       spin_unlock_irqrestore(&topology_lock, flags);
         if (cpus_empty(mask))
                 mask = cpumask_of_cpu(cpu);
         return mask;
  }
  
-const struct cpumask *cpu_coregroup_mask(unsigned int cpu)
-{
-       return &cpu_core_map[cpu];
-}
-
-static void add_cpus_to_core(struct tl_cpu *tl_cpu, struct core_info *core)
+static void add_cpus_to_mask(struct tl_cpu *tl_cpu, struct mask_info *book,
+                            struct mask_info *core)
  {
         unsigned int cpu;
  
@@ -117,23 +115,35 @@ static void add_cpus_to_core(struct tl_cpu *tl_cpu, struct core_info *core)
  
                 rcpu = CPU_BITS - 1 - cpu + tl_cpu->origin;
                 for_each_present_cpu(lcpu) {
-                       if (cpu_logical_map(lcpu) == rcpu) {
-                               cpu_set(lcpu, core->mask);
-                               cpu_core_id[lcpu] = core->id;
-                               smp_cpu_polarization[lcpu] = tl_cpu->pp;
-                       }
+                       if (cpu_logical_map(lcpu) != rcpu)
+                               continue;
+#ifdef CONFIG_SCHED_BOOK
+                       cpu_set(lcpu, book->mask);
+                       cpu_book_id[lcpu] = book->id;
+#endif
+                       cpu_set(lcpu, core->mask);
+                       cpu_core_id[lcpu] = core->id;
+                       smp_cpu_polarization[lcpu] = tl_cpu->pp;
                 }
         }
  }
  
-static void clear_cores(void)
+static void clear_masks(void)
  {
-       struct core_info *core = &core_info;
+       struct mask_info *info;
  
-       while (core) {
-               cpus_clear(core->mask);
-               core = core->next;
+       info = &core_info;
+       while (info) {
+               cpus_clear(info->mask);
+               info = info->next;
+       }
+#ifdef CONFIG_SCHED_BOOK
+       info = &book_info;
+       while (info) {
+               cpus_clear(info->mask);
+               info = info->next;
         }
+#endif
  }
  
  static union tl_entry *next_tle(union tl_entry *tle)
@@ -146,29 +156,36 @@ static union tl_entry *next_tle(union tl_entry *tle)
  
  static void tl_to_cores(struct tl_info *info)
  {
+#ifdef CONFIG_SCHED_BOOK
+       struct mask_info *book = &book_info;
+#else
+       struct mask_info *book = NULL;
+#endif
+       struct mask_info *core = &core_info;
         union tl_entry *tle, *end;
-       struct core_info *core = &core_info;
+
  
         spin_lock_irq(&topology_lock);
-       clear_cores();
+       clear_masks();
         tle = info->tle;
         end = (union tl_entry *)((unsigned long)info + info->length);
         while (tle < end) {
                 switch (tle->nl) {
-               case 5:
-               case 4:
-               case 3:
+#ifdef CONFIG_SCHED_BOOK
                 case 2:
+                       book = book->next;
+                       book->id = tle->container.id;
                         break;
+#endif
                 case 1:
                         core = core->next;
                         core->id = tle->container.id;
                         break;
                 case 0:
-                       add_cpus_to_core(&tle->cpu, core);
+                       add_cpus_to_mask(&tle->cpu, book, core);
                         break;
                 default:
-                       clear_cores();
+                       clear_masks();
                         machine_has_topology = 0;
                         goto out;
                 }
@@ -221,10 +238,29 @@ int topology_set_cpu_management(int fc)
  
  static void update_cpu_core_map(void)
  {
+       unsigned long flags;
         int cpu;
  
-       for_each_possible_cpu(cpu)
-               cpu_core_map[cpu] = cpu_coregroup_map(cpu);
+       spin_lock_irqsave(&topology_lock, flags);
+       for_each_possible_cpu(cpu) {
+               cpu_core_map[cpu] = cpu_group_map(&core_info, cpu);
+#ifdef CONFIG_SCHED_BOOK
+               cpu_book_map[cpu] = cpu_group_map(&book_info, cpu);
+#endif
+       }
+       spin_unlock_irqrestore(&topology_lock, flags);
+}
+
+static void store_topology(struct tl_info *info)
+{
+#ifdef CONFIG_SCHED_BOOK
+       int rc;
+
+       rc = stsi(info, 15, 1, 3);
+       if (rc != -ENOSYS)
+               return;
+#endif
+       stsi(info, 15, 1, 2);
  }
  
  int arch_update_cpu_topology(void)
@@ -238,7 +274,7 @@ int arch_update_cpu_topology(void)
                 topology_update_polarization_simple();
                 return 0;
         }
-       stsi(info, 15, 1, 2);
+       store_topology(info);
         tl_to_cores(info);
         update_cpu_core_map();
         for_each_online_cpu(cpu) {
@@ -299,12 +335,24 @@ out:
  }
  __initcall(init_topology_update);
  
+static void alloc_masks(struct tl_info *info, struct mask_info *mask, int offset)
+{
+       int i, nr_masks;
+
+       nr_masks = info->mag[NR_MAG - offset];
+       for (i = 0; i < info->mnest - offset; i++)
+               nr_masks *= info->mag[NR_MAG - offset - 1 - i];
+       nr_masks = max(nr_masks, 1);
+       for (i = 0; i < nr_masks; i++) {
+               mask->next = alloc_bootmem(sizeof(struct mask_info));
+               mask = mask->next;
+       }
+}
+
  void __init s390_init_cpu_topology(void)
  {
         unsigned long long facility_bits;
         struct tl_info *info;
-       struct core_info *core;
-       int nr_cores;
         int i;
  
         if (stfle(&facility_bits, 1) <= 0)
@@ -315,25 +363,13 @@ void __init s390_init_cpu_topology(void)
  
         tl_info = alloc_bootmem_pages(PAGE_SIZE);
         info = tl_info;
-       stsi(info, 15, 1, 2);
-
-       nr_cores = info->mag[NR_MAG - 2];
-       for (i = 0; i < info->mnest - 2; i++)
-               nr_cores *= info->mag[NR_MAG - 3 - i];
-
+       store_topology(info);
         pr_info("The CPU configuration topology of the machine is:");
         for (i = 0; i < NR_MAG; i++)
                 printk(" %d", info->mag[i]);
         printk(" / %d\n", info->mnest);
-
-       core = &core_info;
-       for (i = 0; i < nr_cores; i++) {
-               core->next = alloc_bootmem(sizeof(struct core_info));
-               core = core->next;
-               if (!core)
-                       goto error;
-       }
-       return;
-error:
-       machine_has_topology = 0;
+       alloc_masks(info, &core_info, 2);
+#ifdef CONFIG_SCHED_BOOK
+       alloc_masks(info, &book_info, 3);
+#endif
  }
diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c

index 257de1f..ae5bac3 100644 (file)
--- a/arch/sh/kernel/irq.c
+++ b/arch/sh/kernel/irq.c
@@ -290,7 +290,7 @@ void __init init_IRQ(void)
  int __init arch_probe_nr_irqs(void)
  {
         nr_irqs = sh_mv.mv_nr_irqs;
-       return 0;
+       return NR_IRQS_LEGACY;
  }
  #endif
  
diff --git a/arch/sparc/kernel/pci_msi.c b/arch/sparc/kernel/pci_msi.c

index 548b8ca..b210416 100644 (file)
--- a/arch/sparc/kernel/pci_msi.c
+++ b/arch/sparc/kernel/pci_msi.c
@@ -114,10 +114,10 @@ static void free_msi(struct pci_pbm_info *pbm, int msi_num)
  
  static struct irq_chip msi_irq = {
         .name           = "PCI-MSI",
-       .mask           = mask_msi_irq,
-       .unmask         = unmask_msi_irq,
-       .enable         = unmask_msi_irq,
-       .disable        = mask_msi_irq,
+       .irq_mask       = mask_msi_irq,
+       .irq_unmask     = unmask_msi_irq,
+       .irq_enable     = unmask_msi_irq,
+       .irq_disable    = mask_msi_irq,
         /* XXX affinity XXX */
  };
  
diff --git a/arch/tile/kernel/irq.c b/arch/tile/kernel/irq.c

index 596c600..9a27d56 100644 (file)
--- a/arch/tile/kernel/irq.c
+++ b/arch/tile/kernel/irq.c
@@ -208,7 +208,7 @@ static void tile_irq_chip_eoi(unsigned int irq)
  }
  
  static struct irq_chip tile_irq_chip = {
-       .typename = "tile_irq_chip",
+       .name = "tile_irq_chip",
         .ack = tile_irq_chip_ack,
         .eoi = tile_irq_chip_eoi,
         .mask = tile_irq_chip_mask,
@@ -288,7 +288,7 @@ int show_interrupts(struct seq_file *p, void *v)
                 for_each_online_cpu(j)
                         seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
  #endif
-               seq_printf(p, " %14s", irq_desc[i].chip->typename);
+               seq_printf(p, " %14s", irq_desc[i].chip->name);
                 seq_printf(p, "  %s", action->name);
  
                 for (action = action->next; action; action = action->next)
diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c

index a3f0b04..a746e30 100644 (file)
--- a/arch/um/kernel/irq.c
+++ b/arch/um/kernel/irq.c
@@ -46,7 +46,7 @@ int show_interrupts(struct seq_file *p, void *v)
                 for_each_online_cpu(j)
                         seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
  #endif
-               seq_printf(p, " %14s", irq_desc[i].chip->typename);
+               seq_printf(p, " %14s", irq_desc[i].chip->name);
                 seq_printf(p, "  %s", action->name);
  
                 for (action=action->next; action; action = action->next)
@@ -369,7 +369,7 @@ static void dummy(unsigned int irq)
  
  /* This is used for everything else than the timer. */
  static struct irq_chip normal_irq_type = {
-       .typename = "SIGIO",
+       .name = "SIGIO",
         .release = free_irq_by_irq_and_dev,
         .disable = dummy,
         .enable = dummy,
@@ -378,7 +378,7 @@ static struct irq_chip normal_irq_type = {
  };
  
  static struct irq_chip SIGVTALRM_irq_type = {
-       .typename = "SIGVTALRM",
+       .name = "SIGVTALRM",
         .release = free_irq_by_irq_and_dev,
         .shutdown = dummy, /* never called */
         .disable = dummy,
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig

index fd227d6..7ab9db8 100644 (file)
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -63,6 +63,10 @@ config X86
         select HAVE_USER_RETURN_NOTIFIER
         select HAVE_ARCH_JUMP_LABEL
         select HAVE_TEXT_POKE_SMP
+       select HAVE_GENERIC_HARDIRQS
+       select HAVE_SPARSE_IRQ
+       select GENERIC_IRQ_PROBE
+       select GENERIC_PENDING_IRQ if SMP
  
  config INSTRUCTION_DECODER
         def_bool (KPROBES || PERF_EVENTS)
@@ -204,20 +208,6 @@ config HAVE_INTEL_TXT
         def_bool y
         depends on EXPERIMENTAL && DMAR && ACPI
  
-# Use the generic interrupt handling code in kernel/irq/:
-config GENERIC_HARDIRQS
-       def_bool y
-
-config GENERIC_HARDIRQS_NO__DO_IRQ
-       def_bool y
-
-config GENERIC_IRQ_PROBE
-       def_bool y
-
-config GENERIC_PENDING_IRQ
-       def_bool y
-       depends on GENERIC_HARDIRQS && SMP
-
  config USE_GENERIC_SMP_HELPERS
         def_bool y
         depends on SMP
@@ -300,23 +290,6 @@ config X86_X2APIC
  
           If you don't know what to do here, say N.
  
-config SPARSE_IRQ
-       bool "Support sparse irq numbering"
-       depends on PCI_MSI || HT_IRQ
-       ---help---
-         This enables support for sparse irqs. This is useful for distro
-         kernels that want to define a high CONFIG_NR_CPUS value but still
-         want to have low kernel memory footprint on smaller machines.
-
-         ( Sparse IRQs can also be beneficial on NUMA boxes, as they spread
-           out the irq_desc[] array in a more NUMA-friendly way. )
-
-         If you don't know what to do here, say N.
-
-config NUMA_IRQ_DESC
-       def_bool y
-       depends on SPARSE_IRQ && NUMA
-
  config X86_MPPARSE
         bool "Enable MPS table" if ACPI
         default y
@@ -521,25 +494,6 @@ if PARAVIRT_GUEST
  
  source "arch/x86/xen/Kconfig"
  
-config VMI
-       bool "VMI Guest support (DEPRECATED)"
-       select PARAVIRT
-       depends on X86_32
-       ---help---
-         VMI provides a paravirtualized interface to the VMware ESX server
-         (it could be used by other hypervisors in theory too, but is not
-         at the moment), by linking the kernel to a GPL-ed ROM module
-         provided by the hypervisor.
-
-         As of September 2009, VMware has started a phased retirement
-         of this feature from VMware's products. Please see
-         feature-removal-schedule.txt for details.  If you are
-         planning to enable this option, please note that you cannot
-         live migrate a VMI enabled VM to a future VMware product,
-         which doesn't support VMI. So if you expect your kernel to
-         seamlessly migrate to newer VMware products, keep this
-         disabled.
-
  config KVM_CLOCK
         bool "KVM paravirtualized clock"
         select PARAVIRT
@@ -674,7 +628,7 @@ config GART_IOMMU
         bool "GART IOMMU support" if EMBEDDED
         default y
         select SWIOTLB
-       depends on X86_64 && PCI && K8_NB
+       depends on X86_64 && PCI && AMD_NB
         ---help---
           Support for full DMA access of devices with 32bit memory access only
           on systems with more than 3GB. This is usually needed for USB,
@@ -799,6 +753,17 @@ config SCHED_MC
           making when dealing with multi-core CPU chips at a cost of slightly
           increased overhead in some places. If unsure say N here.
  
+config IRQ_TIME_ACCOUNTING
+       bool "Fine granularity task level IRQ time accounting"
+       default n
+       ---help---
+         Select this option to enable fine granularity task irq time
+         accounting. This is done by reading a timestamp on each
+         transitions between softirq and hardirq state, so there can be a
+         small performance impact.
+
+         If in doubt, say N here.
+
  source "kernel/Kconfig.preempt"
  
  config X86_UP_APIC
@@ -1152,6 +1117,9 @@ config X86_PAE
  config ARCH_PHYS_ADDR_T_64BIT
         def_bool X86_64 || X86_PAE
  
+config ARCH_DMA_ADDR_T_64BIT
+       def_bool X86_64 || HIGHMEM64G
+
  config DIRECT_GBPAGES
         bool "Enable 1GB pages for kernel pagetables" if EMBEDDED
         default y
@@ -1330,25 +1298,34 @@ config X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK
           Set whether the default state of memory_corruption_check is
           on or off.
  
-config X86_RESERVE_LOW_64K
-       bool "Reserve low 64K of RAM on AMI/Phoenix BIOSen"
-       default y
+config X86_RESERVE_LOW
+       int "Amount of low memory, in kilobytes, to reserve for the BIOS"
+       default 64
+       range 4 640
         ---help---
-         Reserve the first 64K of physical RAM on BIOSes that are known
-         to potentially corrupt that memory range. A numbers of BIOSes are
-         known to utilize this area during suspend/resume, so it must not
-         be used by the kernel.
+         Specify the amount of low memory to reserve for the BIOS.
+
+         The first page contains BIOS data structures that the kernel
+         must not use, so that page must always be reserved.
  
-         Set this to N if you are absolutely sure that you trust the BIOS
-         to get all its memory reservations and usages right.
+         By default we reserve the first 64K of physical RAM, as a
+         number of BIOSes are known to corrupt that memory range
+         during events such as suspend/resume or monitor cable
+         insertion, so it must not be used by the kernel.
  
-         If you have doubts about the BIOS (e.g. suspend/resume does not
-         work or there's kernel crashes after certain hardware hotplug
-         events) and it's not AMI or Phoenix, then you might want to enable
-         X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check typical
-         corruption patterns.
+         You can set this to 4 if you are absolutely sure that you
+         trust the BIOS to get all its memory reservations and usages
+         right.  If you know your BIOS has problems beyond the
+         default 64K area, you can set this to 640 to avoid using the
+         entire low memory range.
  
-         Say Y if unsure.
+         If you have doubts about the BIOS (e.g. suspend/resume does
+         not work or there's kernel crashes after certain hardware
+         hotplug events) then you might want to enable
+         X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check
+         typical corruption patterns.
+
+         Leave this to the default value of 64 if you are unsure.
  
  config MATH_EMULATION
         bool
@@ -1904,7 +1881,7 @@ config PCI_GODIRECT
         bool "Direct"
  
  config PCI_GOOLPC
-       bool "OLPC"
+       bool "OLPC XO-1"
         depends on OLPC
  
  config PCI_GOANY
@@ -2065,14 +2042,21 @@ config SCx200HR_TIMER
  config OLPC
         bool "One Laptop Per Child support"
         select GPIOLIB
+       select OLPC_OPENFIRMWARE
         ---help---
           Add support for detecting the unique features of the OLPC
           XO hardware.
  
+config OLPC_XO1
+       tristate "OLPC XO-1 support"
+       depends on OLPC && PCI
+       ---help---
+         Add support for non-essential features of the OLPC XO-1 laptop.
+
  config OLPC_OPENFIRMWARE
         bool "Support for OLPC's Open Firmware"
         depends on !X86_64 && !X86_PAE
-       default y if OLPC
+       default n
         help
           This option adds support for the implementation of Open Firmware
           that is used on the OLPC XO-1 Children's Machine.
@@ -2080,7 +2064,7 @@ config OLPC_OPENFIRMWARE
  
  endif # X86_32
  
-config K8_NB
+config AMD_NB
         def_bool y
         depends on CPU_SUP_AMD && PCI
  
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug

index 7508508..e5bb96b 100644 (file)
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -43,6 +43,10 @@ config EARLY_PRINTK
           with klogd/syslogd or the X server. You should normally N here,
           unless you want to debug such a crash.
  
+config EARLY_PRINTK_MRST
+       bool "Early printk for MRST platform support"
+       depends on EARLY_PRINTK && X86_MRST
+
  config EARLY_PRINTK_DBGP
         bool "Early printk via EHCI debug port"
         depends on EARLY_PRINTK && PCI
diff --git a/arch/x86/Makefile b/arch/x86/Makefile

index e8c8881..b02e509 100644 (file)
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -96,8 +96,12 @@ cfi := $(call as-instr,.cfi_startproc\n.cfi_rel_offset $(sp-y)$(comma)0\n.cfi_en
  # is .cfi_signal_frame supported too?
  cfi-sigframe := $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1)
  cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTIONS=1)
-KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections)
-KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections)
+
+# does binutils support specific instructions?
+asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1)
+
+KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr)
+KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr)
  
  LDFLAGS := -m elf_$(UTS_MACHINE)
  
diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h

index 5af2982..f16a2ca 100644 (file)
--- a/arch/x86/include/asm/amd_iommu.h
+++ b/arch/x86/include/asm/amd_iommu.h
@@ -1,5 +1,5 @@
  /*
- * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
+ * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
   * Author: Joerg Roedel <joerg.roedel@amd.com>
   *         Leo Duran <leo.duran@amd.com>
   *
diff --git a/arch/x86/include/asm/amd_iommu_proto.h b/arch/x86/include/asm/amd_iommu_proto.h

index cb03037..916bc81 100644 (file)
--- a/arch/x86/include/asm/amd_iommu_proto.h
+++ b/arch/x86/include/asm/amd_iommu_proto.h
@@ -1,5 +1,5 @@
  /*
- * Copyright (C) 2009 Advanced Micro Devices, Inc.
+ * Copyright (C) 2009-2010 Advanced Micro Devices, Inc.
   * Author: Joerg Roedel <joerg.roedel@amd.com>
   *
   * This program is free software; you can redistribute it and/or modify it
diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h

index 0861618..e3509fc 100644 (file)
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -1,5 +1,5 @@
  /*
- * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
+ * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
   * Author: Joerg Roedel <joerg.roedel@amd.com>
   *         Leo Duran <leo.duran@amd.com>
   *
@@ -416,13 +416,22 @@ struct amd_iommu {
         struct dma_ops_domain *default_dom;
  
         /*
-        * This array is required to work around a potential BIOS bug.
-        * The BIOS may miss to restore parts of the PCI configuration
-        * space when the system resumes from S3. The result is that the
-        * IOMMU does not execute commands anymore which leads to system
-        * failure.
+        * We can't rely on the BIOS to restore all values on reinit, so we
+        * need to stash them
          */
-       u32 cache_cfg[4];
+
+       /* The iommu BAR */
+       u32 stored_addr_lo;
+       u32 stored_addr_hi;
+
+       /*
+        * Each iommu has 6 l1s, each of which is documented as having 0x12
+        * registers
+        */
+       u32 stored_l1[6][0x12];
+
+       /* The l2 indirect registers */
+       u32 stored_l2[0x83];
  };
  
  /*
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h

new file mode 100644 (file)

index 0000000..c8517f8
--- /dev/null
+++ b/arch/x86/include/asm/amd_nb.h
@@ -0,0 +1,39 @@
+#ifndef _ASM_X86_AMD_NB_H
+#define _ASM_X86_AMD_NB_H
+
+#include <linux/pci.h>
+
+extern struct pci_device_id k8_nb_ids[];
+struct bootnode;
+
+extern int early_is_k8_nb(u32 value);
+extern int cache_k8_northbridges(void);
+extern void k8_flush_garts(void);
+extern int k8_get_nodes(struct bootnode *nodes);
+extern int k8_numa_init(unsigned long start_pfn, unsigned long end_pfn);
+extern int k8_scan_nodes(void);
+
+struct k8_northbridge_info {
+       u16 num;
+       u8 gart_supported;
+       struct pci_dev **nb_misc;
+};
+extern struct k8_northbridge_info k8_northbridges;
+
+#ifdef CONFIG_AMD_NB
+
+static inline struct pci_dev *node_to_k8_nb_misc(int node)
+{
+       return (node < k8_northbridges.num) ? k8_northbridges.nb_misc[node] : NULL;
+}
+
+#else
+
+static inline struct pci_dev *node_to_k8_nb_misc(int node)
+{
+       return NULL;
+}
+#endif
+
+
+#endif /* _ASM_X86_AMD_NB_H */
diff --git a/arch/x86/include/asm/apb_timer.h b/arch/x86/include/asm/apb_timer.h

index a69b1ac..2fefa50 100644 (file)
--- a/arch/x86/include/asm/apb_timer.h
+++ b/arch/x86/include/asm/apb_timer.h
@@ -54,7 +54,6 @@ extern struct clock_event_device *global_clock_event;
  extern unsigned long apbt_quick_calibrate(void);
  extern int arch_setup_apbt_irqs(int irq, int trigger, int mask, int cpu);
  extern void apbt_setup_secondary_clock(void);
-extern unsigned int boot_cpu_id;
  
  extern struct sfi_timer_table_entry *sfi_get_mtmr(int hint);
  extern void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr);
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h

index 1fa03e0..286de34 100644 (file)
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -252,9 +252,7 @@ static inline int apic_is_clustered_box(void)
  }
  #endif
  
-extern u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask);
-extern u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask);
-
+extern int setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask);
  
  #else /* !CONFIG_X86_LOCAL_APIC */
  static inline void lapic_shutdown(void) { }
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h

index 7fe3b30..a859ca4 100644 (file)
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -131,6 +131,7 @@
  #define APIC_EILVTn(n) (0x500 + 0x10 * n)
  #define                APIC_EILVT_NR_AMD_K8    1       /* # of extended interrupts */
  #define                APIC_EILVT_NR_AMD_10H   4
+#define                APIC_EILVT_NR_MAX       APIC_EILVT_NR_AMD_10H
  #define                APIC_EILVT_LVTOFF(x)    (((x) >> 4) & 0xF)
  #define                APIC_EILVT_MSG_FIX      0x0
  #define                APIC_EILVT_MSG_SMI      0x2
diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h

index b185091..4fab24d 100644 (file)
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -32,6 +32,5 @@ extern void arch_unregister_cpu(int);
  
  DECLARE_PER_CPU(int, cpu_state);
  
-extern unsigned int boot_cpu_id;
  
  #endif /* _ASM_X86_CPU_H */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h

index 3f76523..220e2ea 100644 (file)
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -152,10 +152,14 @@
  #define X86_FEATURE_3DNOWPREFETCH (6*32+ 8) /* 3DNow prefetch instructions */
  #define X86_FEATURE_OSVW       (6*32+ 9) /* OS Visible Workaround */
  #define X86_FEATURE_IBS                (6*32+10) /* Instruction Based Sampling */
-#define X86_FEATURE_SSE5       (6*32+11) /* SSE-5 */
+#define X86_FEATURE_XOP                (6*32+11) /* extended AVX instructions */
  #define X86_FEATURE_SKINIT     (6*32+12) /* SKINIT/STGI instructions */
  #define X86_FEATURE_WDT                (6*32+13) /* Watchdog timer */
+#define X86_FEATURE_LWP                (6*32+15) /* Light Weight Profiling */
+#define X86_FEATURE_FMA4       (6*32+16) /* 4 operands MAC instructions */
  #define X86_FEATURE_NODEID_MSR (6*32+19) /* NodeId MSR */
+#define X86_FEATURE_TBM                (6*32+21) /* trailing bit manipulations */
+#define X86_FEATURE_TOPOEXT    (6*32+22) /* topology extensions CPUID leafs */
  
  /*
   * Auxiliary flags: Linux defined - For features scattered in various
@@ -180,6 +184,13 @@
  #define X86_FEATURE_LBRV       (8*32+ 6) /* AMD LBR Virtualization support */
  #define X86_FEATURE_SVML       (8*32+ 7) /* "svm_lock" AMD SVM locking MSR */
  #define X86_FEATURE_NRIPS      (8*32+ 8) /* "nrip_save" AMD SVM next_rip save */
+#define X86_FEATURE_TSCRATEMSR  (8*32+ 9) /* "tsc_scale" AMD TSC scaling support */
+#define X86_FEATURE_VMCBCLEAN   (8*32+10) /* "vmcb_clean" AMD VMCB clean bits support */
+#define X86_FEATURE_FLUSHBYASID (8*32+11) /* AMD flush-by-ASID support */
+#define X86_FEATURE_DECODEASSISTS (8*32+12) /* AMD Decode Assists support */
+#define X86_FEATURE_PAUSEFILTER (8*32+13) /* AMD filtered pause intercept */
+#define X86_FEATURE_PFTHRESHOLD (8*32+14) /* AMD pause filter threshold */
+
  
  /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
  #define X86_FEATURE_FSGSBASE   (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h

index 733f7e9..3260991 100644 (file)
--- a/arch/x86/include/asm/dwarf2.h
+++ b/arch/x86/include/asm/dwarf2.h
@@ -89,6 +89,16 @@
         CFI_ADJUST_CFA_OFFSET -8
         .endm
  
+       .macro pushfq_cfi
+       pushfq
+       CFI_ADJUST_CFA_OFFSET 8
+       .endm
+
+       .macro popfq_cfi
+       popfq
+       CFI_ADJUST_CFA_OFFSET -8
+       .endm
+
         .macro movq_cfi reg offset=0
         movq %\reg, \offset(%rsp)
         CFI_REL_OFFSET \reg, \offset
@@ -109,6 +119,16 @@
         CFI_ADJUST_CFA_OFFSET -4
         .endm
  
+       .macro pushfl_cfi
+       pushfl
+       CFI_ADJUST_CFA_OFFSET 4
+       .endm
+
+       .macro popfl_cfi
+       popfl
+       CFI_ADJUST_CFA_OFFSET -4
+       .endm
+
         .macro movl_cfi reg offset=0
         movl %\reg, \offset(%esp)
         CFI_REL_OFFSET \reg, \offset
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h

index d07b44f..4d293dc 100644 (file)
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -214,5 +214,20 @@ static inline unsigned long virt_to_fix(const unsigned long vaddr)
         BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
         return __virt_to_fix(vaddr);
  }
+
+/* Set fixmap idx to phys and return the virtual address plus page offset */
+static inline unsigned long __set_fixmap_offset(enum fixed_addresses idx,
+                               phys_addr_t phys, pgprot_t flags)
+{
+       __set_fixmap(idx, phys, flags);
+       return fix_to_virt(idx) + (phys & (PAGE_SIZE - 1));
+}
+
+#define set_fixmap_offset(idx, phys)                   \
+       __set_fixmap_offset(idx, phys, PAGE_KERNEL)
+
+#define set_fixmap_offset_nocache(idx, phys)                   \
+       __set_fixmap_offset(idx, phys, PAGE_KERNEL_NOCACHE)
+
  #endif /* !__ASSEMBLY__ */
  #endif /* _ASM_X86_FIXMAP_H */
diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h

index 4ac5b0f..bf357f9 100644 (file)
--- a/arch/x86/include/asm/gart.h
+++ b/arch/x86/include/asm/gart.h
@@ -17,6 +17,7 @@ extern int fix_aperture;
  #define GARTEN         (1<<0)
  #define DISGARTCPU     (1<<4)
  #define DISGARTIO      (1<<5)
+#define DISTLBWALKPRB  (1<<6)
  
  /* GART cache control register bits. */
  #define INVGART                (1<<0)
@@ -27,7 +28,6 @@ extern int fix_aperture;
  #define AMD64_GARTAPERTUREBASE 0x94
  #define AMD64_GARTTABLEBASE    0x98
  #define AMD64_GARTCACHECTL     0x9c
-#define AMD64_GARTEN           (1<<0)
  
  #ifdef CONFIG_GART_IOMMU
  extern int gart_iommu_aperture;
@@ -57,6 +57,19 @@ static inline void gart_iommu_hole_init(void)
  
  extern int agp_amd64_init(void);
  
+static inline void gart_set_size_and_enable(struct pci_dev *dev, u32 order)
+{
+       u32 ctl;
+
+       /*
+        * Don't enable translation but enable GART IO and CPU accesses.
+        * Also, set DISTLBWALKPRB since GART tables memory is UC.
+        */
+       ctl = DISTLBWALKPRB | order << 1;
+
+       pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl);
+}
+
  static inline void enable_gart_translation(struct pci_dev *dev, u64 addr)
  {
         u32 tmp, ctl;
diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h

index 1d5c08a..2c392d6 100644 (file)
--- a/arch/x86/include/asm/hpet.h
+++ b/arch/x86/include/asm/hpet.h
@@ -74,10 +74,12 @@ extern void hpet_disable(void);
  extern unsigned int hpet_readl(unsigned int a);
  extern void force_hpet_resume(void);
  
-extern void hpet_msi_unmask(unsigned int irq);
-extern void hpet_msi_mask(unsigned int irq);
-extern void hpet_msi_write(unsigned int irq, struct msi_msg *msg);
-extern void hpet_msi_read(unsigned int irq, struct msi_msg *msg);
+struct irq_data;
+extern void hpet_msi_unmask(struct irq_data *data);
+extern void hpet_msi_mask(struct irq_data *data);
+struct hpet_dev;
+extern void hpet_msi_write(struct hpet_dev *hdev, struct msi_msg *msg);
+extern void hpet_msi_read(struct hpet_dev *hdev, struct msi_msg *msg);
  
  #ifdef CONFIG_PCI_MSI
  extern int arch_setup_hpet_msi(unsigned int irq, unsigned int id);
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h

index 3a54a1c..0274ec5 100644 (file)
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -78,6 +78,13 @@ static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr,
         irq_attr->polarity      = polarity;
  }
  
+struct irq_2_iommu {
+       struct intel_iommu *iommu;
+       u16 irte_index;
+       u16 sub_handle;
+       u8  irte_mask;
+};
+
  /*
   * This is performance-critical, we want to do it O(1)
   *
@@ -89,15 +96,17 @@ struct irq_cfg {
         cpumask_var_t           old_domain;
         u8                      vector;
         u8                      move_in_progress : 1;
+#ifdef CONFIG_INTR_REMAP
+       struct irq_2_iommu      irq_2_iommu;
+#endif
  };
  
-extern struct irq_cfg *irq_cfg(unsigned int);
  extern int assign_irq_vector(int, struct irq_cfg *, const struct cpumask *);
  extern void send_cleanup_vector(struct irq_cfg *);
  
-struct irq_desc;
-extern unsigned int set_desc_affinity(struct irq_desc *, const struct cpumask *,
-                                     unsigned int *dest_id);
+struct irq_data;
+int __ioapic_set_affinity(struct irq_data *, const struct cpumask *,
+                         unsigned int *dest_id);
  extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin, struct io_apic_irq_attr *irq_attr);
  extern void setup_ioapic_dest(void);
  
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h

index a73a8d5..4aa2bb3 100644 (file)
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -55,6 +55,12 @@ extern int save_i387_xstate_ia32(void __user *buf);
  extern int restore_i387_xstate_ia32(void __user *buf);
  #endif
  
+#ifdef CONFIG_MATH_EMULATION
+extern void finit_soft_fpu(struct i387_soft_struct *soft);
+#else
+static inline void finit_soft_fpu(struct i387_soft_struct *soft) {}
+#endif
+
  #define X87_FSW_ES (1 << 7)    /* Exception Summary */
  
  static __always_inline __pure bool use_xsaveopt(void)
@@ -67,6 +73,11 @@ static __always_inline __pure bool use_xsave(void)
         return static_cpu_has(X86_FEATURE_XSAVE);
  }
  
+static __always_inline __pure bool use_fxsr(void)
+{
+        return static_cpu_has(X86_FEATURE_FXSR);
+}
+
  extern void __sanitize_i387_state(struct task_struct *);
  
  static inline void sanitize_i387_state(struct task_struct *tsk)
@@ -77,19 +88,11 @@ static inline void sanitize_i387_state(struct task_struct *tsk)
  }
  
  #ifdef CONFIG_X86_64
-
-/* Ignore delayed exceptions from user space */
-static inline void tolerant_fwait(void)
-{
-       asm volatile("1: fwait\n"
-                    "2:\n"
-                    _ASM_EXTABLE(1b, 2b));
-}
-
  static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
  {
         int err;
  
+       /* See comment in fxsave() below. */
         asm volatile("1:  rex64/fxrstor (%[fx])\n\t"
                      "2:\n"
                      ".section .fixup,\"ax\"\n"
@@ -98,44 +101,10 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
                      ".previous\n"
                      _ASM_EXTABLE(1b, 3b)
                      : [err] "=r" (err)
-#if 0 /* See comment in fxsave() below. */
-                    : [fx] "r" (fx), "m" (*fx), "0" (0));
-#else
-                    : [fx] "cdaSDb" (fx), "m" (*fx), "0" (0));
-#endif
+                    : [fx] "R" (fx), "m" (*fx), "0" (0));
         return err;
  }
  
-/* AMD CPUs don't save/restore FDP/FIP/FOP unless an exception
-   is pending. Clear the x87 state here by setting it to fixed
-   values. The kernel data segment can be sometimes 0 and sometimes
-   new user value. Both should be ok.
-   Use the PDA as safe address because it should be already in L1. */
-static inline void fpu_clear(struct fpu *fpu)
-{
-       struct xsave_struct *xstate = &fpu->state->xsave;
-       struct i387_fxsave_struct *fx = &fpu->state->fxsave;
-
-       /*
-        * xsave header may indicate the init state of the FP.
-        */
-       if (use_xsave() &&
-           !(xstate->xsave_hdr.xstate_bv & XSTATE_FP))
-               return;
-
-       if (unlikely(fx->swd & X87_FSW_ES))
-               asm volatile("fnclex");
-       alternative_input(ASM_NOP8 ASM_NOP2,
-                         "    emms\n"          /* clear stack tags */
-                         "    fildl %%gs:0",   /* load to clear state */
-                         X86_FEATURE_FXSAVE_LEAK);
-}
-
-static inline void clear_fpu_state(struct task_struct *tsk)
-{
-       fpu_clear(&tsk->thread.fpu);
-}
-
  static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
  {
         int err;
@@ -149,6 +118,7 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
         if (unlikely(err))
                 return -EFAULT;
  
+       /* See comment in fxsave() below. */
         asm volatile("1:  rex64/fxsave (%[fx])\n\t"
                      "2:\n"
                      ".section .fixup,\"ax\"\n"
@@ -157,11 +127,7 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
                      ".previous\n"
                      _ASM_EXTABLE(1b, 3b)
                      : [err] "=r" (err), "=m" (*fx)
-#if 0 /* See comment in fxsave() below. */
-                    : [fx] "r" (fx), "0" (0));
-#else
-                    : [fx] "cdaSDb" (fx), "0" (0));
-#endif
+                    : [fx] "R" (fx), "0" (0));
         if (unlikely(err) &&
             __clear_user(fx, sizeof(struct i387_fxsave_struct)))
                 err = -EFAULT;
@@ -175,56 +141,29 @@ static inline void fpu_fxsave(struct fpu *fpu)
            uses any extended registers for addressing, a second REX prefix
            will be generated (to the assembler, rex64 followed by semicolon
            is a separate instruction), and hence the 64-bitness is lost. */
-#if 0
+
+#ifdef CONFIG_AS_FXSAVEQ
         /* Using "fxsaveq %0" would be the ideal choice, but is only supported
            starting with gas 2.16. */
         __asm__ __volatile__("fxsaveq %0"
                              : "=m" (fpu->state->fxsave));
-#elif 0
+#else
         /* Using, as a workaround, the properly prefixed form below isn't
            accepted by any binutils version so far released, complaining that
            the same type of prefix is used twice if an extended register is
-          needed for addressing (fix submitted to mainline 2005-11-21). */
-       __asm__ __volatile__("rex64/fxsave %0"
-                            : "=m" (fpu->state->fxsave));
-#else
-       /* This, however, we can work around by forcing the compiler to select
+          needed for addressing (fix submitted to mainline 2005-11-21).
+       asm volatile("rex64/fxsave %0"
+                    : "=m" (fpu->state->fxsave));
+          This, however, we can work around by forcing the compiler to select
            an addressing mode that doesn't require extended registers. */
-       __asm__ __volatile__("rex64/fxsave (%1)"
-                            : "=m" (fpu->state->fxsave)
-                            : "cdaSDb" (&fpu->state->fxsave));
+       asm volatile("rex64/fxsave (%[fx])"
+                    : "=m" (fpu->state->fxsave)
+                    : [fx] "R" (&fpu->state->fxsave));
  #endif
  }
  
-static inline void fpu_save_init(struct fpu *fpu)
-{
-       if (use_xsave())
-               fpu_xsave(fpu);
-       else
-               fpu_fxsave(fpu);
-
-       fpu_clear(fpu);
-}
-
-static inline void __save_init_fpu(struct task_struct *tsk)
-{
-       fpu_save_init(&tsk->thread.fpu);
-       task_thread_info(tsk)->status &= ~TS_USEDFPU;
-}
-
  #else  /* CONFIG_X86_32 */
  
-#ifdef CONFIG_MATH_EMULATION
-extern void finit_soft_fpu(struct i387_soft_struct *soft);
-#else
-static inline void finit_soft_fpu(struct i387_soft_struct *soft) {}
-#endif
-
-static inline void tolerant_fwait(void)
-{
-       asm volatile("fnclex ; fwait");
-}
-
  /* perform fxrstor iff the processor has extended states, otherwise frstor */
  static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
  {
@@ -241,6 +180,14 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
         return 0;
  }
  
+static inline void fpu_fxsave(struct fpu *fpu)
+{
+       asm volatile("fxsave %[fx]"
+                    : [fx] "=m" (fpu->state->fxsave));
+}
+
+#endif /* CONFIG_X86_64 */
+
  /* We need a safe address that is cheap to find and that is already
     in L1 during context switch. The best choices are unfortunately
     different for UP and SMP */
@@ -256,47 +203,33 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
  static inline void fpu_save_init(struct fpu *fpu)
  {
         if (use_xsave()) {
-               struct xsave_struct *xstate = &fpu->state->xsave;
-               struct i387_fxsave_struct *fx = &fpu->state->fxsave;
-
                 fpu_xsave(fpu);
  
                 /*
                  * xsave header may indicate the init state of the FP.
                  */
-               if (!(xstate->xsave_hdr.xstate_bv & XSTATE_FP))
-                       goto end;
-
-               if (unlikely(fx->swd & X87_FSW_ES))
-                       asm volatile("fnclex");
-
-               /*
-                * we can do a simple return here or be paranoid :)
-                */
-               goto clear_state;
+               if (!(fpu->state->xsave.xsave_hdr.xstate_bv & XSTATE_FP))
+                       return;
+       } else if (use_fxsr()) {
+               fpu_fxsave(fpu);
+       } else {
+               asm volatile("fsave %[fx]; fwait"
+                            : [fx] "=m" (fpu->state->fsave));
+               return;
         }
  
-       /* Use more nops than strictly needed in case the compiler
-          varies code */
-       alternative_input(
-               "fnsave %[fx] ;fwait;" GENERIC_NOP8 GENERIC_NOP4,
-               "fxsave %[fx]\n"
-               "bt $7,%[fsw] ; jnc 1f ; fnclex\n1:",
-               X86_FEATURE_FXSR,
-               [fx] "m" (fpu->state->fxsave),
-               [fsw] "m" (fpu->state->fxsave.swd) : "memory");
-clear_state:
+       if (unlikely(fpu->state->fxsave.swd & X87_FSW_ES))
+               asm volatile("fnclex");
+
         /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
            is pending.  Clear the x87 state here by setting it to fixed
            values. safe_address is a random variable that should be in L1 */
         alternative_input(
-               GENERIC_NOP8 GENERIC_NOP2,
+               ASM_NOP8 ASM_NOP2,
                 "emms\n\t"              /* clear stack tags */
-               "fildl %[addr]",        /* set F?P to defined value */
+               "fildl %P[addr]",       /* set F?P to defined value */
                 X86_FEATURE_FXSAVE_LEAK,
                 [addr] "m" (safe_address));
-end:
-       ;
  }
  
  static inline void __save_init_fpu(struct task_struct *tsk)
@@ -305,9 +238,6 @@ static inline void __save_init_fpu(struct task_struct *tsk)
         task_thread_info(tsk)->status &= ~TS_USEDFPU;
  }
  
-
-#endif /* CONFIG_X86_64 */
-
  static inline int fpu_fxrstor_checking(struct fpu *fpu)
  {
         return fxrstor_checking(&fpu->state->fxsave);
@@ -344,7 +274,10 @@ static inline void __unlazy_fpu(struct task_struct *tsk)
  static inline void __clear_fpu(struct task_struct *tsk)
  {
         if (task_thread_info(tsk)->status & TS_USEDFPU) {
-               tolerant_fwait();
+               /* Ignore delayed exceptions from user space */
+               asm volatile("1: fwait\n"
+                            "2:\n"
+                            _ASM_EXTABLE(1b, 2b));
                 task_thread_info(tsk)->status &= ~TS_USEDFPU;
                 stts();
         }
@@ -405,19 +338,6 @@ static inline void irq_ts_restore(int TS_state)
                 stts();
  }
  
-#ifdef CONFIG_X86_64
-
-static inline void save_init_fpu(struct task_struct *tsk)
-{
-       __save_init_fpu(tsk);
-       stts();
-}
-
-#define unlazy_fpu     __unlazy_fpu
-#define clear_fpu      __clear_fpu
-
-#else  /* CONFIG_X86_32 */
-
  /*
   * These disable preemption on their own and are safe
   */
@@ -443,8 +363,6 @@ static inline void clear_fpu(struct task_struct *tsk)
         preempt_enable();
  }
  
-#endif /* CONFIG_X86_64 */
-
  /*
   * i387 state interaction
   */
@@ -508,7 +426,4 @@ extern void fpu_finit(struct fpu *fpu);
  
  #endif /* __ASSEMBLY__ */
  
-#define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5
-#define PSHUFB_XMM5_XMM6 .byte 0x66, 0x0f, 0x38, 0x00, 0xf5
-
  #endif /* _ASM_X86_I387_H */
diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h

index 1655147..a203659 100644 (file)
--- a/arch/x86/include/asm/i8259.h
+++ b/arch/x86/include/asm/i8259.h
@@ -55,6 +55,8 @@ extern struct irq_chip i8259A_chip;
  struct legacy_pic {
         int nr_legacy_irqs;
         struct irq_chip *chip;
+       void (*mask)(unsigned int irq);
+       void (*unmask)(unsigned int irq);
         void (*mask_all)(void);
         void (*restore_mask)(void);
         void (*init)(int auto_eoi);
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h

index 30a3e97..6a45ec4 100644 (file)
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -206,6 +206,7 @@ static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
  
  extern void iounmap(volatile void __iomem *addr);
  
+extern void set_iounmap_nonlazy(void);
  
  #ifdef __KERNEL__
  
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h

index 9cb2edb..c8be456 100644 (file)
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -170,12 +170,6 @@ extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
  
  extern void probe_nr_irqs_gsi(void);
  
-extern int setup_ioapic_entry(int apic, int irq,
-                             struct IO_APIC_route_entry *entry,
-                             unsigned int destination, int trigger,
-                             int polarity, int vector, int pin);
-extern void ioapic_write_entry(int apic, int pin,
-                              struct IO_APIC_route_entry e);
  extern void setup_ioapic_ids_from_mpc(void);
  
  struct mp_ioapic_gsi{
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h

index f275e22..1c23360 100644 (file)
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -3,4 +3,39 @@
  
  #define IRTE_DEST(dest) ((x2apic_mode) ? dest : dest << 8)
  
+#ifdef CONFIG_INTR_REMAP
+static inline void prepare_irte(struct irte *irte, int vector,
+                               unsigned int dest)
+{
+       memset(irte, 0, sizeof(*irte));
+
+       irte->present = 1;
+       irte->dst_mode = apic->irq_dest_mode;
+       /*
+        * Trigger mode in the IRTE will always be edge, and for IO-APIC, the
+        * actual level or edge trigger will be setup in the IO-APIC
+        * RTE. This will help simplify level triggered irq migration.
+        * For more details, see the comments (in io_apic.c) explaining IO-APIC
+        * irq migration in the presence of interrupt-remapping.
+       */
+       irte->trigger_mode = 0;
+       irte->dlvry_mode = apic->irq_delivery_mode;
+       irte->vector = vector;
+       irte->dest_id = IRTE_DEST(dest);
+       irte->redir_hint = 1;
+}
+static inline bool irq_remapped(struct irq_cfg *cfg)
+{
+       return cfg->irq_2_iommu.iommu != NULL;
+}
+#else
+/* No-op stub used when CONFIG_INTR_REMAP is not set. */
+static inline void prepare_irte(struct irte *irte, int vector,
+                               unsigned int dest) { }
+static inline bool irq_remapped(struct irq_cfg *cfg)
+{
+       return false;
+}
+#endif
+
  #endif /* _ASM_X86_IRQ_REMAPPING_H */
diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/k8.h

deleted file mode 100644 (file)

index af00bd1..0000000
--- a/arch/x86/include/asm/k8.h
+++ /dev/null
@@ -1,36 +0,0 @@
-#ifndef _ASM_X86_K8_H
-#define _ASM_X86_K8_H
-
-#include <linux/pci.h>
-
-extern struct pci_device_id k8_nb_ids[];
-struct bootnode;
-
-extern int early_is_k8_nb(u32 value);
-extern struct pci_dev **k8_northbridges;
-extern int num_k8_northbridges;
-extern int cache_k8_northbridges(void);
-extern void k8_flush_garts(void);
-extern int k8_get_nodes(struct bootnode *nodes);
-extern int k8_numa_init(unsigned long start_pfn, unsigned long end_pfn);
-extern int k8_scan_nodes(void);
-
-#ifdef CONFIG_K8_NB
-extern int num_k8_northbridges;
-
-static inline struct pci_dev *node_to_k8_nb_misc(int node)
-{
-       return (node < num_k8_northbridges) ? k8_northbridges[node] : NULL;
-}
-
-#else
-#define num_k8_northbridges 0
-
-static inline struct pci_dev *node_to_k8_nb_misc(int node)
-{
-       return NULL;
-}
-#endif
-
-
-#endif /* _ASM_X86_K8_H */
diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h

index 1635074..4a711a6 100644 (file)
--- a/arch/x86/include/asm/mrst.h
+++ b/arch/x86/include/asm/mrst.h
@@ -10,6 +10,9 @@
   */
  #ifndef _ASM_X86_MRST_H
  #define _ASM_X86_MRST_H
+
+#include <linux/sfi.h>
+
  extern int pci_mrst_init(void);
  int __init sfi_parse_mrtc(struct sfi_table_header *table);
  
@@ -26,7 +29,7 @@ enum mrst_cpu_type {
  };
  
  extern enum mrst_cpu_type __mrst_cpu_chip;
-static enum mrst_cpu_type mrst_identify_cpu(void)
+static inline enum mrst_cpu_type mrst_identify_cpu(void)
  {
         return __mrst_cpu_chip;
  }
@@ -42,4 +45,9 @@ extern enum mrst_timer_options mrst_timer_options;
  #define SFI_MTMR_MAX_NUM 8
  #define SFI_MRTC_MAX   8
  
+extern struct console early_mrst_console;
+extern void mrst_early_console_init(void);
+
+extern struct console early_hsu_console;
+extern void hsu_early_console_init(void);
  #endif /* _ASM_X86_MRST_H */
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h

new file mode 100644 (file)

index 0000000..bcdff99
--- /dev/null
+++ b/arch/x86/include/asm/mwait.h
@@ -0,0 +1,15 @@
+#ifndef _ASM_X86_MWAIT_H
+#define _ASM_X86_MWAIT_H
+
+#define MWAIT_SUBSTATE_MASK            0xf
+#define MWAIT_CSTATE_MASK              0xf
+#define MWAIT_SUBSTATE_SIZE            4
+#define MWAIT_MAX_NUM_CSTATES          8
+
+#define CPUID_MWAIT_LEAF               5
+#define CPUID5_ECX_EXTENSIONS_SUPPORTED 0x1
+#define CPUID5_ECX_INTERRUPT_BREAK     0x2
+
+#define MWAIT_ECX_INTERRUPT_BREAK      0x1
+
+#endif /* _ASM_X86_MWAIT_H */
diff --git a/arch/x86/include/asm/olpc_ofw.h b/arch/x86/include/asm/olpc_ofw.h

index 08fde47..2a84781 100644 (file)
--- a/arch/x86/include/asm/olpc_ofw.h
+++ b/arch/x86/include/asm/olpc_ofw.h
@@ -21,10 +21,14 @@ extern void olpc_ofw_detect(void);
  /* install OFW's pde permanently into the kernel's pgtable */
  extern void setup_olpc_ofw_pgd(void);
  
+/* check if OFW was detected during boot */
+extern bool olpc_ofw_present(void);
+
  #else /* !CONFIG_OLPC_OPENFIRMWARE */
  
  static inline void olpc_ofw_detect(void) { }
  static inline void setup_olpc_ofw_pgd(void) { }
+static inline bool olpc_ofw_present(void) { return false; }
  
  #endif /* !CONFIG_OLPC_OPENFIRMWARE */
  
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h

index a667f24..1df6621 100644 (file)
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -8,7 +8,7 @@
  #define PAGE_SIZE      (_AC(1,UL) << PAGE_SHIFT)
  #define PAGE_MASK      (~(PAGE_SIZE-1))
  
-#define __PHYSICAL_MASK                ((phys_addr_t)(1ULL << __PHYSICAL_MASK_SHIFT) - 1)
+#define __PHYSICAL_MASK                ((phys_addr_t)((1ULL << __PHYSICAL_MASK_SHIFT) - 1))
  #define __VIRTUAL_MASK         ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
  
  /* Cast PAGE_MASK to a signed type so that it is sign-extended if
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h

index 5653f43..edecb4e 100644 (file)
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -416,11 +416,6 @@ static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
         PVOP_VCALL2(pv_mmu_ops.alloc_pmd, mm, pfn);
  }
  
-static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn,
-                                           unsigned long start, unsigned long count)
-{
-       PVOP_VCALL4(pv_mmu_ops.alloc_pmd_clone, pfn, clonepfn, start, count);
-}
  static inline void paravirt_release_pmd(unsigned long pfn)
  {
         PVOP_VCALL1(pv_mmu_ops.release_pmd, pfn);
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h

index db9ef55..b82bac9 100644 (file)
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -255,7 +255,6 @@ struct pv_mmu_ops {
          */
         void (*alloc_pte)(struct mm_struct *mm, unsigned long pfn);
         void (*alloc_pmd)(struct mm_struct *mm, unsigned long pfn);
-       void (*alloc_pmd_clone)(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count);
         void (*alloc_pud)(struct mm_struct *mm, unsigned long pfn);
         void (*release_pte)(unsigned long pfn);
         void (*release_pmd)(unsigned long pfn);
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h

index a34c785..ada823a 100644 (file)
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -28,6 +28,8 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
  extern spinlock_t pgd_lock;
  extern struct list_head pgd_list;
  
+extern struct mm_struct *pgd_page_get_mm(struct page *page);
+
  #ifdef CONFIG_PARAVIRT
  #include <asm/paravirt.h>
  #else  /* !CONFIG_PARAVIRT */
@@ -603,6 +605,8 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm,
         pte_update(mm, addr, ptep);
  }
  
+#define flush_tlb_fix_spurious_fault(vma, address)
+
  /*
   * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
   *
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h

index 076052c..f96ac9b 100644 (file)
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -102,6 +102,8 @@ static inline void native_pgd_clear(pgd_t *pgd)
         native_set_pgd(pgd, native_make_pgd(0));
  }
  
+extern void sync_global_pgds(unsigned long start, unsigned long end);
+
  /*
   * Conversion functions: convert a page and protection to a page entry,
   * and a page entry and page directory to the page they refer to.
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h

index 325b7bd..cae9c3c 100644 (file)
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -110,6 +110,8 @@ struct cpuinfo_x86 {
         u16                     phys_proc_id;
         /* Core id: */
         u16                     cpu_core_id;
+       /* Compute unit id */
+       u8                      compute_unit_id;
         /* Index into per_cpu list: */
         u16                     cpu_index;
  #endif
@@ -602,7 +604,7 @@ extern unsigned long                mmu_cr4_features;
  
  static inline void set_in_cr4(unsigned long mask)
  {
-       unsigned cr4;
+       unsigned long cr4;
  
         mmu_cr4_features |= mask;
         cr4 = read_cr4();
@@ -612,7 +614,7 @@ static inline void set_in_cr4(unsigned long mask)
  
  static inline void clear_in_cr4(unsigned long mask)
  {
-       unsigned cr4;
+       unsigned long cr4;
  
         mmu_cr4_features &= ~mask;
         cr4 = read_cr4();
@@ -764,29 +766,6 @@ extern unsigned long               idle_halt;
  extern unsigned long           idle_nomwait;
  extern bool                    c1e_detected;
  
-/*
- * on systems with caches, caches must be flashed as the absolute
- * last instruction before going into a suspended halt.  Otherwise,
- * dirty data can linger in the cache and become stale on resume,
- * leading to strange errors.
- *
- * perform a variety of operations to guarantee that the compiler
- * will not reorder instructions.  wbinvd itself is serializing
- * so the processor will not reorder.
- *
- * Systems without cache can just go into halt.
- */
-static inline void wbinvd_halt(void)
-{
-       mb();
-       /* check for clflush to determine if wbinvd is legal */
-       if (cpu_has_clflush)
-               asm volatile("cli; wbinvd; 1: hlt; jmp 1b" : : : "memory");
-       else
-               while (1)
-                       halt();
-}
-
  extern void enable_sep_cpu(void);
  extern int sysenter_setup(void);
  
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h

index ef292c7..d6763b1 100644 (file)
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -93,6 +93,11 @@ void *extend_brk(size_t size, size_t align);
                         : : "i" (sz));                                  \
         }
  
+/*
+ * Helper for reserving space for arrays of things.
+ *
+ * Declares "type *name" and passes sizeof(type) * entries as the size to
+ * RESERVE_BRK() under the same name.  The pointer is only declared here,
+ * it is not assigned.  "entries" is parenthesized so that an expression
+ * argument (e.g. "a + b") is multiplied as a whole.
+ */
+#define RESERVE_BRK_ARRAY(type, name, entries)         \
+       type *name;                                     \
+       RESERVE_BRK(name, sizeof(type) * (entries))
+
  #ifdef __i386__
  
  void __init i386_start_kernel(void);
diff --git a/arch/x86/include/asm/vmi.h b/arch/x86/include/asm/vmi.h

deleted file mode 100644 (file)

index 61e08c0..0000000
--- a/arch/x86/include/asm/vmi.h
+++ /dev/null
@@ -1,269 +0,0 @@
-/*
- * VMI interface definition
- *
- * Copyright (C) 2005, VMware, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Maintained by: Zachary Amsden zach@vmware.com
- *
- */
-#include <linux/types.h>
-
-/*
- *---------------------------------------------------------------------
- *
- *  VMI Option ROM API
- *
- *---------------------------------------------------------------------
- */
-#define VMI_SIGNATURE 0x696d5663   /* "cVmi" */
-
-#define PCI_VENDOR_ID_VMWARE            0x15AD
-#define PCI_DEVICE_ID_VMWARE_VMI        0x0801
-
-/*
- * We use two version numbers for compatibility, with the major
- * number signifying interface breakages, and the minor number
- * interface extensions.
- */
-#define VMI_API_REV_MAJOR       3
-#define VMI_API_REV_MINOR       0
-
-#define VMI_CALL_CPUID                 0
-#define VMI_CALL_WRMSR                 1
-#define VMI_CALL_RDMSR                 2
-#define VMI_CALL_SetGDT                        3
-#define VMI_CALL_SetLDT                        4
-#define VMI_CALL_SetIDT                        5
-#define VMI_CALL_SetTR                 6
-#define VMI_CALL_GetGDT                        7
-#define VMI_CALL_GetLDT                        8
-#define VMI_CALL_GetIDT                        9
-#define VMI_CALL_GetTR                 10
-#define VMI_CALL_WriteGDTEntry         11
-#define VMI_CALL_WriteLDTEntry         12
-#define VMI_CALL_WriteIDTEntry         13
-#define VMI_CALL_UpdateKernelStack     14
-#define VMI_CALL_SetCR0                        15
-#define VMI_CALL_SetCR2                        16
-#define VMI_CALL_SetCR3                        17
-#define VMI_CALL_SetCR4                        18
-#define VMI_CALL_GetCR0                        19
-#define VMI_CALL_GetCR2                        20
-#define VMI_CALL_GetCR3                        21
-#define VMI_CALL_GetCR4                        22
-#define VMI_CALL_WBINVD                        23
-#define VMI_CALL_SetDR                 24
-#define VMI_CALL_GetDR                 25
-#define VMI_CALL_RDPMC                 26
-#define VMI_CALL_RDTSC                 27
-#define VMI_CALL_CLTS                  28
-#define VMI_CALL_EnableInterrupts      29
-#define VMI_CALL_DisableInterrupts     30
-#define VMI_CALL_GetInterruptMask      31
-#define VMI_CALL_SetInterruptMask      32
-#define VMI_CALL_IRET                  33
-#define VMI_CALL_SYSEXIT               34
-#define VMI_CALL_Halt                  35
-#define VMI_CALL_Reboot                        36
-#define VMI_CALL_Shutdown              37
-#define VMI_CALL_SetPxE                        38
-#define VMI_CALL_SetPxELong            39
-#define VMI_CALL_UpdatePxE             40
-#define VMI_CALL_UpdatePxELong         41
-#define VMI_CALL_MachineToPhysical     42
-#define VMI_CALL_PhysicalToMachine     43
-#define VMI_CALL_AllocatePage          44
-#define VMI_CALL_ReleasePage           45
-#define VMI_CALL_InvalPage             46
-#define VMI_CALL_FlushTLB              47
-#define VMI_CALL_SetLinearMapping      48
-
-#define VMI_CALL_SetIOPLMask           61
-#define VMI_CALL_SetInitialAPState     62
-#define VMI_CALL_APICWrite             63
-#define VMI_CALL_APICRead              64
-#define VMI_CALL_IODelay               65
-#define VMI_CALL_SetLazyMode           73
-
-/*
- *---------------------------------------------------------------------
- *
- * MMU operation flags
- *
- *---------------------------------------------------------------------
- */
-
-/* Flags used by VMI_{Allocate|Release}Page call */
-#define VMI_PAGE_PAE             0x10  /* Allocate PAE shadow */
-#define VMI_PAGE_CLONE           0x20  /* Clone from another shadow */
-#define VMI_PAGE_ZEROED          0x40  /* Page is pre-zeroed */
-
-
-/* Flags shared by Allocate|Release Page and PTE updates */
-#define VMI_PAGE_PT              0x01
-#define VMI_PAGE_PD              0x02
-#define VMI_PAGE_PDP             0x04
-#define VMI_PAGE_PML4            0x08
-
-#define VMI_PAGE_NORMAL          0x00 /* for debugging */
-
-/* Flags used by PTE updates */
-#define VMI_PAGE_CURRENT_AS      0x10 /* implies VMI_PAGE_VA_MASK is valid */
-#define VMI_PAGE_DEFER           0x20 /* may queue update until TLB inval */
-#define VMI_PAGE_VA_MASK         0xfffff000
-
-#ifdef CONFIG_X86_PAE
-#define VMI_PAGE_L1            (VMI_PAGE_PT | VMI_PAGE_PAE | VMI_PAGE_ZEROED)
-#define VMI_PAGE_L2            (VMI_PAGE_PD | VMI_PAGE_PAE | VMI_PAGE_ZEROED)
-#else
-#define VMI_PAGE_L1            (VMI_PAGE_PT | VMI_PAGE_ZEROED)
-#define VMI_PAGE_L2            (VMI_PAGE_PD | VMI_PAGE_ZEROED)
-#endif
-
-/* Flags used by VMI_FlushTLB call */
-#define VMI_FLUSH_TLB            0x01
-#define VMI_FLUSH_GLOBAL         0x02
-
-/*
- *---------------------------------------------------------------------
- *
- *  VMI relocation definitions for ROM call get_reloc
- *
- *---------------------------------------------------------------------
- */
-
-/* VMI Relocation types */
-#define VMI_RELOCATION_NONE     0
-#define VMI_RELOCATION_CALL_REL 1
-#define VMI_RELOCATION_JUMP_REL 2
-#define VMI_RELOCATION_NOP     3
-
-#ifndef __ASSEMBLY__
-struct vmi_relocation_info {
-       unsigned char           *eip;
-       unsigned char           type;
-       unsigned char           reserved[3];
-};
-#endif
-
-
-/*
- *---------------------------------------------------------------------
- *
- *  Generic ROM structures and definitions
- *
- *---------------------------------------------------------------------
- */
-
-#ifndef __ASSEMBLY__
-
-struct vrom_header {
-       u16     rom_signature;  /* option ROM signature */
-       u8      rom_length;     /* ROM length in 512 byte chunks */
-       u8      rom_entry[4];   /* 16-bit code entry point */
-       u8      rom_pad0;       /* 4-byte align pad */
-       u32     vrom_signature; /* VROM identification signature */
-       u8      api_version_min;/* Minor version of API */
-       u8      api_version_maj;/* Major version of API */
-       u8      jump_slots;     /* Number of jump slots */
-       u8      reserved1;      /* Reserved for expansion */
-       u32     virtual_top;    /* Hypervisor virtual address start */
-       u16     reserved2;      /* Reserved for expansion */
-       u16     license_offs;   /* Offset to License string */
-       u16     pci_header_offs;/* Offset to PCI OPROM header */
-       u16     pnp_header_offs;/* Offset to PnP OPROM header */
-       u32     rom_pad3;       /* PnP reserverd / VMI reserved */
-       u8      reserved[96];   /* Reserved for headers */
-       char    vmi_init[8];    /* VMI_Init jump point */
-       char    get_reloc[8];   /* VMI_GetRelocationInfo jump point */
-} __attribute__((packed));
-
-struct pnp_header {
-       char sig[4];
-       char rev;
-       char size;
-       short next;
-       short res;
-       long devID;
-       unsigned short manufacturer_offset;
-       unsigned short product_offset;
-} __attribute__((packed));
-
-struct pci_header {
-       char sig[4];
-       short vendorID;
-       short deviceID;
-       short vpdData;
-       short size;
-       char rev;
-       char class;
-       char subclass;
-       char interface;
-       short chunks;
-       char rom_version_min;
-       char rom_version_maj;
-       char codetype;
-       char lastRom;
-       short reserved;
-} __attribute__((packed));
-
-/* Function prototypes for bootstrapping */
-#ifdef CONFIG_VMI
-extern void vmi_init(void);
-extern void vmi_activate(void);
-extern void vmi_bringup(void);
-#else
-static inline void vmi_init(void) {}
-static inline void vmi_activate(void) {}
-static inline void vmi_bringup(void) {}
-#endif
-
-/* State needed to start an application processor in an SMP system. */
-struct vmi_ap_state {
-       u32 cr0;
-       u32 cr2;
-       u32 cr3;
-       u32 cr4;
-
-       u64 efer;
-
-       u32 eip;
-       u32 eflags;
-       u32 eax;
-       u32 ebx;
-       u32 ecx;
-       u32 edx;
-       u32 esp;
-       u32 ebp;
-       u32 esi;
-       u32 edi;
-       u16 cs;
-       u16 ss;
-       u16 ds;
-       u16 es;
-       u16 fs;
-       u16 gs;
-       u16 ldtr;
-
-       u16 gdtr_limit;
-       u32 gdtr_base;
-       u32 idtr_base;
-       u16 idtr_limit;
-};
-
-#endif
diff --git a/arch/x86/include/asm/vmi_time.h b/arch/x86/include/asm/vmi_time.h

deleted file mode 100644 (file)

index c6e0bee..0000000
--- a/arch/x86/include/asm/vmi_time.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * VMI Time wrappers
- *
- * Copyright (C) 2006, VMware, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send feedback to dhecht@vmware.com
- *
- */
-
-#ifndef _ASM_X86_VMI_TIME_H
-#define _ASM_X86_VMI_TIME_H
-
-/*
- * Raw VMI call indices for timer functions
- */
-#define VMI_CALL_GetCycleFrequency     66
-#define VMI_CALL_GetCycleCounter       67
-#define VMI_CALL_SetAlarm              68
-#define VMI_CALL_CancelAlarm           69
-#define VMI_CALL_GetWallclockTime      70
-#define VMI_CALL_WallclockUpdated      71
-
-/* Cached VMI timer operations */
-extern struct vmi_timer_ops {
-       u64 (*get_cycle_frequency)(void);
-       u64 (*get_cycle_counter)(int);
-       u64 (*get_wallclock)(void);
-       int (*wallclock_updated)(void);
-       void (*set_alarm)(u32 flags, u64 expiry, u64 period);
-       void (*cancel_alarm)(u32 flags);
-} vmi_timer_ops;
-
-/* Prototypes */
-extern void __init vmi_time_init(void);
-extern unsigned long vmi_get_wallclock(void);
-extern int vmi_set_wallclock(unsigned long now);
-extern unsigned long long vmi_sched_clock(void);
-extern unsigned long vmi_tsc_khz(void);
-
-#ifdef CONFIG_X86_LOCAL_APIC
-extern void __devinit vmi_time_bsp_init(void);
-extern void __devinit vmi_time_ap_init(void);
-#endif
-
-/*
- * When run under a hypervisor, a vcpu is always in one of three states:
- * running, halted, or ready.  The vcpu is in the 'running' state if it
- * is executing.  When the vcpu executes the halt interface, the vcpu
- * enters the 'halted' state and remains halted until there is some work
- * pending for the vcpu (e.g. an alarm expires, host I/O completes on
- * behalf of virtual I/O).  At this point, the vcpu enters the 'ready'
- * state (waiting for the hypervisor to reschedule it).  Finally, at any
- * time when the vcpu is not in the 'running' state nor the 'halted'
- * state, it is in the 'ready' state.
- *
- * Real time is advances while the vcpu is 'running', 'ready', or
- * 'halted'.  Stolen time is the time in which the vcpu is in the
- * 'ready' state.  Available time is the remaining time -- the vcpu is
- * either 'running' or 'halted'.
- *
- * All three views of time are accessible through the VMI cycle
- * counters.
- */
-
-/* The cycle counters. */
-#define VMI_CYCLES_REAL         0
-#define VMI_CYCLES_AVAILABLE    1
-#define VMI_CYCLES_STOLEN       2
-
-/* The alarm interface 'flags' bits */
-#define VMI_ALARM_COUNTERS      2
-
-#define VMI_ALARM_COUNTER_MASK  0x000000ff
-
-#define VMI_ALARM_WIRED_IRQ0    0x00000000
-#define VMI_ALARM_WIRED_LVTT    0x00010000
-
-#define VMI_ALARM_IS_ONESHOT    0x00000000
-#define VMI_ALARM_IS_PERIODIC   0x00000100
-
-#define CONFIG_VMI_ALARM_HZ    100
-
-#endif /* _ASM_X86_VMI_TIME_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile

index 7490bf8..80a93dc 100644 (file)
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -86,15 +86,15 @@ obj-$(CONFIG_DOUBLEFAULT)   += doublefault_32.o
  obj-$(CONFIG_KGDB)             += kgdb.o
  obj-$(CONFIG_VM86)             += vm86_32.o
  obj-$(CONFIG_EARLY_PRINTK)     += early_printk.o
+obj-$(CONFIG_EARLY_PRINTK_MRST)        += early_printk_mrst.o
  
  obj-$(CONFIG_HPET_TIMER)       += hpet.o
  obj-$(CONFIG_APB_TIMER)                += apb_timer.o
  
-obj-$(CONFIG_K8_NB)            += k8.o
+obj-$(CONFIG_AMD_NB)           += amd_nb.o
  obj-$(CONFIG_DEBUG_RODATA_TEST)        += test_rodata.o
  obj-$(CONFIG_DEBUG_NX_TEST)    += test_nx.o
  
-obj-$(CONFIG_VMI)              += vmi_32.o vmiclock_32.o
  obj-$(CONFIG_KVM_GUEST)                += kvm.o
  obj-$(CONFIG_KVM_CLOCK)                += kvmclock.o
  obj-$(CONFIG_PARAVIRT)         += paravirt.o paravirt_patch_$(BITS).o
@@ -107,6 +107,7 @@ obj-$(CONFIG_SCx200)                += scx200.o
  scx200-y                       += scx200_32.o
  
  obj-$(CONFIG_OLPC)             += olpc.o
+obj-$(CONFIG_OLPC_XO1)         += olpc-xo1.o
  obj-$(CONFIG_OLPC_OPENFIRMWARE)        += olpc_ofw.o
  obj-$(CONFIG_X86_MRST)         += mrst.o
  
@@ -123,7 +124,6 @@ obj-$(CONFIG_SWIOTLB)                       += pci-swiotlb.o
  # 64 bit specific files
  ifeq ($(CONFIG_X86_64),y)
         obj-$(CONFIG_X86_UV)            += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o
-       obj-$(CONFIG_X86_PM_TIMER)      += pmtimer_64.o
         obj-$(CONFIG_AUDIT)             += audit_64.o
  
         obj-$(CONFIG_GART_IOMMU)        += pci-gart_64.o aperture_64.o
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c

index fb16f17..5812404 100644 (file)
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -13,6 +13,7 @@
  
  #include <acpi/processor.h>
  #include <asm/acpi.h>
+#include <asm/mwait.h>
  
  /*
   * Initialize bm_flags based on the CPU cache properties
@@ -65,16 +66,6 @@ static struct cstate_entry __percpu *cpu_cstate_entry;       /* per CPU ptr */
  
  static short mwait_supported[ACPI_PROCESSOR_MAX_POWER];
  
-#define MWAIT_SUBSTATE_MASK    (0xf)
-#define MWAIT_CSTATE_MASK      (0xf)
-#define MWAIT_SUBSTATE_SIZE    (4)
-
-#define CPUID_MWAIT_LEAF (5)
-#define CPUID5_ECX_EXTENSIONS_SUPPORTED (0x1)
-#define CPUID5_ECX_INTERRUPT_BREAK     (0x2)
-
-#define MWAIT_ECX_INTERRUPT_BREAK      (0x1)
-
  #define NATIVE_CSTATE_BEYOND_HALT      (2)
  
  static long acpi_processor_ffh_cstate_probe_cpu(void *_cx)
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c

index 679b645..d2fdb08 100644 (file)
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1,5 +1,5 @@
  /*
- * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
+ * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
   * Author: Joerg Roedel <joerg.roedel@amd.com>
   *         Leo Duran <leo.duran@amd.com>
   *
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c

index 5a170cb..3cb482e 100644 (file)
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -1,5 +1,5 @@
  /*
- * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
+ * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
   * Author: Joerg Roedel <joerg.roedel@amd.com>
   *         Leo Duran <leo.duran@amd.com>
   *
@@ -194,6 +194,39 @@ static inline unsigned long tbl_size(int entry_size)
         return 1UL << shift;
  }
  
+/* Access to l1 and l2 indexed register spaces */
+
+/*
+ * Read one register of an L1 indirect register block: write the selector
+ * (l1 shifted to bit 16 and up, OR-ed with the register address) to config
+ * offset 0xf8, then read the value from config offset 0xfc.
+ */
+static u32 iommu_read_l1(struct amd_iommu *iommu, u16 l1, u8 address)
+{
+       u32 data;
+       u32 select = address | l1 << 16;
+
+       pci_write_config_dword(iommu->dev, 0xf8, select);
+       pci_read_config_dword(iommu->dev, 0xfc, &data);
+
+       return data;
+}
+
+/*
+ * Write one register of an L1 indirect register block.
+ *
+ * The selector (l1 shifted to bit 16 and up, OR-ed with the register
+ * address) is written to config offset 0xf8 with bit 31 set, the value is
+ * written to config offset 0xfc, and the selector is then rewritten with
+ * bit 31 clear.  NOTE(review): bit 31 looks like a write-enable bit --
+ * confirm against the chipset documentation.
+ *
+ * The selector is built as u32 so that setting bit 31 never shifts into
+ * the sign bit of a signed int.
+ */
+static void iommu_write_l1(struct amd_iommu *iommu, u16 l1, u8 address, u32 val)
+{
+       u32 select = address | (u32)l1 << 16;
+
+       pci_write_config_dword(iommu->dev, 0xf8, select | 1U << 31);
+       pci_write_config_dword(iommu->dev, 0xfc, val);
+       pci_write_config_dword(iommu->dev, 0xf8, select);
+}
+
+/*
+ * Read one L2 indirect register: write the register address to config
+ * offset 0xf0, then read the value from config offset 0xf4.
+ */
+static u32 iommu_read_l2(struct amd_iommu *iommu, u8 address)
+{
+       u32 data;
+
+       pci_write_config_dword(iommu->dev, 0xf0, address);
+       pci_read_config_dword(iommu->dev, 0xf4, &data);
+
+       return data;
+}
+
+/*
+ * Write one L2 indirect register: write the register address with bit 8
+ * set to config offset 0xf0, then write the value to config offset 0xf4.
+ */
+static void iommu_write_l2(struct amd_iommu *iommu, u8 address, u32 val)
+{
+       u32 select = address | 1 << 8;
+
+       pci_write_config_dword(iommu->dev, 0xf0, select);
+       pci_write_config_dword(iommu->dev, 0xf4, val);
+}
+
  /****************************************************************************
   *
   * AMD IOMMU MMIO register space handling functions
@@ -619,6 +652,7 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu)
  {
         int cap_ptr = iommu->cap_ptr;
         u32 range, misc;
+       int i, j;
  
         pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET,
                               &iommu->cap);
@@ -633,12 +667,29 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu)
                                         MMIO_GET_LD(range));
         iommu->evt_msi_num = MMIO_MSI_NUM(misc);
  
-       if (is_rd890_iommu(iommu->dev)) {
-               pci_read_config_dword(iommu->dev, 0xf0, &iommu->cache_cfg[0]);
-               pci_read_config_dword(iommu->dev, 0xf4, &iommu->cache_cfg[1]);
-               pci_read_config_dword(iommu->dev, 0xf8, &iommu->cache_cfg[2]);
-               pci_read_config_dword(iommu->dev, 0xfc, &iommu->cache_cfg[3]);
-       }
+       if (!is_rd890_iommu(iommu->dev))
+               return;
+
+       /*
+        * Some rd890 systems may not be fully reconfigured by the BIOS, so
+        * it's necessary for us to store this information so it can be
+        * reprogrammed on resume
+        */
+
+       pci_read_config_dword(iommu->dev, iommu->cap_ptr + 4,
+                             &iommu->stored_addr_lo);
+       pci_read_config_dword(iommu->dev, iommu->cap_ptr + 8,
+                             &iommu->stored_addr_hi);
+
+       /* Low bit locks writes to configuration space */
+       iommu->stored_addr_lo &= ~1;
+
+       for (i = 0; i < 6; i++)
+               for (j = 0; j < 0x12; j++)
+                       iommu->stored_l1[i][j] = iommu_read_l1(iommu, i, j);
+
+       for (i = 0; i < 0x83; i++)
+               iommu->stored_l2[i] = iommu_read_l2(iommu, i);
  }
  
  /*
@@ -1127,14 +1178,53 @@ static void iommu_init_flags(struct amd_iommu *iommu)
         iommu_feature_enable(iommu, CONTROL_COHERENT_EN);
  }
  
-static void iommu_apply_quirks(struct amd_iommu *iommu)
+static void iommu_apply_resume_quirks(struct amd_iommu *iommu)
  {
-       if (is_rd890_iommu(iommu->dev)) {
-               pci_write_config_dword(iommu->dev, 0xf0, iommu->cache_cfg[0]);
-               pci_write_config_dword(iommu->dev, 0xf4, iommu->cache_cfg[1]);
-               pci_write_config_dword(iommu->dev, 0xf8, iommu->cache_cfg[2]);
-               pci_write_config_dword(iommu->dev, 0xfc, iommu->cache_cfg[3]);
-       }
+       int i, j;
+       u32 ioc_feature_control;
+       struct pci_dev *pdev = NULL;
+
+       /* RD890 BIOSes may not have completely reconfigured the iommu */
+       if (!is_rd890_iommu(iommu->dev))
+               return;
+
+       /*
+        * First, we need to ensure that the iommu is enabled. This is
+        * controlled by a register in the northbridge
+        */
+       pdev = pci_get_bus_and_slot(iommu->dev->bus->number, PCI_DEVFN(0, 0));
+
+       if (!pdev)
+               return;
+
+       /* Select Northbridge indirect register 0x75 and enable writing */
+       pci_write_config_dword(pdev, 0x60, 0x75 | (1 << 7));
+       pci_read_config_dword(pdev, 0x64, &ioc_feature_control);
+
+       /* Enable the iommu */
+       if (!(ioc_feature_control & 0x1))
+               pci_write_config_dword(pdev, 0x64, ioc_feature_control | 1);
+
+       pci_dev_put(pdev);
+
+       /* Restore the iommu BAR */
+       pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4,
+                              iommu->stored_addr_lo);
+       pci_write_config_dword(iommu->dev, iommu->cap_ptr + 8,
+                              iommu->stored_addr_hi);
+
+       /* Restore the l1 indirect regs for each of the 6 l1s */
+       for (i = 0; i < 6; i++)
+               for (j = 0; j < 0x12; j++)
+                       iommu_write_l1(iommu, i, j, iommu->stored_l1[i][j]);
+
+       /* Restore the l2 indirect regs */
+       for (i = 0; i < 0x83; i++)
+               iommu_write_l2(iommu, i, iommu->stored_l2[i]);
+
+       /* Lock PCI setup registers */
+       pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4,
+                              iommu->stored_addr_lo | 1);
  }
  
  /*
@@ -1147,7 +1237,6 @@ static void enable_iommus(void)
  
         for_each_iommu(iommu) {
                 iommu_disable(iommu);
-               iommu_apply_quirks(iommu);
                 iommu_init_flags(iommu);
                 iommu_set_device_table(iommu);
                 iommu_enable_command_buffer(iommu);
@@ -1173,6 +1262,11 @@ static void disable_iommus(void)
  
  static int amd_iommu_resume(struct sys_device *dev)
  {
+       struct amd_iommu *iommu;
+
+       for_each_iommu(iommu)
+               iommu_apply_resume_quirks(iommu);
+
         /* re-load the hardware */
         enable_iommus();
  
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c

new file mode 100644 (file)

index 0000000..8f6463d
--- /dev/null
+++ b/arch/x86/kernel/amd_nb.c
@@ -0,0 +1,147 @@
+/*
+ * Shared support code for AMD K8 northbridges and derivates.
+ * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2.
+ */
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <asm/amd_nb.h>
+
+/*
+ * One saved value of config register 0x9c per cached northbridge; filled
+ * in by cache_k8_northbridges() when GART is supported and written back
+ * with bit 0 set by k8_flush_garts().
+ */
+static u32 *flush_words;
+
+/*
+ * PCI IDs of the northbridge misc devices handled by this file.  The empty
+ * last entry terminates the table (early_is_k8_nb() stops at vendor == 0).
+ */
+struct pci_device_id k8_nb_ids[] = {
+       { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
+       { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
+       { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_MISC) },
+       {}
+};
+EXPORT_SYMBOL(k8_nb_ids);
+
+/* Cached northbridge state, populated by cache_k8_northbridges() */
+struct k8_northbridge_info k8_northbridges;
+EXPORT_SYMBOL(k8_northbridges);
+
+/*
+ * Continue the PCI device walk after "dev" (NULL starts from the beginning)
+ * and return the next device matching k8_nb_ids, or NULL when the walk is
+ * exhausted.
+ */
+static struct pci_dev *next_k8_northbridge(struct pci_dev *dev)
+{
+       for (;;) {
+               dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
+               if (!dev || pci_match_id(k8_nb_ids, dev))
+                       return dev;
+       }
+}
+
+/*
+ * Enumerate the northbridge misc devices matching k8_nb_ids and cache them
+ * in k8_northbridges: .num receives the count and .nb_misc a NULL-terminated
+ * array of the devices.  When GART is supported, config register 0x9c of
+ * each northbridge is saved in flush_words[].
+ *
+ * Returns 0 on success (also when the cache is already populated or no
+ * northbridge was found) and -ENOMEM if an allocation fails.  On failure
+ * .num is reset to 0 and .nb_misc to NULL, so that no stale count or freed
+ * pointer is left behind and a later call can retry.
+ */
+int cache_k8_northbridges(void)
+{
+       int i;
+       struct pci_dev *dev;
+
+       if (k8_northbridges.num)
+               return 0;
+
+       dev = NULL;
+       while ((dev = next_k8_northbridge(dev)) != NULL)
+               k8_northbridges.num++;
+
+       /* some CPU families (e.g. family 0x11) do not support GART */
+       if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 ||
+           boot_cpu_data.x86 == 0x15)
+               k8_northbridges.gart_supported = 1;
+
+       k8_northbridges.nb_misc = kmalloc((k8_northbridges.num + 1) *
+                                         sizeof(void *), GFP_KERNEL);
+       if (!k8_northbridges.nb_misc)
+               goto out_nomem;
+
+       if (!k8_northbridges.num) {
+               k8_northbridges.nb_misc[0] = NULL;
+               return 0;
+       }
+
+       if (k8_northbridges.gart_supported) {
+               flush_words = kmalloc(k8_northbridges.num * sizeof(u32),
+                                     GFP_KERNEL);
+               if (!flush_words)
+                       goto out_free;
+       }
+
+       dev = NULL;
+       i = 0;
+       while ((dev = next_k8_northbridge(dev)) != NULL) {
+               k8_northbridges.nb_misc[i] = dev;
+               if (k8_northbridges.gart_supported)
+                       pci_read_config_dword(dev, 0x9c, &flush_words[i]);
+               /* advance once per device, not only when GART is supported */
+               i++;
+       }
+       k8_northbridges.nb_misc[i] = NULL;
+       return 0;
+
+out_free:
+       kfree(k8_northbridges.nb_misc);
+       k8_northbridges.nb_misc = NULL;
+out_nomem:
+       k8_northbridges.num = 0;
+       return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(cache_k8_northbridges);
+
+/*
+ * Check whether a raw vendor/device dword (vendor ID in the low 16 bits,
+ * device ID in the high 16 bits) matches an entry of k8_nb_ids.
+ * Subdevice/subvendor are ignored.  Returns 1 on a match, 0 otherwise.
+ */
+int __init early_is_k8_nb(u32 device)
+{
+       const u32 vendor_id = device & 0xffff;
+       const u32 device_id = device >> 16;
+       struct pci_device_id *entry = k8_nb_ids;
+
+       while (entry->vendor) {
+               if (entry->vendor == vendor_id && entry->device == device_id)
+                       return 1;
+               entry++;
+       }
+
+       return 0;
+}
+
+/*
+ * Flush the GART on every cached northbridge: write the saved 0x9c value
+ * with bit 0 set to each one, then poll register 0x9c of each northbridge
+ * until bit 0 reads back clear.  Does nothing when GART is not supported.
+ */
+void k8_flush_garts(void)
+{
+       static DEFINE_SPINLOCK(gart_lock);
+       unsigned long flags;
+       int nb, flushed = 0;
+
+       if (!k8_northbridges.gart_supported)
+               return;
+
+       /*
+        * Serialize AGP and IOMMU callers.  In theory this is not needed,
+        * but the hardware might lose a flush request while another one is
+        * pending, and the operation is expensive enough that the extra
+        * serialization does not matter. -AK
+        */
+       spin_lock_irqsave(&gart_lock, flags);
+
+       /* Kick off the flush on all northbridges first ... */
+       for (nb = 0; nb < k8_northbridges.num; nb++, flushed++)
+               pci_write_config_dword(k8_northbridges.nb_misc[nb], 0x9c,
+                                      flush_words[nb] | 1);
+
+       /* ... then make sure the hardware actually executed each flush */
+       for (nb = 0; nb < k8_northbridges.num; nb++) {
+               u32 status;
+
+               pci_read_config_dword(k8_northbridges.nb_misc[nb],
+                                     0x9c, &status);
+               while (status & 1) {
+                       cpu_relax();
+                       pci_read_config_dword(k8_northbridges.nb_misc[nb],
+                                             0x9c, &status);
+               }
+       }
+
+       spin_unlock_irqrestore(&gart_lock, flags);
+
+       if (flushed == 0)
+               printk("nothing to flush?\n");
+}
+EXPORT_SYMBOL_GPL(k8_flush_garts);
+
+/*
+ * Initcall: populate the northbridge cache at boot and log a notice if
+ * that fails.  Returns whatever cache_k8_northbridges() returned.
+ */
+static __init int init_k8_nbs(void)
+{
+       int ret = cache_k8_northbridges();
+
+       if (ret >= 0)
+               return ret;
+
+       printk(KERN_NOTICE "K8 NB: Cannot enumerate AMD northbridges.\n");
+       return ret;
+}
+
+/* This has to go after the PCI subsystem */
+fs_initcall(init_k8_nbs);
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c

index 8dd7780..92543c7 100644 (file)
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -231,34 +231,6 @@ static void apbt_restart_clocksource(struct clocksource *cs)
         apbt_start_counter(phy_cs_timer_id);
  }
  
-/* Setup IRQ routing via IOAPIC */
-#ifdef CONFIG_SMP
-static void apbt_setup_irq(struct apbt_dev *adev)
-{
-       struct irq_chip *chip;
-       struct irq_desc *desc;
-
-       /* timer0 irq has been setup early */
-       if (adev->irq == 0)
-               return;
-       desc = irq_to_desc(adev->irq);
-       chip = get_irq_chip(adev->irq);
-       disable_irq(adev->irq);
-       desc->status |= IRQ_MOVE_PCNTXT;
-       irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
-       /* APB timer irqs are set up as mp_irqs, timer is edge triggerred */
-       set_irq_chip_and_handler_name(adev->irq, chip, handle_edge_irq, "edge");
-       enable_irq(adev->irq);
-       if (system_state == SYSTEM_BOOTING)
-               if (request_irq(adev->irq, apbt_interrupt_handler,
-                               IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING,
-                               adev->name, adev)) {
-                       printk(KERN_ERR "Failed request IRQ for APBT%d\n",
-                              adev->num);
-               }
-}
-#endif
-
  static void apbt_enable_int(int n)
  {
         unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
@@ -334,6 +306,27 @@ static int __init apbt_clockevent_register(void)
  }
  
  #ifdef CONFIG_SMP
+
+static void apbt_setup_irq(struct apbt_dev *adev)
+{
+       /* timer0's irq was already set up early; leave it alone */
+       if (!adev->irq)
+               return;
+
+       /* Outside of boot just enable the irq */
+       if (system_state != SYSTEM_BOOTING) {
+               enable_irq(adev->irq);
+               return;
+       }
+       irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT);
+       /* APB timer irqs are set up as mp_irqs, timer is edge type */
+       __set_irq_handler(adev->irq, handle_edge_irq, 0, "edge");
+       if (request_irq(adev->irq, apbt_interrupt_handler,
+                       IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING,
+                       adev->name, adev))
+               printk(KERN_ERR "Failed request IRQ for APBT%d\n", adev->num);
+}
+
  /* Should be called with per cpu */
  void apbt_setup_secondary_clock(void)
  {
@@ -343,7 +336,7 @@ void apbt_setup_secondary_clock(void)
  
         /* Don't register boot CPU clockevent */
         cpu = smp_processor_id();
-       if (cpu == boot_cpu_id)
+       if (!cpu)
                 return;
         /*
          * We need to calculate the scaled math multiplication factor for
@@ -389,16 +382,17 @@ static int apbt_cpuhp_notify(struct notifier_block *n,
  
         switch (action & 0xf) {
         case CPU_DEAD:
+               disable_irq(adev->irq);
                 apbt_disable_int(cpu);
-               if (system_state == SYSTEM_RUNNING)
+               if (system_state == SYSTEM_RUNNING) {
                         pr_debug("skipping APBT CPU %lu offline\n", cpu);
-               else if (adev) {
+               } else if (adev) {
                         pr_debug("APBT clockevent for cpu %lu offline\n", cpu);
                         free_irq(adev->irq, adev);
                 }
                 break;
         default:
-               pr_debug(KERN_INFO "APBT notified %lu, no action\n", action);
+               pr_debug("APBT notified %lu, no action\n", action);
         }
         return NOTIFY_OK;
  }
@@ -552,7 +546,7 @@ bad_count:
                 pr_debug("APB CS going back %lx:%lx:%lx ",
                          t2, last_read, t2 - last_read);
  bad_count_x3:
-               pr_debug(KERN_INFO "tripple check enforced\n");
+               pr_debug("triple check enforced\n");
                 t0 = apbt_readl(phy_cs_timer_id,
                                 APBTMR_N_CURRENT_VALUE);
                 udelay(1);
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c

index a2e0caf..377f5db 100644 (file)
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -27,7 +27,7 @@
  #include <asm/gart.h>
  #include <asm/pci-direct.h>
  #include <asm/dma.h>
-#include <asm/k8.h>
+#include <asm/amd_nb.h>
  #include <asm/x86_init.h>
  
  int gart_iommu_aperture;
@@ -307,7 +307,7 @@ void __init early_gart_iommu_check(void)
                                 continue;
  
                         ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
-                       aper_enabled = ctl & AMD64_GARTEN;
+                       aper_enabled = ctl & GARTEN;
                         aper_order = (ctl >> 1) & 7;
                         aper_size = (32 * 1024 * 1024) << aper_order;
                         aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff;
@@ -362,7 +362,7 @@ void __init early_gart_iommu_check(void)
                                 continue;
  
                         ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
-                       ctl &= ~AMD64_GARTEN;
+                       ctl &= ~GARTEN;
                         write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
                 }
         }
@@ -505,8 +505,13 @@ out:
  
         /* Fix up the north bridges */
         for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
-               int bus;
-               int dev_base, dev_limit;
+               int bus, dev_base, dev_limit;
+
+               /*
+                * Don't enable translation yet but enable GART IO and CPU
+                * accesses and set DISTLBWALKPRB since GART table memory is UC.
+                */
+               u32 ctl = DISTLBWALKPRB | aper_order << 1;
  
                 bus = bus_dev_ranges[i].bus;
                 dev_base = bus_dev_ranges[i].dev_base;
@@ -515,10 +520,7 @@ out:
                         if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
                                 continue;
  
-                       /* Don't enable translation yet. That is done later.
-                          Assume this BIOS didn't initialise the GART so
-                          just overwrite all previous bits */
-                       write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, aper_order << 1);
+                       write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
                         write_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE, aper_alloc >> 25);
                 }
         }
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c

index e3b534c..850657d 100644 (file)
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -52,6 +52,7 @@
  #include <asm/mce.h>
  #include <asm/kvm_para.h>
  #include <asm/tsc.h>
+#include <asm/atomic.h>
  
  unsigned int num_processors;
  
@@ -370,38 +371,87 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
  }
  
  /*
- * Setup extended LVT, AMD specific (K8, family 10h)
+ * Setup extended LVT, AMD specific
   *
- * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and
- * MCE interrupts are supported. Thus MCE offset must be set to 0.
+ * Software should use the LVT offsets the BIOS provides.  The offsets
+ * are determined by the subsystems using it like those for MCE
+ * threshold or IBS.  On K8 only offset 0 (APIC500) and MCE interrupts
+ * are supported. Beginning with family 10h at least 4 offsets are
+ * available.
   *
- * If mask=1, the LVT entry does not generate interrupts while mask=0
- * enables the vector. See also the BKDGs.
+ * Since the offsets must be consistent for all cores, we keep track
+ * of the LVT offsets in software and reserve the offset for the same
+ * vector also to be used on other cores. An offset is freed by
+ * setting the entry to APIC_EILVT_MASKED.
+ *
+ * If the BIOS is right, there should be no conflicts. Otherwise a
+ * "[Firmware Bug]: ..." error message is generated. However, if
+ * software does not properly determine the offsets, it is not
+ * necessarily a BIOS bug.
   */
  
-#define APIC_EILVT_LVTOFF_MCE 0
-#define APIC_EILVT_LVTOFF_IBS 1
+static atomic_t eilvt_offsets[APIC_EILVT_NR_MAX];
  
-static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
+static inline int eilvt_entry_is_changeable(unsigned int old, unsigned int new)
  {
-       unsigned long reg = (lvt_off << 4) + APIC_EILVTn(0);
-       unsigned int  v   = (mask << 16) | (msg_type << 8) | vector;
-
-       apic_write(reg, v);
+       return (old & APIC_EILVT_MASKED)
+               || (new == APIC_EILVT_MASKED)
+               || ((new & ~APIC_EILVT_MASKED) == old);
  }
  
-u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask)
+static unsigned int reserve_eilvt_offset(int offset, unsigned int new)
  {
-       setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask);
-       return APIC_EILVT_LVTOFF_MCE;
+       unsigned int rsvd;                      /* 0: uninitialized */
+
+       if (offset >= APIC_EILVT_NR_MAX)
+               return ~0;
+
+       rsvd = atomic_read(&eilvt_offsets[offset]) & ~APIC_EILVT_MASKED;
+       do {
+               if (rsvd &&
+                   !eilvt_entry_is_changeable(rsvd, new))
+                       /* may not change if vectors are different */
+                       return rsvd;
+               rsvd = atomic_cmpxchg(&eilvt_offsets[offset], rsvd, new);
+       } while (rsvd != new);
+
+       return new;
  }
  
-u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask)
+/*
+ * If mask=1, the LVT entry does not generate interrupts while mask=0
+ * enables the vector. See also the BKDGs.
+ */
+
+int setup_APIC_eilvt(u8 offset, u8 vector, u8 msg_type, u8 mask)
  {
-       setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask);
-       return APIC_EILVT_LVTOFF_IBS;
+       unsigned long reg = APIC_EILVTn(offset);
+       unsigned int new, old, reserved;
+
+       new = (mask << 16) | (msg_type << 8) | vector;
+       old = apic_read(reg);
+       reserved = reserve_eilvt_offset(offset, new);
+
+       if (reserved != new) {
+               pr_err(FW_BUG "cpu %d, try to setup vector 0x%x, but "
+                      "vector 0x%x was already reserved by another core, "
+                      "APIC%lX=0x%x\n",
+                      smp_processor_id(), new, reserved, reg, old);
+               return -EINVAL;
+       }
+
+       if (!eilvt_entry_is_changeable(old, new)) {
+               pr_err(FW_BUG "cpu %d, try to setup vector 0x%x but "
+                      "register already in use, APIC%lX=0x%x\n",
+                      smp_processor_id(), new, reg, old);
+               return -EBUSY;
+       }
+
+       apic_write(reg, new);
+
+       return 0;
  }
-EXPORT_SYMBOL_GPL(setup_APIC_eilvt_ibs);
+EXPORT_SYMBOL_GPL(setup_APIC_eilvt);
  
  /*
   * Program the next event, relative to now
@@ -1665,10 +1715,7 @@ int __init APIC_init_uniprocessor(void)
         }
  #endif
  
-#ifndef CONFIG_SMP
-       enable_IR_x2apic();
         default_setup_apic_routing();
-#endif
  
         verify_local_APIC();
         connect_bsp_APIC();
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c

index 5c5b8f3..8ae808d 100644 (file)
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -131,13 +131,9 @@ struct irq_pin_list {
         struct irq_pin_list *next;
  };
  
-static struct irq_pin_list *get_one_free_irq_2_pin(int node)
+static struct irq_pin_list *alloc_irq_pin_list(int node)
  {
-       struct irq_pin_list *pin;
-
-       pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node);
-
-       return pin;
+       return kzalloc_node(sizeof(struct irq_pin_list), GFP_KERNEL, node);
  }
  
  /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
@@ -150,10 +146,7 @@ static struct irq_cfg irq_cfgx[NR_IRQS];
  int __init arch_early_irq_init(void)
  {
         struct irq_cfg *cfg;
-       struct irq_desc *desc;
-       int count;
-       int node;
-       int i;
+       int count, node, i;
  
         if (!legacy_pic->nr_legacy_irqs) {
                 nr_irqs_gsi = 0;
@@ -162,13 +155,15 @@ int __init arch_early_irq_init(void)
  
         cfg = irq_cfgx;
         count = ARRAY_SIZE(irq_cfgx);
-       node= cpu_to_node(boot_cpu_id);
+       node = cpu_to_node(0);
+
+       /* Make sure the legacy interrupts are marked in the bitmap */
+       irq_reserve_irqs(0, legacy_pic->nr_legacy_irqs);
  
         for (i = 0; i < count; i++) {
-               desc = irq_to_desc(i);
-               desc->chip_data = &cfg[i];
-               zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node);
-               zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node);
+               set_irq_chip_data(i, &cfg[i]);
+               zalloc_cpumask_var_node(&cfg[i].domain, GFP_KERNEL, node);
+               zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_KERNEL, node);
                 /*
                  * For legacy IRQ's, start with assigning irq0 to irq15 to
                  * IRQ0_VECTOR to IRQ15_VECTOR on cpu 0.
@@ -183,170 +178,88 @@ int __init arch_early_irq_init(void)
  }
  
  #ifdef CONFIG_SPARSE_IRQ
-struct irq_cfg *irq_cfg(unsigned int irq)
+static struct irq_cfg *irq_cfg(unsigned int irq)
  {
-       struct irq_cfg *cfg = NULL;
-       struct irq_desc *desc;
-
-       desc = irq_to_desc(irq);
-       if (desc)
-               cfg = desc->chip_data;
-
-       return cfg;
+       return get_irq_chip_data(irq);
  }
  
-static struct irq_cfg *get_one_free_irq_cfg(int node)
+static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node)
  {
         struct irq_cfg *cfg;
  
-       cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node);
-       if (cfg) {
-               if (!zalloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) {
-                       kfree(cfg);
-                       cfg = NULL;
-               } else if (!zalloc_cpumask_var_node(&cfg->old_domain,
-                                                         GFP_ATOMIC, node)) {
-                       free_cpumask_var(cfg->domain);
-                       kfree(cfg);
-                       cfg = NULL;
-               }
-       }
-
+       cfg = kzalloc_node(sizeof(*cfg), GFP_KERNEL, node);
+       if (!cfg)
+               return NULL;
+       if (!zalloc_cpumask_var_node(&cfg->domain, GFP_KERNEL, node))
+               goto out_cfg;
+       if (!zalloc_cpumask_var_node(&cfg->old_domain, GFP_KERNEL, node))
+               goto out_domain;
         return cfg;
+out_domain:
+       free_cpumask_var(cfg->domain);
+out_cfg:
+       kfree(cfg);
+       return NULL;
  }
  
-int arch_init_chip_data(struct irq_desc *desc, int node)
-{
-       struct irq_cfg *cfg;
-
-       cfg = desc->chip_data;
-       if (!cfg) {
-               desc->chip_data = get_one_free_irq_cfg(node);
-               if (!desc->chip_data) {
-                       printk(KERN_ERR "can not alloc irq_cfg\n");
-                       BUG_ON(1);
-               }
-       }
-
-       return 0;
-}
-
-/* for move_irq_desc */
-static void
-init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int node)
+static void free_irq_cfg(unsigned int at, struct irq_cfg *cfg)
  {
-       struct irq_pin_list *old_entry, *head, *tail, *entry;
-
-       cfg->irq_2_pin = NULL;
-       old_entry = old_cfg->irq_2_pin;
-       if (!old_entry)
-               return;
-
-       entry = get_one_free_irq_2_pin(node);
-       if (!entry)
+       if (!cfg)
                 return;
+       set_irq_chip_data(at, NULL);
+       free_cpumask_var(cfg->domain);
+       free_cpumask_var(cfg->old_domain);
+       kfree(cfg);
+}
  
-       entry->apic     = old_entry->apic;
-       entry->pin      = old_entry->pin;
-       head            = entry;
-       tail            = entry;
-       old_entry       = old_entry->next;
-       while (old_entry) {
-               entry = get_one_free_irq_2_pin(node);
-               if (!entry) {
-                       entry = head;
-                       while (entry) {
-                               head = entry->next;
-                               kfree(entry);
-                               entry = head;
-                       }
-                       /* still use the old one */
-                       return;
-               }
-               entry->apic     = old_entry->apic;
-               entry->pin      = old_entry->pin;
-               tail->next      = entry;
-               tail            = entry;
-               old_entry       = old_entry->next;
-       }
+#else
  
-       tail->next = NULL;
-       cfg->irq_2_pin = head;
+struct irq_cfg *irq_cfg(unsigned int irq)
+{
+       return irq < nr_irqs ? irq_cfgx + irq : NULL;
  }
  
-static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg)
+static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node)
  {
-       struct irq_pin_list *entry, *next;
-
-       if (old_cfg->irq_2_pin == cfg->irq_2_pin)
-               return;
+       return irq_cfgx + irq;
+}
  
-       entry = old_cfg->irq_2_pin;
+static inline void free_irq_cfg(unsigned int at, struct irq_cfg *cfg) { }
  
-       while (entry) {
-               next = entry->next;
-               kfree(entry);
-               entry = next;
-       }
-       old_cfg->irq_2_pin = NULL;
-}
+#endif
  
-void arch_init_copy_chip_data(struct irq_desc *old_desc,
-                                struct irq_desc *desc, int node)
+static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node)
  {
+       int res = irq_alloc_desc_at(at, node);
         struct irq_cfg *cfg;
-       struct irq_cfg *old_cfg;
-
-       cfg = get_one_free_irq_cfg(node);
  
-       if (!cfg)
-               return;
-
-       desc->chip_data = cfg;
-
-       old_cfg = old_desc->chip_data;
-
-       cfg->vector = old_cfg->vector;
-       cfg->move_in_progress = old_cfg->move_in_progress;
-       cpumask_copy(cfg->domain, old_cfg->domain);
-       cpumask_copy(cfg->old_domain, old_cfg->old_domain);
-
-       init_copy_irq_2_pin(old_cfg, cfg, node);
-}
+       if (res < 0) {
+               if (res != -EEXIST)
+                       return NULL;
+               cfg = get_irq_chip_data(at);
+               if (cfg)
+                       return cfg;
+       }
  
-static void free_irq_cfg(struct irq_cfg *cfg)
-{
-       free_cpumask_var(cfg->domain);
-       free_cpumask_var(cfg->old_domain);
-       kfree(cfg);
+       cfg = alloc_irq_cfg(at, node);
+       if (cfg)
+               set_irq_chip_data(at, cfg);
+       else
+               irq_free_desc(at);
+       return cfg;
  }
  
-void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc)
+static int alloc_irq_from(unsigned int from, int node)
  {
-       struct irq_cfg *old_cfg, *cfg;
-
-       old_cfg = old_desc->chip_data;
-       cfg = desc->chip_data;
-
-       if (old_cfg == cfg)
-               return;
-
-       if (old_cfg) {
-               free_irq_2_pin(old_cfg, cfg);
-               free_irq_cfg(old_cfg);
-               old_desc->chip_data = NULL;
-       }
+       return irq_alloc_desc_from(from, node);
  }
-/* end for move_irq_desc */
  
-#else
-struct irq_cfg *irq_cfg(unsigned int irq)
+static void free_irq_at(unsigned int at, struct irq_cfg *cfg)
  {
-       return irq < nr_irqs ? irq_cfgx + irq : NULL;
+       free_irq_cfg(at, cfg);
+       irq_free_desc(at);
  }
  
-#endif
-
  struct io_apic {
         unsigned int index;
         unsigned int unused[3];
@@ -451,7 +364,7 @@ __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
         io_apic_write(apic, 0x10 + 2*pin, eu.w1);
  }
  
-void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
+static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
  {
         unsigned long flags;
         raw_spin_lock_irqsave(&ioapic_lock, flags);
@@ -481,7 +394,7 @@ static void ioapic_mask_entry(int apic, int pin)
   * fast in the common case, and fast for shared ISA-space IRQs.
   */
  static int
-add_pin_to_irq_node_nopanic(struct irq_cfg *cfg, int node, int apic, int pin)
+__add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
  {
         struct irq_pin_list **last, *entry;
  
@@ -493,7 +406,7 @@ add_pin_to_irq_node_nopanic(struct irq_cfg *cfg, int node, int apic, int pin)
                 last = &entry->next;
         }
  
-       entry = get_one_free_irq_2_pin(node);
+       entry = alloc_irq_pin_list(node);
         if (!entry) {
                 printk(KERN_ERR "can not alloc irq_pin_list (%d,%d,%d)\n",
                                 node, apic, pin);
@@ -508,7 +421,7 @@ add_pin_to_irq_node_nopanic(struct irq_cfg *cfg, int node, int apic, int pin)
  
  static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
  {
-       if (add_pin_to_irq_node_nopanic(cfg, node, apic, pin))
+       if (__add_pin_to_irq_node(cfg, node, apic, pin))
                 panic("IO-APIC: failed to add irq-pin. Can not proceed\n");
  }
  
@@ -571,11 +484,6 @@ static void __unmask_and_level_IO_APIC_irq(struct irq_pin_list *entry)
                              IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
  }
  
-static void __unmask_IO_APIC_irq(struct irq_cfg *cfg)
-{
-       io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);
-}
-
  static void io_apic_sync(struct irq_pin_list *entry)
  {
         /*
@@ -587,44 +495,37 @@ static void io_apic_sync(struct irq_pin_list *entry)
         readl(&io_apic->data);
  }
  
-static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
+static void mask_ioapic(struct irq_cfg *cfg)
  {
+       unsigned long flags;
+
+       raw_spin_lock_irqsave(&ioapic_lock, flags);
         io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
+       raw_spin_unlock_irqrestore(&ioapic_lock, flags);
  }
  
-static void mask_IO_APIC_irq_desc(struct irq_desc *desc)
+static void mask_ioapic_irq(struct irq_data *data)
  {
-       struct irq_cfg *cfg = desc->chip_data;
-       unsigned long flags;
-
-       BUG_ON(!cfg);
+       mask_ioapic(data->chip_data);
+}
  
-       raw_spin_lock_irqsave(&ioapic_lock, flags);
-       __mask_IO_APIC_irq(cfg);
-       raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+static void __unmask_ioapic(struct irq_cfg *cfg)
+{
+       io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);
  }
  
-static void unmask_IO_APIC_irq_desc(struct irq_desc *desc)
+static void unmask_ioapic(struct irq_cfg *cfg)
  {
-       struct irq_cfg *cfg = desc->chip_data;
         unsigned long flags;
  
         raw_spin_lock_irqsave(&ioapic_lock, flags);
-       __unmask_IO_APIC_irq(cfg);
+       __unmask_ioapic(cfg);
         raw_spin_unlock_irqrestore(&ioapic_lock, flags);
  }
  
-static void mask_IO_APIC_irq(unsigned int irq)
+static void unmask_ioapic_irq(struct irq_data *data)
  {
-       struct irq_desc *desc = irq_to_desc(irq);
-
-       mask_IO_APIC_irq_desc(desc);
-}
-static void unmask_IO_APIC_irq(unsigned int irq)
-{
-       struct irq_desc *desc = irq_to_desc(irq);
-
-       unmask_IO_APIC_irq_desc(desc);
+       unmask_ioapic(data->chip_data);
  }
  
  static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
@@ -694,14 +595,14 @@ struct IO_APIC_route_entry **alloc_ioapic_entries(void)
         struct IO_APIC_route_entry **ioapic_entries;
  
         ioapic_entries = kzalloc(sizeof(*ioapic_entries) * nr_ioapics,
-                               GFP_ATOMIC);
+                               GFP_KERNEL);
         if (!ioapic_entries)
                 return 0;
  
         for (apic = 0; apic < nr_ioapics; apic++) {
                 ioapic_entries[apic] =
                         kzalloc(sizeof(struct IO_APIC_route_entry) *
-                               nr_ioapic_registers[apic], GFP_ATOMIC);
+                               nr_ioapic_registers[apic], GFP_KERNEL);
                 if (!ioapic_entries[apic])
                         goto nomem;
         }
@@ -1259,7 +1160,6 @@ void __setup_vector_irq(int cpu)
         /* Initialize vector_irq on a new cpu */
         int irq, vector;
         struct irq_cfg *cfg;
-       struct irq_desc *desc;
  
         /*
          * vector_lock will make sure that we don't run into irq vector
@@ -1268,9 +1168,10 @@ void __setup_vector_irq(int cpu)
          */
         raw_spin_lock(&vector_lock);
         /* Mark the inuse vectors */
-       for_each_irq_desc(irq, desc) {
-               cfg = desc->chip_data;
-
+       for_each_active_irq(irq) {
+               cfg = get_irq_chip_data(irq);
+               if (!cfg)
+                       continue;
                 /*
                  * If it is a legacy IRQ handled by the legacy PIC, this cpu
                  * will be part of the irq_cfg's domain.
@@ -1327,17 +1228,17 @@ static inline int IO_APIC_irq_trigger(int irq)
  }
  #endif
  
-static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long trigger)
+static void ioapic_register_intr(unsigned int irq, unsigned long trigger)
  {
  
         if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
             trigger == IOAPIC_LEVEL)
-               desc->status |= IRQ_LEVEL;
+               irq_set_status_flags(irq, IRQ_LEVEL);
         else
-               desc->status &= ~IRQ_LEVEL;
+               irq_clear_status_flags(irq, IRQ_LEVEL);
  
-       if (irq_remapped(irq)) {
-               desc->status |= IRQ_MOVE_PCNTXT;
+       if (irq_remapped(get_irq_chip_data(irq))) {
+               irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
                 if (trigger)
                         set_irq_chip_and_handler_name(irq, &ir_ioapic_chip,
                                                       handle_fasteoi_irq,
@@ -1358,10 +1259,10 @@ static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long t
                                               handle_edge_irq, "edge");
  }
  
-int setup_ioapic_entry(int apic_id, int irq,
-                      struct IO_APIC_route_entry *entry,
-                      unsigned int destination, int trigger,
-                      int polarity, int vector, int pin)
+static int setup_ioapic_entry(int apic_id, int irq,
+                             struct IO_APIC_route_entry *entry,
+                             unsigned int destination, int trigger,
+                             int polarity, int vector, int pin)
  {
         /*
          * add it to the IO-APIC irq-routing table:
@@ -1382,21 +1283,7 @@ int setup_ioapic_entry(int apic_id, int irq,
                 if (index < 0)
                         panic("Failed to allocate IRTE for ioapic %d\n", apic_id);
  
-               memset(&irte, 0, sizeof(irte));
-
-               irte.present = 1;
-               irte.dst_mode = apic->irq_dest_mode;
-               /*
-                * Trigger mode in the IRTE will always be edge, and the
-                * actual level or edge trigger will be setup in the IO-APIC
-                * RTE. This will help simplify level triggered irq migration.
-                * For more details, see the comments above explainig IO-APIC
-                * irq migration in the presence of interrupt-remapping.
-                */
-               irte.trigger_mode = 0;
-               irte.dlvry_mode = apic->irq_delivery_mode;
-               irte.vector = vector;
-               irte.dest_id = IRTE_DEST(destination);
+               prepare_irte(&irte, vector, destination);
  
                 /* Set source-id of interrupt request */
                 set_ioapic_sid(&irte, apic_id);
@@ -1431,18 +1318,14 @@ int setup_ioapic_entry(int apic_id, int irq,
         return 0;
  }
  
-static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq_desc *desc,
-                             int trigger, int polarity)
+static void setup_ioapic_irq(int apic_id, int pin, unsigned int irq,
+                            struct irq_cfg *cfg, int trigger, int polarity)
  {
-       struct irq_cfg *cfg;
         struct IO_APIC_route_entry entry;
         unsigned int dest;
  
         if (!IO_APIC_IRQ(irq))
                 return;
-
-       cfg = desc->chip_data;
-
         /*
          * For legacy irqs, cfg->domain starts with cpu 0 for legacy
          * controllers like 8259. Now that IO-APIC can handle this irq, update
@@ -1471,9 +1354,9 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq
                 return;
         }
  
-       ioapic_register_intr(irq, desc, trigger);
+       ioapic_register_intr(irq, trigger);
         if (irq < legacy_pic->nr_legacy_irqs)
-               legacy_pic->chip->mask(irq);
+               legacy_pic->mask(irq);
  
         ioapic_write_entry(apic_id, pin, entry);
  }
@@ -1484,11 +1367,9 @@ static struct {
  
  static void __init setup_IO_APIC_irqs(void)
  {
-       int apic_id, pin, idx, irq;
-       int notcon = 0;
-       struct irq_desc *desc;
+       int apic_id, pin, idx, irq, notcon = 0;
+       int node = cpu_to_node(0);
         struct irq_cfg *cfg;
-       int node = cpu_to_node(boot_cpu_id);
  
         apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
  
@@ -1525,19 +1406,17 @@ static void __init setup_IO_APIC_irqs(void)
                                 apic->multi_timer_check(apic_id, irq))
                         continue;
  
-               desc = irq_to_desc_alloc_node(irq, node);
-               if (!desc) {
-                       printk(KERN_INFO "can not get irq_desc for %d\n", irq);
+               cfg = alloc_irq_and_cfg_at(irq, node);
+               if (!cfg)
                         continue;
-               }
-               cfg = desc->chip_data;
+
                 add_pin_to_irq_node(cfg, node, apic_id, pin);
                 /*
                  * don't mark it in pin_programmed, so later acpi could
                  * set it correctly when irq < 16
                  */
-               setup_IO_APIC_irq(apic_id, pin, irq, desc,
-                               irq_trigger(idx), irq_polarity(idx));
+               setup_ioapic_irq(apic_id, pin, irq, cfg, irq_trigger(idx),
+                                 irq_polarity(idx));
         }
  
         if (notcon)
@@ -1552,9 +1431,7 @@ static void __init setup_IO_APIC_irqs(void)
   */
  void setup_IO_APIC_irq_extra(u32 gsi)
  {
-       int apic_id = 0, pin, idx, irq;
-       int node = cpu_to_node(boot_cpu_id);
-       struct irq_desc *desc;
+       int apic_id = 0, pin, idx, irq, node = cpu_to_node(0);
         struct irq_cfg *cfg;
  
         /*
@@ -1570,18 +1447,15 @@ void setup_IO_APIC_irq_extra(u32 gsi)
                 return;
  
         irq = pin_2_irq(idx, apic_id, pin);
-#ifdef CONFIG_SPARSE_IRQ
-       desc = irq_to_desc(irq);
-       if (desc)
+
+       /* Only handle the non legacy irqs on secondary ioapics */
+       if (apic_id == 0 || irq < NR_IRQS_LEGACY)
                 return;
-#endif
-       desc = irq_to_desc_alloc_node(irq, node);
-       if (!desc) {
-               printk(KERN_INFO "can not get irq_desc for %d\n", irq);
+
+       cfg = alloc_irq_and_cfg_at(irq, node);
+       if (!cfg)
                 return;
-       }
  
-       cfg = desc->chip_data;
         add_pin_to_irq_node(cfg, node, apic_id, pin);
  
         if (test_bit(pin, mp_ioapic_routing[apic_id].pin_programmed)) {
@@ -1591,7 +1465,7 @@ void setup_IO_APIC_irq_extra(u32 gsi)
         }
         set_bit(pin, mp_ioapic_routing[apic_id].pin_programmed);
  
-       setup_IO_APIC_irq(apic_id, pin, irq, desc,
+       setup_ioapic_irq(apic_id, pin, irq, cfg,
                         irq_trigger(idx), irq_polarity(idx));
  }
  
@@ -1642,7 +1516,6 @@ __apicdebuginit(void) print_IO_APIC(void)
         union IO_APIC_reg_03 reg_03;
         unsigned long flags;
         struct irq_cfg *cfg;
-       struct irq_desc *desc;
         unsigned int irq;
  
         printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
@@ -1729,10 +1602,10 @@ __apicdebuginit(void) print_IO_APIC(void)
         }
         }
         printk(KERN_DEBUG "IRQ to pin mappings:\n");
-       for_each_irq_desc(irq, desc) {
+       for_each_active_irq(irq) {
                 struct irq_pin_list *entry;
  
-               cfg = desc->chip_data;
+               cfg = get_irq_chip_data(irq);
                 if (!cfg)
                         continue;
                 entry = cfg->irq_2_pin;
@@ -2239,29 +2112,26 @@ static int __init timer_irq_works(void)
   * an edge even if it isn't on the 8259A...
   */
  
-static unsigned int startup_ioapic_irq(unsigned int irq)
+static unsigned int startup_ioapic_irq(struct irq_data *data)
  {
-       int was_pending = 0;
+       int was_pending = 0, irq = data->irq;
         unsigned long flags;
-       struct irq_cfg *cfg;
  
         raw_spin_lock_irqsave(&ioapic_lock, flags);
         if (irq < legacy_pic->nr_legacy_irqs) {
-               legacy_pic->chip->mask(irq);
+               legacy_pic->mask(irq);
                 if (legacy_pic->irq_pending(irq))
                         was_pending = 1;
         }
-       cfg = irq_cfg(irq);
-       __unmask_IO_APIC_irq(cfg);
+       __unmask_ioapic(data->chip_data);
         raw_spin_unlock_irqrestore(&ioapic_lock, flags);
  
         return was_pending;
  }
  
-static int ioapic_retrigger_irq(unsigned int irq)
+static int ioapic_retrigger_irq(struct irq_data *data)
  {
-
-       struct irq_cfg *cfg = irq_cfg(irq);
+       struct irq_cfg *cfg = data->chip_data;
         unsigned long flags;
  
         raw_spin_lock_irqsave(&vector_lock, flags);
@@ -2312,7 +2182,7 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq
                  * With interrupt-remapping, destination information comes
                  * from interrupt-remapping table entry.
                  */
-               if (!irq_remapped(irq))
+               if (!irq_remapped(cfg))
                         io_apic_write(apic, 0x11 + pin*2, dest);
                 reg = io_apic_read(apic, 0x10 + pin*2);
                 reg &= ~IO_APIC_REDIR_VECTOR_MASK;
@@ -2322,65 +2192,46 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq
  }
  
  /*
- * Either sets desc->affinity to a valid value, and returns
+ * Either sets data->affinity to a valid value, and returns
   * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and
- * leaves desc->affinity untouched.
+ * leaves data->affinity untouched.
   */
-unsigned int
-set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask,
-                 unsigned int *dest_id)
+int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
+                         unsigned int *dest_id)
  {
-       struct irq_cfg *cfg;
-       unsigned int irq;
+       struct irq_cfg *cfg = data->chip_data;
  
         if (!cpumask_intersects(mask, cpu_online_mask))
                 return -1;
  
-       irq = desc->irq;
-       cfg = desc->chip_data;
-       if (assign_irq_vector(irq, cfg, mask))
+       if (assign_irq_vector(data->irq, data->chip_data, mask))
                 return -1;
  
-       cpumask_copy(desc->affinity, mask);
+       cpumask_copy(data->affinity, mask);
  
-       *dest_id = apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain);
+       *dest_id = apic->cpu_mask_to_apicid_and(mask, cfg->domain);
         return 0;
  }
  
  static int
-set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
+ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
+                   bool force)
  {
-       struct irq_cfg *cfg;
+       unsigned int dest, irq = data->irq;
         unsigned long flags;
-       unsigned int dest;
-       unsigned int irq;
-       int ret = -1;
-
-       irq = desc->irq;
-       cfg = desc->chip_data;
+       int ret;
  
         raw_spin_lock_irqsave(&ioapic_lock, flags);
-       ret = set_desc_affinity(desc, mask, &dest);
+       ret = __ioapic_set_affinity(data, mask, &dest);
         if (!ret) {
                 /* Only the high 8 bits are valid. */
                 dest = SET_APIC_LOGICAL_ID(dest);
-               __target_IO_APIC_irq(irq, dest, cfg);
+               __target_IO_APIC_irq(irq, dest, data->chip_data);
         }
         raw_spin_unlock_irqrestore(&ioapic_lock, flags);
-
         return ret;
  }
  
-static int
-set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
-{
-       struct irq_desc *desc;
-
-       desc = irq_to_desc(irq);
-
-       return set_ioapic_affinity_irq_desc(desc, mask);
-}
-
  #ifdef CONFIG_INTR_REMAP
  
  /*
@@ -2395,24 +2246,21 @@ set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
   * the interrupt-remapping table entry.
   */
  static int
-migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
+ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
+                      bool force)
  {
-       struct irq_cfg *cfg;
+       struct irq_cfg *cfg = data->chip_data;
+       unsigned int dest, irq = data->irq;
         struct irte irte;
-       unsigned int dest;
-       unsigned int irq;
-       int ret = -1;
  
         if (!cpumask_intersects(mask, cpu_online_mask))
-               return ret;
+               return -EINVAL;
  
-       irq = desc->irq;
         if (get_irte(irq, &irte))
-               return ret;
+               return -EBUSY;
  
-       cfg = desc->chip_data;
         if (assign_irq_vector(irq, cfg, mask))
-               return ret;
+               return -EBUSY;
  
         dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask);
  
@@ -2427,29 +2275,14 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
         if (cfg->move_in_progress)
                 send_cleanup_vector(cfg);
  
-       cpumask_copy(desc->affinity, mask);
-
+       cpumask_copy(data->affinity, mask);
         return 0;
  }
  
-/*
- * Migrates the IRQ destination in the process context.
- */
-static int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
-                                           const struct cpumask *mask)
-{
-       return migrate_ioapic_irq_desc(desc, mask);
-}
-static int set_ir_ioapic_affinity_irq(unsigned int irq,
-                                      const struct cpumask *mask)
-{
-       struct irq_desc *desc = irq_to_desc(irq);
-
-       return set_ir_ioapic_affinity_irq_desc(desc, mask);
-}
  #else
-static inline int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
-                                                  const struct cpumask *mask)
+static inline int
+ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
+                      bool force)
  {
         return 0;
  }
@@ -2511,10 +2344,8 @@ unlock:
         irq_exit();
  }
  
-static void __irq_complete_move(struct irq_desc **descp, unsigned vector)
+static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector)
  {
-       struct irq_desc *desc = *descp;
-       struct irq_cfg *cfg = desc->chip_data;
         unsigned me;
  
         if (likely(!cfg->move_in_progress))
@@ -2526,31 +2357,28 @@ static void __irq_complete_move(struct irq_desc **descp, unsigned vector)
                 send_cleanup_vector(cfg);
  }
  
-static void irq_complete_move(struct irq_desc **descp)
+static void irq_complete_move(struct irq_cfg *cfg)
  {
-       __irq_complete_move(descp, ~get_irq_regs()->orig_ax);
+       __irq_complete_move(cfg, ~get_irq_regs()->orig_ax);
  }
  
  void irq_force_complete_move(int irq)
  {
-       struct irq_desc *desc = irq_to_desc(irq);
-       struct irq_cfg *cfg = desc->chip_data;
+       struct irq_cfg *cfg = get_irq_chip_data(irq);
  
         if (!cfg)
                 return;
  
-       __irq_complete_move(&desc, cfg->vector);
+       __irq_complete_move(cfg, cfg->vector);
  }
  #else
-static inline void irq_complete_move(struct irq_desc **descp) {}
+static inline void irq_complete_move(struct irq_cfg *cfg) { }
  #endif
  
-static void ack_apic_edge(unsigned int irq)
+static void ack_apic_edge(struct irq_data *data)
  {
-       struct irq_desc *desc = irq_to_desc(irq);
-
-       irq_complete_move(&desc);
-       move_native_irq(irq);
+       irq_complete_move(data->chip_data);
+       move_native_irq(data->irq);
         ack_APIC_irq();
  }
  
@@ -2572,10 +2400,12 @@ atomic_t irq_mis_count;
   * Otherwise, we simulate the EOI message manually by changing the trigger
   * mode to edge and then back to level, with RTE being masked during this.
  */
-static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
+static void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
  {
         struct irq_pin_list *entry;
+       unsigned long flags;
  
+       raw_spin_lock_irqsave(&ioapic_lock, flags);
         for_each_irq_pin(entry, cfg->irq_2_pin) {
                 if (mp_ioapics[entry->apic].apicver >= 0x20) {
                         /*
@@ -2584,7 +2414,7 @@ static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
                          * intr-remapping table entry. Hence for the io-apic
                          * EOI we use the pin number.
                          */
-                       if (irq_remapped(irq))
+                       if (irq_remapped(cfg))
                                 io_apic_eoi(entry->apic, entry->pin);
                         else
                                 io_apic_eoi(entry->apic, cfg->vector);
@@ -2593,36 +2423,22 @@ static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
                         __unmask_and_level_IO_APIC_irq(entry);
                 }
         }
-}
-
-static void eoi_ioapic_irq(struct irq_desc *desc)
-{
-       struct irq_cfg *cfg;
-       unsigned long flags;
-       unsigned int irq;
-
-       irq = desc->irq;
-       cfg = desc->chip_data;
-
-       raw_spin_lock_irqsave(&ioapic_lock, flags);
-       __eoi_ioapic_irq(irq, cfg);
         raw_spin_unlock_irqrestore(&ioapic_lock, flags);
  }
  
-static void ack_apic_level(unsigned int irq)
+static void ack_apic_level(struct irq_data *data)
  {
+       struct irq_cfg *cfg = data->chip_data;
+       int i, do_unmask_irq = 0, irq = data->irq;
         struct irq_desc *desc = irq_to_desc(irq);
         unsigned long v;
-       int i;
-       struct irq_cfg *cfg;
-       int do_unmask_irq = 0;
  
-       irq_complete_move(&desc);
+       irq_complete_move(cfg);
  #ifdef CONFIG_GENERIC_PENDING_IRQ
         /* If we are moving the irq we need to mask it */
         if (unlikely(desc->status & IRQ_MOVE_PENDING)) {
                 do_unmask_irq = 1;
-               mask_IO_APIC_irq_desc(desc);
+               mask_ioapic(cfg);
         }
  #endif
  
@@ -2658,7 +2474,6 @@ static void ack_apic_level(unsigned int irq)
          * we use the above logic (mask+edge followed by unmask+level) from
          * Manfred Spraul to clear the remote IRR.
          */
-       cfg = desc->chip_data;
         i = cfg->vector;
         v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
  
@@ -2678,7 +2493,7 @@ static void ack_apic_level(unsigned int irq)
         if (!(v & (1 << (i & 0x1f)))) {
                 atomic_inc(&irq_mis_count);
  
-               eoi_ioapic_irq(desc);
+               eoi_ioapic_irq(irq, cfg);
         }
  
         /* Now we can move and renable the irq */
@@ -2709,61 +2524,57 @@ static void ack_apic_level(unsigned int irq)
                  * accurate and is causing problems then it is a hardware bug
                  * and you can go talk to the chipset vendor about it.
                  */
-               cfg = desc->chip_data;
                 if (!io_apic_level_ack_pending(cfg))
                         move_masked_irq(irq);
-               unmask_IO_APIC_irq_desc(desc);
+               unmask_ioapic(cfg);
         }
  }
  
  #ifdef CONFIG_INTR_REMAP
-static void ir_ack_apic_edge(unsigned int irq)
+static void ir_ack_apic_edge(struct irq_data *data)
  {
         ack_APIC_irq();
  }
  
-static void ir_ack_apic_level(unsigned int irq)
+static void ir_ack_apic_level(struct irq_data *data)
  {
-       struct irq_desc *desc = irq_to_desc(irq);
-
         ack_APIC_irq();
-       eoi_ioapic_irq(desc);
+       eoi_ioapic_irq(data->irq, data->chip_data);
  }
  #endif /* CONFIG_INTR_REMAP */
  
  static struct irq_chip ioapic_chip __read_mostly = {
-       .name           = "IO-APIC",
-       .startup        = startup_ioapic_irq,
-       .mask           = mask_IO_APIC_irq,
-       .unmask         = unmask_IO_APIC_irq,
-       .ack            = ack_apic_edge,
-       .eoi            = ack_apic_level,
+       .name                   = "IO-APIC",
+       .irq_startup            = startup_ioapic_irq,
+       .irq_mask               = mask_ioapic_irq,
+       .irq_unmask             = unmask_ioapic_irq,
+       .irq_ack                = ack_apic_edge,
+       .irq_eoi                = ack_apic_level,
  #ifdef CONFIG_SMP
-       .set_affinity   = set_ioapic_affinity_irq,
+       .irq_set_affinity       = ioapic_set_affinity,
  #endif
-       .retrigger      = ioapic_retrigger_irq,
+       .irq_retrigger          = ioapic_retrigger_irq,
  };
  
  static struct irq_chip ir_ioapic_chip __read_mostly = {
-       .name           = "IR-IO-APIC",
-       .startup        = startup_ioapic_irq,
-       .mask           = mask_IO_APIC_irq,
-       .unmask         = unmask_IO_APIC_irq,
+       .name                   = "IR-IO-APIC",
+       .irq_startup            = startup_ioapic_irq,
+       .irq_mask               = mask_ioapic_irq,
+       .irq_unmask             = unmask_ioapic_irq,
  #ifdef CONFIG_INTR_REMAP
-       .ack            = ir_ack_apic_edge,
-       .eoi            = ir_ack_apic_level,
+       .irq_ack                = ir_ack_apic_edge,
+       .irq_eoi                = ir_ack_apic_level,
  #ifdef CONFIG_SMP
-       .set_affinity   = set_ir_ioapic_affinity_irq,
+       .irq_set_affinity       = ir_ioapic_set_affinity,
  #endif
  #endif
-       .retrigger      = ioapic_retrigger_irq,
+       .irq_retrigger          = ioapic_retrigger_irq,
  };
  
  static inline void init_IO_APIC_traps(void)
  {
-       int irq;
-       struct irq_desc *desc;
         struct irq_cfg *cfg;
+       unsigned int irq;
  
         /*
          * NOTE! The local APIC isn't very good at handling
@@ -2776,8 +2587,8 @@ static inline void init_IO_APIC_traps(void)
          * Also, we've got to be careful not to trash gate
          * 0x80, because int 0x80 is hm, kind of importantish. ;)
          */
-       for_each_irq_desc(irq, desc) {
-               cfg = desc->chip_data;
+       for_each_active_irq(irq) {
+               cfg = get_irq_chip_data(irq);
                 if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {
                         /*
                          * Hmm.. We don't have an entry for this,
@@ -2788,7 +2599,7 @@ static inline void init_IO_APIC_traps(void)
                                 legacy_pic->make_irq(irq);
                         else
                                 /* Strange. Oh, well.. */
-                               desc->chip = &no_irq_chip;
+                               set_irq_chip(irq, &no_irq_chip);
                 }
         }
  }
@@ -2797,7 +2608,7 @@ static inline void init_IO_APIC_traps(void)
   * The local APIC irq-chip implementation:
   */
  
-static void mask_lapic_irq(unsigned int irq)
+static void mask_lapic_irq(struct irq_data *data)
  {
         unsigned long v;
  
@@ -2805,7 +2616,7 @@ static void mask_lapic_irq(unsigned int irq)
         apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
  }
  
-static void unmask_lapic_irq(unsigned int irq)
+static void unmask_lapic_irq(struct irq_data *data)
  {
         unsigned long v;
  
@@ -2813,21 +2624,21 @@ static void unmask_lapic_irq(unsigned int irq)
         apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
  }
  
-static void ack_lapic_irq(unsigned int irq)
+static void ack_lapic_irq(struct irq_data *data)
  {
         ack_APIC_irq();
  }
  
  static struct irq_chip lapic_chip __read_mostly = {
         .name           = "local-APIC",
-       .mask           = mask_lapic_irq,
-       .unmask         = unmask_lapic_irq,
-       .ack            = ack_lapic_irq,
+       .irq_mask       = mask_lapic_irq,
+       .irq_unmask     = unmask_lapic_irq,
+       .irq_ack        = ack_lapic_irq,
  };
  
-static void lapic_register_intr(int irq, struct irq_desc *desc)
+static void lapic_register_intr(int irq)
  {
-       desc->status &= ~IRQ_LEVEL;
+       irq_clear_status_flags(irq, IRQ_LEVEL);
         set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
                                       "edge");
  }
@@ -2930,9 +2741,8 @@ int timer_through_8259 __initdata;
   */
  static inline void __init check_timer(void)
  {
-       struct irq_desc *desc = irq_to_desc(0);
-       struct irq_cfg *cfg = desc->chip_data;
-       int node = cpu_to_node(boot_cpu_id);
+       struct irq_cfg *cfg = get_irq_chip_data(0);
+       int node = cpu_to_node(0);
         int apic1, pin1, apic2, pin2;
         unsigned long flags;
         int no_pin1 = 0;
@@ -2942,7 +2752,7 @@ static inline void __init check_timer(void)
         /*
          * get/set the timer IRQ vector:
          */
-       legacy_pic->chip->mask(0);
+       legacy_pic->mask(0);
         assign_irq_vector(0, cfg, apic->target_cpus());
  
         /*
@@ -3001,7 +2811,7 @@ static inline void __init check_timer(void)
                         add_pin_to_irq_node(cfg, node, apic1, pin1);
                         setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
                 } else {
-                       /* for edge trigger, setup_IO_APIC_irq already
+                       /* for edge trigger, setup_ioapic_irq already
                          * leave it unmasked.
                          * so only need to unmask if it is level-trigger
                          * do we really have level trigger timer?
@@ -3009,12 +2819,12 @@ static inline void __init check_timer(void)
                         int idx;
                         idx = find_irq_entry(apic1, pin1, mp_INT);
                         if (idx != -1 && irq_trigger(idx))
-                               unmask_IO_APIC_irq_desc(desc);
+                               unmask_ioapic(cfg);
                 }
                 if (timer_irq_works()) {
                         if (nmi_watchdog == NMI_IO_APIC) {
                                 setup_nmi();
-                               legacy_pic->chip->unmask(0);
+                               legacy_pic->unmask(0);
                         }
                         if (disable_timer_pin_1 > 0)
                                 clear_IO_APIC_pin(0, pin1);
@@ -3037,14 +2847,14 @@ static inline void __init check_timer(void)
                  */
                 replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2);
                 setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
-               legacy_pic->chip->unmask(0);
+               legacy_pic->unmask(0);
                 if (timer_irq_works()) {
                         apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
                         timer_through_8259 = 1;
                         if (nmi_watchdog == NMI_IO_APIC) {
-                               legacy_pic->chip->mask(0);
+                               legacy_pic->mask(0);
                                 setup_nmi();
-                               legacy_pic->chip->unmask(0);
+                               legacy_pic->unmask(0);
                         }
                         goto out;
                 }
@@ -3052,7 +2862,7 @@ static inline void __init check_timer(void)
                  * Cleanup, just in case ...
                  */
                 local_irq_disable();
-               legacy_pic->chip->mask(0);
+               legacy_pic->mask(0);
                 clear_IO_APIC_pin(apic2, pin2);
                 apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
         }
@@ -3069,16 +2879,16 @@ static inline void __init check_timer(void)
         apic_printk(APIC_QUIET, KERN_INFO
                     "...trying to set up timer as Virtual Wire IRQ...\n");
  
-       lapic_register_intr(0, desc);
+       lapic_register_intr(0);
         apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector);     /* Fixed mode */
-       legacy_pic->chip->unmask(0);
+       legacy_pic->unmask(0);
  
         if (timer_irq_works()) {
                 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
                 goto out;
         }
         local_irq_disable();
-       legacy_pic->chip->mask(0);
+       legacy_pic->mask(0);
         apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
         apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
  
@@ -3244,49 +3054,42 @@ device_initcall(ioapic_init_sysfs);
  /*
   * Dynamic irq allocate and deallocation
   */
-unsigned int create_irq_nr(unsigned int irq_want, int node)
+unsigned int create_irq_nr(unsigned int from, int node)
  {
-       /* Allocate an unused irq */
-       unsigned int irq;
-       unsigned int new;
+       struct irq_cfg *cfg;
         unsigned long flags;
-       struct irq_cfg *cfg_new = NULL;
-       struct irq_desc *desc_new = NULL;
-
-       irq = 0;
-       if (irq_want < nr_irqs_gsi)
-               irq_want = nr_irqs_gsi;
-
-       raw_spin_lock_irqsave(&vector_lock, flags);
-       for (new = irq_want; new < nr_irqs; new++) {
-               desc_new = irq_to_desc_alloc_node(new, node);
-               if (!desc_new) {
-                       printk(KERN_INFO "can not get irq_desc for %d\n", new);
-                       continue;
-               }
-               cfg_new = desc_new->chip_data;
-
-               if (cfg_new->vector != 0)
-                       continue;
+       unsigned int ret = 0;
+       int irq;
  
-               desc_new = move_irq_desc(desc_new, node);
-               cfg_new = desc_new->chip_data;
+       if (from < nr_irqs_gsi)
+               from = nr_irqs_gsi;
  
-               if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0)
-                       irq = new;
-               break;
+       irq = alloc_irq_from(from, node);
+       if (irq < 0)
+               return 0;
+       cfg = alloc_irq_cfg(irq, node);
+       if (!cfg) {
+               free_irq_at(irq, NULL);
+               return 0;
         }
-       raw_spin_unlock_irqrestore(&vector_lock, flags);
  
-       if (irq > 0)
-               dynamic_irq_init_keep_chip_data(irq);
+       raw_spin_lock_irqsave(&vector_lock, flags);
+       if (!__assign_irq_vector(irq, cfg, apic->target_cpus()))
+               ret = irq;
+       raw_spin_unlock_irqrestore(&vector_lock, flags);
  
-       return irq;
+       if (ret) {
+               set_irq_chip_data(irq, cfg);
+               irq_clear_status_flags(irq, IRQ_NOREQUEST);
+       } else {
+               free_irq_at(irq, cfg);
+       }
+       return ret;
  }
  
  int create_irq(void)
  {
-       int node = cpu_to_node(boot_cpu_id);
+       int node = cpu_to_node(0);
         unsigned int irq_want;
         int irq;
  
@@ -3301,14 +3104,17 @@ int create_irq(void)
  
  void destroy_irq(unsigned int irq)
  {
+       struct irq_cfg *cfg = get_irq_chip_data(irq);
         unsigned long flags;
  
-       dynamic_irq_cleanup_keep_chip_data(irq);
+       irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE);
  
-       free_irte(irq);
+       if (intr_remapping_enabled)
+               free_irte(irq);
         raw_spin_lock_irqsave(&vector_lock, flags);
-       __clear_irq_vector(irq, get_irq_chip_data(irq));
+       __clear_irq_vector(irq, cfg);
         raw_spin_unlock_irqrestore(&vector_lock, flags);
+       free_irq_at(irq, cfg);
  }
  
  /*
@@ -3332,7 +3138,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
  
         dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
  
-       if (irq_remapped(irq)) {
+       if (irq_remapped(get_irq_chip_data(irq))) {
                 struct irte irte;
                 int ir_index;
                 u16 sub_handle;
@@ -3340,14 +3146,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
                 ir_index = map_irq_to_irte_handle(irq, &sub_handle);
                 BUG_ON(ir_index == -1);
  
-               memset (&irte, 0, sizeof(irte));
-
-               irte.present = 1;
-               irte.dst_mode = apic->irq_dest_mode;
-               irte.trigger_mode = 0; /* edge */
-               irte.dlvry_mode = apic->irq_delivery_mode;
-               irte.vector = cfg->vector;
-               irte.dest_id = IRTE_DEST(dest);
+               prepare_irte(&irte, cfg->vector, dest);
  
                 /* Set source-id of interrupt request */
                 if (pdev)
@@ -3392,26 +3191,24 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
  }
  
  #ifdef CONFIG_SMP
-static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
+static int
+msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
  {
-       struct irq_desc *desc = irq_to_desc(irq);
-       struct irq_cfg *cfg;
+       struct irq_cfg *cfg = data->chip_data;
         struct msi_msg msg;
         unsigned int dest;
  
-       if (set_desc_affinity(desc, mask, &dest))
+       if (__ioapic_set_affinity(data, mask, &dest))
                 return -1;
  
-       cfg = desc->chip_data;
-
-       get_cached_msi_msg_desc(desc, &msg);
+       __get_cached_msi_msg(data->msi_desc, &msg);
  
         msg.data &= ~MSI_DATA_VECTOR_MASK;
         msg.data |= MSI_DATA_VECTOR(cfg->vector);
         msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
         msg.address_lo |= MSI_ADDR_DEST_ID(dest);
  
-       write_msi_msg_desc(desc, &msg);
+       __write_msi_msg(data->msi_desc, &msg);
  
         return 0;
  }
@@ -3421,17 +3218,17 @@ static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
   * done in the process context using interrupt-remapping hardware.
   */
  static int
-ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
+ir_msi_set_affinity(struct irq_data *data, const struct cpumask *mask,
+                   bool force)
  {
-       struct irq_desc *desc = irq_to_desc(irq);
-       struct irq_cfg *cfg = desc->chip_data;
-       unsigned int dest;
+       struct irq_cfg *cfg = data->chip_data;
+       unsigned int dest, irq = data->irq;
         struct irte irte;
  
         if (get_irte(irq, &irte))
                 return -1;
  
-       if (set_desc_affinity(desc, mask, &dest))
+       if (__ioapic_set_affinity(data, mask, &dest))
                 return -1;
  
         irte.vector = cfg->vector;
@@ -3461,27 +3258,27 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
   * which implement the MSI or MSI-X Capability Structure.
   */
  static struct irq_chip msi_chip = {
-       .name           = "PCI-MSI",
-       .unmask         = unmask_msi_irq,
-       .mask           = mask_msi_irq,
-       .ack            = ack_apic_edge,
+       .name                   = "PCI-MSI",
+       .irq_unmask             = unmask_msi_irq,
+       .irq_mask               = mask_msi_irq,
+       .irq_ack                = ack_apic_edge,
  #ifdef CONFIG_SMP
-       .set_affinity   = set_msi_irq_affinity,
+       .irq_set_affinity       = msi_set_affinity,
  #endif
-       .retrigger      = ioapic_retrigger_irq,
+       .irq_retrigger          = ioapic_retrigger_irq,
  };
  
  static struct irq_chip msi_ir_chip = {
-       .name           = "IR-PCI-MSI",
-       .unmask         = unmask_msi_irq,
-       .mask           = mask_msi_irq,
+       .name                   = "IR-PCI-MSI",
+       .irq_unmask             = unmask_msi_irq,
+       .irq_mask               = mask_msi_irq,
  #ifdef CONFIG_INTR_REMAP
-       .ack            = ir_ack_apic_edge,
+       .irq_ack                = ir_ack_apic_edge,
  #ifdef CONFIG_SMP
-       .set_affinity   = ir_set_msi_irq_affinity,
+       .irq_set_affinity       = ir_msi_set_affinity,
  #endif
  #endif
-       .retrigger      = ioapic_retrigger_irq,
+       .irq_retrigger          = ioapic_retrigger_irq,
  };
  
  /*
@@ -3513,8 +3310,8 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
  
  static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
  {
-       int ret;
         struct msi_msg msg;
+       int ret;
  
         ret = msi_compose_msg(dev, irq, &msg, -1);
         if (ret < 0)
@@ -3523,12 +3320,8 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
         set_irq_msi(irq, msidesc);
         write_msi_msg(irq, &msg);
  
-       if (irq_remapped(irq)) {
-               struct irq_desc *desc = irq_to_desc(irq);
-               /*
-                * irq migration in process context
-                */
-               desc->status |= IRQ_MOVE_PCNTXT;
+       if (irq_remapped(get_irq_chip_data(irq))) {
+               irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
                 set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge");
         } else
                 set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
@@ -3540,13 +3333,10 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
  
  int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
  {
-       unsigned int irq;
-       int ret, sub_handle;
+       int node, ret, sub_handle, index = 0;
+       unsigned int irq, irq_want;
         struct msi_desc *msidesc;
-       unsigned int irq_want;
         struct intel_iommu *iommu = NULL;
-       int index = 0;
-       int node;
  
         /* x86 doesn't support multiple MSI yet */
         if (type == PCI_CAP_ID_MSI && nvec > 1)
@@ -3606,18 +3396,17 @@ void arch_teardown_msi_irq(unsigned int irq)
  
  #if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP)
  #ifdef CONFIG_SMP
-static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
+static int
+dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask,
+                     bool force)
  {
-       struct irq_desc *desc = irq_to_desc(irq);
-       struct irq_cfg *cfg;
+       struct irq_cfg *cfg = data->chip_data;
+       unsigned int dest, irq = data->irq;
         struct msi_msg msg;
-       unsigned int dest;
  
-       if (set_desc_affinity(desc, mask, &dest))
+       if (__ioapic_set_affinity(data, mask, &dest))
                 return -1;
  
-       cfg = desc->chip_data;
-
         dmar_msi_read(irq, &msg);
  
         msg.data &= ~MSI_DATA_VECTOR_MASK;
@@ -3633,14 +3422,14 @@ static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
  #endif /* CONFIG_SMP */
  
  static struct irq_chip dmar_msi_type = {
-       .name = "DMAR_MSI",
-       .unmask = dmar_msi_unmask,
-       .mask = dmar_msi_mask,
-       .ack = ack_apic_edge,
+       .name                   = "DMAR_MSI",
+       .irq_unmask             = dmar_msi_unmask,
+       .irq_mask               = dmar_msi_mask,
+       .irq_ack                = ack_apic_edge,
  #ifdef CONFIG_SMP
-       .set_affinity = dmar_msi_set_affinity,
+       .irq_set_affinity       = dmar_msi_set_affinity,
  #endif
-       .retrigger = ioapic_retrigger_irq,
+       .irq_retrigger          = ioapic_retrigger_irq,
  };
  
  int arch_setup_dmar_msi(unsigned int irq)
@@ -3661,26 +3450,24 @@ int arch_setup_dmar_msi(unsigned int irq)
  #ifdef CONFIG_HPET_TIMER
  
  #ifdef CONFIG_SMP
-static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
+static int hpet_msi_set_affinity(struct irq_data *data,
+                                const struct cpumask *mask, bool force)
  {
-       struct irq_desc *desc = irq_to_desc(irq);
-       struct irq_cfg *cfg;
+       struct irq_cfg *cfg = data->chip_data;
         struct msi_msg msg;
         unsigned int dest;
  
-       if (set_desc_affinity(desc, mask, &dest))
+       if (__ioapic_set_affinity(data, mask, &dest))
                 return -1;
  
-       cfg = desc->chip_data;
-
-       hpet_msi_read(irq, &msg);
+       hpet_msi_read(data->handler_data, &msg);
  
         msg.data &= ~MSI_DATA_VECTOR_MASK;
         msg.data |= MSI_DATA_VECTOR(cfg->vector);
         msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
         msg.address_lo |= MSI_ADDR_DEST_ID(dest);
  
-       hpet_msi_write(irq, &msg);
+       hpet_msi_write(data->handler_data, &msg);
  
         return 0;
  }
@@ -3688,34 +3475,33 @@ static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
  #endif /* CONFIG_SMP */
  
  static struct irq_chip ir_hpet_msi_type = {
-       .name = "IR-HPET_MSI",
-       .unmask = hpet_msi_unmask,
-       .mask = hpet_msi_mask,
+       .name                   = "IR-HPET_MSI",
+       .irq_unmask             = hpet_msi_unmask,
+       .irq_mask               = hpet_msi_mask,
  #ifdef CONFIG_INTR_REMAP
-       .ack = ir_ack_apic_edge,
+       .irq_ack                = ir_ack_apic_edge,
  #ifdef CONFIG_SMP
-       .set_affinity = ir_set_msi_irq_affinity,
+       .irq_set_affinity       = ir_msi_set_affinity,
  #endif
  #endif
-       .retrigger = ioapic_retrigger_irq,
+       .irq_retrigger          = ioapic_retrigger_irq,
  };
  
  static struct irq_chip hpet_msi_type = {
         .name = "HPET_MSI",
-       .unmask = hpet_msi_unmask,
-       .mask = hpet_msi_mask,
-       .ack = ack_apic_edge,
+       .irq_unmask = hpet_msi_unmask,
+       .irq_mask = hpet_msi_mask,
+       .irq_ack = ack_apic_edge,
  #ifdef CONFIG_SMP
-       .set_affinity = hpet_msi_set_affinity,
+       .irq_set_affinity = hpet_msi_set_affinity,
  #endif
-       .retrigger = ioapic_retrigger_irq,
+       .irq_retrigger = ioapic_retrigger_irq,
  };
  
  int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
  {
-       int ret;
         struct msi_msg msg;
-       struct irq_desc *desc = irq_to_desc(irq);
+       int ret;
  
         if (intr_remapping_enabled) {
                 struct intel_iommu *iommu = map_hpet_to_ir(id);
@@ -3733,9 +3519,9 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
         if (ret < 0)
                 return ret;
  
-       hpet_msi_write(irq, &msg);
-       desc->status |= IRQ_MOVE_PCNTXT;
-       if (irq_remapped(irq))
+       hpet_msi_write(get_irq_data(irq), &msg);
+       irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
+       if (irq_remapped(get_irq_chip_data(irq)))
                 set_irq_chip_and_handler_name(irq, &ir_hpet_msi_type,
                                               handle_edge_irq, "edge");
         else
@@ -3768,33 +3554,30 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
         write_ht_irq_msg(irq, &msg);
  }
  
-static int set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
+static int
+ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
  {
-       struct irq_desc *desc = irq_to_desc(irq);
-       struct irq_cfg *cfg;
+       struct irq_cfg *cfg = data->chip_data;
         unsigned int dest;
  
-       if (set_desc_affinity(desc, mask, &dest))
+       if (__ioapic_set_affinity(data, mask, &dest))
                 return -1;
  
-       cfg = desc->chip_data;
-
-       target_ht_irq(irq, dest, cfg->vector);
-
+       target_ht_irq(data->irq, dest, cfg->vector);
         return 0;
  }
  
  #endif
  
  static struct irq_chip ht_irq_chip = {
-       .name           = "PCI-HT",
-       .mask           = mask_ht_irq,
-       .unmask         = unmask_ht_irq,
-       .ack            = ack_apic_edge,
+       .name                   = "PCI-HT",
+       .irq_mask               = mask_ht_irq,
+       .irq_unmask             = unmask_ht_irq,
+       .irq_ack                = ack_apic_edge,
  #ifdef CONFIG_SMP
-       .set_affinity   = set_ht_irq_affinity,
+       .irq_set_affinity       = ht_set_affinity,
  #endif
-       .retrigger      = ioapic_retrigger_irq,
+       .irq_retrigger          = ioapic_retrigger_irq,
  };
  
  int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
@@ -3885,14 +3668,13 @@ int __init arch_probe_nr_irqs(void)
         if (nr < nr_irqs)
                 nr_irqs = nr;
  
-       return 0;
+       return NR_IRQS_LEGACY;
  }
  #endif
  
  static int __io_apic_set_pci_routing(struct device *dev, int irq,
                                 struct io_apic_irq_attr *irq_attr)
  {
-       struct irq_desc *desc;
         struct irq_cfg *cfg;
         int node;
         int ioapic, pin;
@@ -3908,13 +3690,11 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq,
         if (dev)
                 node = dev_to_node(dev);
         else
-               node = cpu_to_node(boot_cpu_id);
+               node = cpu_to_node(0);
  
-       desc = irq_to_desc_alloc_node(irq, node);
-       if (!desc) {
-               printk(KERN_INFO "can not get irq_desc %d\n", irq);
+       cfg = alloc_irq_and_cfg_at(irq, node);
+       if (!cfg)
                 return 0;
-       }
  
         pin = irq_attr->ioapic_pin;
         trigger = irq_attr->trigger;
@@ -3924,15 +3704,14 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq,
          * IRQs < 16 are already in the irq_2_pin[] map
          */
         if (irq >= legacy_pic->nr_legacy_irqs) {
-               cfg = desc->chip_data;
-               if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) {
+               if (__add_pin_to_irq_node(cfg, node, ioapic, pin)) {
                         printk(KERN_INFO "can not add pin %d for irq %d\n",
                                 pin, irq);
                         return 0;
                 }
         }
  
-       setup_IO_APIC_irq(ioapic, pin, irq, desc, trigger, polarity);
+       setup_ioapic_irq(ioapic, pin, irq, cfg, trigger, polarity);
  
         return 0;
  }
@@ -4125,14 +3904,14 @@ void __init setup_ioapic_dest(void)
                  */
                 if (desc->status &
                     (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
-                       mask = desc->affinity;
+                       mask = desc->irq_data.affinity;
                 else
                         mask = apic->target_cpus();
  
                 if (intr_remapping_enabled)
-                       set_ir_ioapic_affinity_irq_desc(desc, mask);
+                       ir_ioapic_set_affinity(&desc->irq_data, mask, false);
                 else
-                       set_ioapic_affinity_irq_desc(desc, mask);
+                       ioapic_set_affinity(&desc->irq_data, mask, false);
         }
  
  }
@@ -4316,19 +4095,18 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
  void __init pre_init_apic_IRQ0(void)
  {
         struct irq_cfg *cfg;
-       struct irq_desc *desc;
  
         printk(KERN_INFO "Early APIC setup for system timer0\n");
  #ifndef CONFIG_SMP
         phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid);
  #endif
-       desc = irq_to_desc_alloc_node(0, 0);
+       /* Make sure the irq descriptor is set up */
+       cfg = alloc_irq_and_cfg_at(0, 0);
  
         setup_local_APIC();
  
-       cfg = irq_cfg(0);
         add_pin_to_irq_node(cfg, 0, 0, 0);
         set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge");
  
-       setup_IO_APIC_irq(0, 0, 0, desc, 0, 0);
+       setup_ioapic_irq(0, 0, 0, cfg, 0, 0);
  }
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c

index a43f71c..c90041c 100644 (file)
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -178,7 +178,7 @@ int __init check_nmi_watchdog(void)
  error:
         if (nmi_watchdog == NMI_IO_APIC) {
                 if (!timer_through_8259)
-                       legacy_pic->chip->mask(0);
+                       legacy_pic->mask(0);
                 on_each_cpu(__acpi_nmi_disable, NULL, 1);
         }
  
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c

index 83e9be4..f9e4e6a 100644 (file)
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -54,6 +54,9 @@ static int apicid_phys_pkg_id(int initial_apic_id, int index_msb)
   */
  void __init default_setup_apic_routing(void)
  {
+
+       enable_IR_x2apic();
+
  #ifdef CONFIG_X86_X2APIC
         if (x2apic_mode
  #ifdef CONFIG_X86_UV
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c

index ba5f62f..9e093f8 100644 (file)
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -148,7 +148,7 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
  {
  #ifdef CONFIG_SMP
         /* calling is from identify_secondary_cpu() ? */
-       if (c->cpu_index == boot_cpu_id)
+       if (!c->cpu_index)
                 return;
  
         /*
@@ -253,37 +253,51 @@ static int __cpuinit nearby_node(int apicid)
  #endif
  
  /*
- * Fixup core topology information for AMD multi-node processors.
- * Assumption: Number of cores in each internal node is the same.
+ * Fixup core topology information for
+ * (1) AMD multi-node processors
+ *     Assumption: Number of cores in each internal node is the same.
+ * (2) AMD processors supporting compute units
   */
  #ifdef CONFIG_X86_HT
-static void __cpuinit amd_fixup_dcm(struct cpuinfo_x86 *c)
+static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c)
  {
-       unsigned long long value;
-       u32 nodes, cores_per_node;
+       u32 nodes;
+       u8 node_id;
         int cpu = smp_processor_id();
  
-       if (!cpu_has(c, X86_FEATURE_NODEID_MSR))
-               return;
+       /* get information required for multi-node processors */
+       if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
+               u32 eax, ebx, ecx, edx;
  
-       /* fixup topology information only once for a core */
-       if (cpu_has(c, X86_FEATURE_AMD_DCM))
-               return;
+               cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
+               nodes = ((ecx >> 8) & 7) + 1;
+               node_id = ecx & 7;
  
-       rdmsrl(MSR_FAM10H_NODE_ID, value);
+               /* get compute unit information */
+               smp_num_siblings = ((ebx >> 8) & 3) + 1;
+               c->compute_unit_id = ebx & 0xff;
+       } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) {
+               u64 value;
  
-       nodes = ((value >> 3) & 7) + 1;
-       if (nodes == 1)
+               rdmsrl(MSR_FAM10H_NODE_ID, value);
+               nodes = ((value >> 3) & 7) + 1;
+               node_id = value & 7;
+       } else
                 return;
  
-       set_cpu_cap(c, X86_FEATURE_AMD_DCM);
-       cores_per_node = c->x86_max_cores / nodes;
+       /* fixup multi-node processor information */
+       if (nodes > 1) {
+               u32 cores_per_node;
+
+               set_cpu_cap(c, X86_FEATURE_AMD_DCM);
+               cores_per_node = c->x86_max_cores / nodes;
  
-       /* store NodeID, use llc_shared_map to store sibling info */
-       per_cpu(cpu_llc_id, cpu) = value & 7;
+               /* store NodeID, use llc_shared_map to store sibling info */
+               per_cpu(cpu_llc_id, cpu) = node_id;
  
-       /* fixup core id to be in range from 0 to (cores_per_node - 1) */
-       c->cpu_core_id = c->cpu_core_id % cores_per_node;
+               /* core id to be in range from 0 to (cores_per_node - 1) */
+               c->cpu_core_id = c->cpu_core_id % cores_per_node;
+       }
  }
  #endif
  
@@ -304,9 +318,7 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
         c->phys_proc_id = c->initial_apicid >> bits;
         /* use socket ID also for last level cache */
         per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
-       /* fixup topology information on multi-node processors */
-       if ((c->x86 == 0x10) && (c->x86_model == 9))
-               amd_fixup_dcm(c);
+       amd_get_topology(c);
  #endif
  }
  
@@ -412,6 +424,23 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
                         set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
         }
  #endif
+
+       /* We need to do the following only once */
+       if (c != &boot_cpu_data)
+               return;
+
+       if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) {
+
+               if (c->x86 > 0x10 ||
+                   (c->x86 == 0x10 && c->x86_model >= 0x2)) {
+                       u64 val;
+
+                       rdmsrl(MSR_K7_HWCR, val);
+                       if (!(val & BIT(24)))
+                               printk(KERN_WARNING FW_BUG "TSC doesn't count "
+                                       "with P0 frequency!\n");
+               }
+       }
  }
  
  static void __cpuinit init_amd(struct cpuinfo_x86 *c)
@@ -523,7 +552,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
  #endif
  
         if (c->extended_cpuid_level >= 0x80000006) {
-               if ((c->x86 >= 0x0f) && (cpuid_edx(0x80000006) & 0xf000))
+               if (cpuid_edx(0x80000006) & 0xf000)
                         num_cache_leaves = 4;
                 else
                         num_cache_leaves = 3;
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c

index f2f9ac7..4b68bda 100644 (file)
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -665,7 +665,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
                 this_cpu->c_early_init(c);
  
  #ifdef CONFIG_SMP
-       c->cpu_index = boot_cpu_id;
+       c->cpu_index = 0;
  #endif
         filter_cpuid_features(c, false);
  }
@@ -704,16 +704,21 @@ void __init early_cpu_init(void)
  }
  
  /*
- * The NOPL instruction is supposed to exist on all CPUs with
- * family >= 6; unfortunately, that's not true in practice because
- * of early VIA chips and (more importantly) broken virtualizers that
- * are not easy to detect.  In the latter case it doesn't even *fail*
- * reliably, so probing for it doesn't even work.  Disable it completely
+ * The NOPL instruction is supposed to exist on all CPUs of family >= 6;
+ * unfortunately, that's not true in practice because of early VIA
+ * chips and (more importantly) broken virtualizers that are not easy
+ * to detect. In the latter case it doesn't even *fail* reliably, so
+ * probing for it doesn't even work. Disable it completely on 32-bit
   * unless we can find a reliable way to detect all the broken cases.
+ * Enable it explicitly on 64-bit for non-constant inputs of cpu_has().
   */
  static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
  {
+#ifdef CONFIG_X86_32
         clear_cpu_cap(c, X86_FEATURE_NOPL);
+#else
+       set_cpu_cap(c, X86_FEATURE_NOPL);
+#endif
  }
  
  static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
@@ -1264,13 +1269,6 @@ void __cpuinit cpu_init(void)
         clear_all_debug_regs();
         dbg_restore_debug_regs();
  
-       /*
-        * Force FPU initialization:
-        */
-       current_thread_info()->status = 0;
-       clear_used_math();
-       mxcsr_feature_mask_init();
-
         fpu_init();
         xsave_init();
  }
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h

index f668bb1..e765633 100644 (file)
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -32,6 +32,7 @@ struct cpu_dev {
  extern const struct cpu_dev *const __x86_cpu_dev_start[],
                             *const __x86_cpu_dev_end[];
  
+extern void get_cpu_cap(struct cpuinfo_x86 *c);
  extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
  extern void get_cpu_cap(struct cpuinfo_x86 *c);
  
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c

index b438944..695f177 100644 (file)
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -170,7 +170,7 @@ static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c)
  {
  #ifdef CONFIG_SMP
         /* calling is from identify_secondary_cpu() ? */
-       if (c->cpu_index == boot_cpu_id)
+       if (!c->cpu_index)
                 return;
  
         /*
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c

index 898c2f4..12cd823 100644 (file)
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -17,7 +17,7 @@
  
  #include <asm/processor.h>
  #include <linux/smp.h>
-#include <asm/k8.h>
+#include <asm/amd_nb.h>
  #include <asm/smp.h>
  
  #define LVL_1_INST     1
@@ -306,7 +306,7 @@ struct _cache_attr {
         ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count);
  };
  
-#ifdef CONFIG_CPU_SUP_AMD
+#ifdef CONFIG_AMD_NB
  
  /*
   * L3 cache descriptors
@@ -369,7 +369,7 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf,
                         return;
  
         /* not in virtualized environments */
-       if (num_k8_northbridges == 0)
+       if (k8_northbridges.num == 0)
                 return;
  
         /*
@@ -377,7 +377,7 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf,
          * never freed but this is done only on shutdown so it doesn't matter.
          */
         if (!l3_caches) {
-               int size = num_k8_northbridges * sizeof(struct amd_l3_cache *);
+               int size = k8_northbridges.num * sizeof(struct amd_l3_cache *);
  
                 l3_caches = kzalloc(size, GFP_ATOMIC);
                 if (!l3_caches)
@@ -556,12 +556,12 @@ static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644,
  static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
                 show_cache_disable_1, store_cache_disable_1);
  
-#else  /* CONFIG_CPU_SUP_AMD */
+#else  /* CONFIG_AMD_NB */
  static void __cpuinit
  amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index)
  {
  };
-#endif /* CONFIG_CPU_SUP_AMD */
+#endif /* CONFIG_AMD_NB */
  
  static int
  __cpuinit cpuid4_cache_lookup_regs(int index,
@@ -1000,7 +1000,7 @@ static struct attribute *default_attrs[] = {
  
  static struct attribute *default_l3_attrs[] = {
         DEFAULT_SYSFS_CACHE_ATTRS,
-#ifdef CONFIG_CPU_SUP_AMD
+#ifdef CONFIG_AMD_NB
         &cache_disable_0.attr,
         &cache_disable_1.attr,
  #endif
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c

index 39aaee5..80c4823 100644 (file)
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -131,7 +131,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
         u32 low = 0, high = 0, address = 0;
         unsigned int bank, block;
         struct thresh_restart tr;
-       u8 lvt_off;
+       int lvt_off = -1;
+       u8 offset;
  
         for (bank = 0; bank < NR_BANKS; ++bank) {
                 for (block = 0; block < NR_BLOCKS; ++block) {
@@ -162,8 +163,28 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
                         if (shared_bank[bank] && c->cpu_core_id)
                                 break;
  #endif
-                       lvt_off = setup_APIC_eilvt_mce(THRESHOLD_APIC_VECTOR,
-                                                      APIC_EILVT_MSG_FIX, 0);
+                       offset = (high & MASK_LVTOFF_HI) >> 20;
+                       if (lvt_off < 0) {
+                               if (setup_APIC_eilvt(offset,
+                                                    THRESHOLD_APIC_VECTOR,
+                                                    APIC_EILVT_MSG_FIX, 0)) {
+                                       pr_err(FW_BUG "cpu %d, failed to "
+                                              "setup threshold interrupt "
+                                              "for bank %d, block %d "
+                                              "(MSR%08X=0x%x%08x)",
+                                              smp_processor_id(), bank, block,
+                                              address, high, low);
+                                       continue;
+                               }
+                               lvt_off = offset;
+                       } else if (lvt_off != offset) {
+                               pr_err(FW_BUG "cpu %d, invalid threshold "
+                                      "interrupt offset %d for bank %d,"
+                                      "block %d (MSR%08X=0x%x%08x)",
+                                      smp_processor_id(), lvt_off, bank,
+                                      block, address, high, low);
+                               continue;
+                       }
  
                         high &= ~MASK_LVTOFF_HI;
                         high |= lvt_off << 20;
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c

index 169d880..4b68326 100644 (file)
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -350,7 +350,7 @@ static void intel_thermal_interrupt(void)
  
  static void unexpected_thermal_interrupt(void)
  {
-       printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n",
+       printk(KERN_ERR "CPU%d: Unexpected LVT thermal interrupt!\n",
                         smp_processor_id());
         add_taint(TAINT_MACHINE_CHECK);
  }
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c

index c5f59d0..ac140c7 100644 (file)
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -827,7 +827,7 @@ int __init amd_special_default_mtrr(void)
  
         if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
                 return 0;
-       if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11)
+       if (boot_cpu_data.x86 < 0xf)
                 return 0;
         /* In case some hypervisor doesn't pass SYSCFG through: */
         if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0)
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c

index 7d28d7d..9f27228 100644 (file)
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -64,18 +64,59 @@ static inline void k8_check_syscfg_dram_mod_en(void)
         }
  }
  
+/* Get the size of contiguous MTRR range */
+static u64 get_mtrr_size(u64 mask)
+{
+       u64 size;
+
+       mask >>= PAGE_SHIFT;
+       mask |= size_or_mask;
+       size = -mask;
+       size <<= PAGE_SHIFT;
+       return size;
+}
+
  /*
- * Returns the effective MTRR type for the region
- * Error returns:
- * - 0xFE - when the range is "not entirely covered" by _any_ var range MTRR
- * - 0xFF - when MTRR is not enabled
+ * Resolve an MTRR-MTRR type overlap: the effective type is stored in both
+ * *prev and *curr. Returns 1 if the effective type is UNCACHABLE, else 0.
   */
-u8 mtrr_type_lookup(u64 start, u64 end)
+static int check_type_overlap(u8 *prev, u8 *curr)
+{
+       if (*prev == MTRR_TYPE_UNCACHABLE || *curr == MTRR_TYPE_UNCACHABLE) {
+               *prev = MTRR_TYPE_UNCACHABLE;
+               *curr = MTRR_TYPE_UNCACHABLE;
+               return 1;
+       }
+
+       if ((*prev == MTRR_TYPE_WRBACK && *curr == MTRR_TYPE_WRTHROUGH) ||
+           (*prev == MTRR_TYPE_WRTHROUGH && *curr == MTRR_TYPE_WRBACK)) {
+               *prev = MTRR_TYPE_WRTHROUGH;
+               *curr = MTRR_TYPE_WRTHROUGH;
+       }
+
+       if (*prev != *curr) {
+               *prev = MTRR_TYPE_UNCACHABLE;
+               *curr = MTRR_TYPE_UNCACHABLE;
+               return 1;
+       }
+
+       return 0;
+}
+
+/*
+ * Error/Semi-error returns:
+ * 0xFF - when MTRR is not enabled
+ * *repeat == 1 implies [start:end] spanned across MTRR range and type returned
+ *             corresponds only to [start:*partial_end].
+ *             Caller has to lookup again for [*partial_end:end].
+ */
+static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat)
  {
         int i;
         u64 base, mask;
         u8 prev_match, curr_match;
  
+       *repeat = 0;
         if (!mtrr_state_set)
                 return 0xFF;
  
@@ -126,8 +167,34 @@ u8 mtrr_type_lookup(u64 start, u64 end)
  
                 start_state = ((start & mask) == (base & mask));
                 end_state = ((end & mask) == (base & mask));
-               if (start_state != end_state)
-                       return 0xFE;
+
+               if (start_state != end_state) {
+                       /*
+                        * We have start:end spanning across an MTRR.
+                        * We split the region into
+                        * either
+                        * (start:mtrr_end) (mtrr_end:end)
+                        * or
+                        * (start:mtrr_start) (mtrr_start:end)
+                        * depending on kind of overlap.
+                        * Return the type for first region and a pointer to
+                        * the start of second region so that caller will
+                        * lookup again on the second region.
+                        * Note: This way we handle multiple overlaps as well.
+                        */
+                       if (start_state)
+                               *partial_end = base + get_mtrr_size(mask);
+                       else
+                               *partial_end = base;
+
+                       if (unlikely(*partial_end <= start)) {
+                               WARN_ON(1);
+                               *partial_end = start + PAGE_SIZE;
+                       }
+
+                       end = *partial_end - 1; /* end is inclusive */
+                       *repeat = 1;
+               }
  
                 if ((start & mask) != (base & mask))
                         continue;
@@ -138,21 +205,8 @@ u8 mtrr_type_lookup(u64 start, u64 end)
                         continue;
                 }
  
-               if (prev_match == MTRR_TYPE_UNCACHABLE ||
-                   curr_match == MTRR_TYPE_UNCACHABLE) {
-                       return MTRR_TYPE_UNCACHABLE;
-               }
-
-               if ((prev_match == MTRR_TYPE_WRBACK &&
-                    curr_match == MTRR_TYPE_WRTHROUGH) ||
-                   (prev_match == MTRR_TYPE_WRTHROUGH &&
-                    curr_match == MTRR_TYPE_WRBACK)) {
-                       prev_match = MTRR_TYPE_WRTHROUGH;
-                       curr_match = MTRR_TYPE_WRTHROUGH;
-               }
-
-               if (prev_match != curr_match)
-                       return MTRR_TYPE_UNCACHABLE;
+               if (check_type_overlap(&prev_match, &curr_match))
+                       return curr_match;
         }
  
         if (mtrr_tom2) {
@@ -166,6 +220,36 @@ u8 mtrr_type_lookup(u64 start, u64 end)
         return mtrr_state.def_type;
  }
  
+/*
+ * Returns the effective MTRR type for the region
+ * Error return:
+ * 0xFF - when MTRR is not enabled
+ */
+u8 mtrr_type_lookup(u64 start, u64 end)
+{
+       u8 type, prev_type;
+       int repeat;
+       u64 partial_end;
+
+       type = __mtrr_type_lookup(start, end, &partial_end, &repeat);
+
+       /*
+        * Common path is with repeat = 0.
+        * However, we can have cases where [start:end] spans across some
+        * MTRR range. Do repeated lookups for that case here.
+        */
+       while (repeat) {
+               prev_type = type;
+               start = partial_end;
+               type = __mtrr_type_lookup(start, end, &partial_end, &repeat);
+
+               if (check_type_overlap(&prev_type, &type))
+                       return type;
+       }
+
+       return type;
+}
+
  /* Get the MSR pair relating to a var range */
  static void
  get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr)
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c

index fb329e9..d9f4ff8 100644 (file)
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -700,11 +700,10 @@ static void probe_nmi_watchdog(void)
  {
         switch (boot_cpu_data.x86_vendor) {
         case X86_VENDOR_AMD:
-               if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 &&
-                   boot_cpu_data.x86 != 16 && boot_cpu_data.x86 != 17)
-                       return;
-               wd_ops = &k7_wd_ops;
-               break;
+               if (boot_cpu_data.x86 == 6 ||
+                   (boot_cpu_data.x86 >= 0xf && boot_cpu_data.x86 <= 0x15))
+                       wd_ops = &k7_wd_ops;
+               return;
         case X86_VENDOR_INTEL:
                 /* Work around where perfctr1 doesn't have a working enable
                  * bit as described in the following errata:
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c

index d490795..c7f64e6 100644 (file)
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -44,6 +44,12 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
                 { X86_FEATURE_LBRV,             CR_EDX, 1, 0x8000000a, 0 },
                 { X86_FEATURE_SVML,             CR_EDX, 2, 0x8000000a, 0 },
                 { X86_FEATURE_NRIPS,            CR_EDX, 3, 0x8000000a, 0 },
+               { X86_FEATURE_TSCRATEMSR,       CR_EDX, 4, 0x8000000a, 0 },
+               { X86_FEATURE_VMCBCLEAN,        CR_EDX, 5, 0x8000000a, 0 },
+               { X86_FEATURE_FLUSHBYASID,      CR_EDX, 6, 0x8000000a, 0 },
+               { X86_FEATURE_DECODEASSISTS,    CR_EDX, 7, 0x8000000a, 0 },
+               { X86_FEATURE_PAUSEFILTER,      CR_EDX,10, 0x8000000a, 0 },
+               { X86_FEATURE_PFTHRESHOLD,      CR_EDX,12, 0x8000000a, 0 },
                 { 0, 0, 0, 0, 0 }
         };
  
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c

index 045b36c..9948288 100644 (file)
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -34,7 +34,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
         if (!csize)
                 return 0;
  
-       vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE);
+       vaddr = ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE);
         if (!vaddr)
                 return -ENOMEM;
  
@@ -46,6 +46,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
         } else
                 memcpy(buf, vaddr + offset, csize);
  
+       set_iounmap_nonlazy();
         iounmap(vaddr);
         return csize;
  }
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c

index ebdb85c..76b8cd9 100644 (file)
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -96,7 +96,6 @@ static void __init nvidia_bugs(int num, int slot, int func)
  
  }
  
-#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
  #if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
  static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
  {
@@ -115,7 +114,6 @@ static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
         d &= 0xff;
         return d;
  }
-#endif
  
  static void __init ati_bugs(int num, int slot, int func)
  {
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c

index fa99bae..4572f25 100644 (file)
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -14,6 +14,7 @@
  #include <xen/hvc-console.h>
  #include <asm/pci-direct.h>
  #include <asm/fixmap.h>
+#include <asm/mrst.h>
  #include <asm/pgtable.h>
  #include <linux/usb/ehci_def.h>
  
@@ -238,6 +239,18 @@ static int __init setup_early_printk(char *buf)
  #ifdef CONFIG_HVC_XEN
                 if (!strncmp(buf, "xen", 3))
                         early_console_register(&xenboot_console, keep);
+#endif
+#ifdef CONFIG_X86_MRST_EARLY_PRINTK
+               if (!strncmp(buf, "mrst", 4)) {
+                       mrst_early_console_init();
+                       early_console_register(&early_mrst_console, keep);
+               }
+
+               if (!strncmp(buf, "hsu", 3)) {
+                       hsu_early_console_init();
+                       early_console_register(&early_hsu_console, keep);
+               }
+
  #endif
                 buf++;
         }
diff --git a/arch/x86/kernel/early_printk_mrst.c b/arch/x86/kernel/early_printk_mrst.c

new file mode 100644 (file)

index 0000000..65df603
--- /dev/null
+++ b/arch/x86/kernel/early_printk_mrst.c
@@ -0,0 +1,319 @@
+/*
+ * early_printk_mrst.c - early consoles for Intel MID platforms
+ *
+ * Copyright (c) 2008-2010, Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+/*
+ * This file implements two early consoles named mrst and hsu.
+ * mrst is based on Maxim3110 spi-uart device, it exists in both
+ * Moorestown and Medfield platforms, while hsu is based on a High
+ * Speed UART device which only exists in the Medfield platform
+ */
+
+#include <linux/serial_reg.h>
+#include <linux/serial_mfd.h>
+#include <linux/kmsg_dump.h>
+#include <linux/console.h>
+#include <linux/kernel.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/io.h>
+
+#include <asm/fixmap.h>
+#include <asm/pgtable.h>
+#include <asm/mrst.h>
+
+#define MRST_SPI_TIMEOUT               0x200000
+#define MRST_REGBASE_SPI0              0xff128000
+#define MRST_REGBASE_SPI1              0xff128400
+#define MRST_CLK_SPI0_REG              0xff11d86c
+
+/* Bit fields in CTRLR0 */
+#define SPI_DFS_OFFSET                 0
+
+#define SPI_FRF_OFFSET                 4
+#define SPI_FRF_SPI                    0x0
+#define SPI_FRF_SSP                    0x1
+#define SPI_FRF_MICROWIRE              0x2
+#define SPI_FRF_RESV                   0x3
+
+#define SPI_MODE_OFFSET                        6
+#define SPI_SCPH_OFFSET                        6
+#define SPI_SCOL_OFFSET                        7
+#define SPI_TMOD_OFFSET                        8
+#define        SPI_TMOD_TR                     0x0             /* xmit & recv */
+#define SPI_TMOD_TO                    0x1             /* xmit only */
+#define SPI_TMOD_RO                    0x2             /* recv only */
+#define SPI_TMOD_EPROMREAD             0x3             /* eeprom read mode */
+
+#define SPI_SLVOE_OFFSET               10
+#define SPI_SRL_OFFSET                 11
+#define SPI_CFS_OFFSET                 12
+
+/* Bit fields in SR, 7 bits */
+#define SR_MASK                                0x7f            /* cover 7 bits */
+#define SR_BUSY                                (1 << 0)
+#define SR_TF_NOT_FULL                 (1 << 1)
+#define SR_TF_EMPT                     (1 << 2)
+#define SR_RF_NOT_EMPT                 (1 << 3)
+#define SR_RF_FULL                     (1 << 4)
+#define SR_TX_ERR                      (1 << 5)
+#define SR_DCOL                                (1 << 6)
+
+struct dw_spi_reg {
+       u32     ctrl0;
+       u32     ctrl1;
+       u32     ssienr;
+       u32     mwcr;
+       u32     ser;
+       u32     baudr;
+       u32     txfltr;
+       u32     rxfltr;
+       u32     txflr;
+       u32     rxflr;
+       u32     sr;
+       u32     imr;
+       u32     isr;
+       u32     risr;
+       u32     txoicr;
+       u32     rxoicr;
+       u32     rxuicr;
+       u32     msticr;
+       u32     icr;
+       u32     dmacr;
+       u32     dmatdlr;
+       u32     dmardlr;
+       u32     idr;
+       u32     version;
+
+       /* Currently operates as 32 bits, though only the low 16 bits matter */
+       u32     dr;
+} __packed;
+
+#define dw_readl(dw, name)             __raw_readl(&(dw)->name)
+#define dw_writel(dw, name, val)       __raw_writel((val), &(dw)->name)
+
+/* Use the SPI0 registers by default; switch to SPI1 when Penwell is detected */
+static unsigned long mrst_spi_paddr = MRST_REGBASE_SPI0;
+
+static u32 *pclk_spi0;
+/* Always contains an accessible address, starts out as 0 */
+static struct dw_spi_reg *pspi;
+
+static struct kmsg_dumper dw_dumper;
+static int dumper_registered;
+
+static void dw_kmsg_dump(struct kmsg_dumper *dumper,
+                       enum kmsg_dump_reason reason,
+                       const char *s1, unsigned long l1,
+                       const char *s2, unsigned long l2)
+{
+       int i;
+
+       /* When we get here, we'd better re-init the HW */
+       mrst_early_console_init();
+
+       for (i = 0; i < l1; i++)
+               early_mrst_console.write(&early_mrst_console, s1 + i, 1);
+       for (i = 0; i < l2; i++)
+               early_mrst_console.write(&early_mrst_console, s2 + i, 1);
+}
+
+/* Set the baud rate to 115200, 8n1, IRQ disabled */
+static void max3110_write_config(void)
+{
+       u16 config;
+
+       config = 0xc001;
+       dw_writel(pspi, dr, config);
+}
+
+/* Translate char to an eligible word (bit 15 set, char in bits 0-7) and send to max3110 */
+static void max3110_write_data(char c)
+{
+       u16 data;
+
+       data = 0x8000 | (unsigned char)c; /* cast: a signed char >= 0x80 must not sign-extend into bits 8-14 */
+       dw_writel(pspi, dr, data);
+}
+
+void mrst_early_console_init(void)
+{
+       u32 ctrlr0 = 0;
+       u32 spi0_cdiv;
+       u32 freq; /* Frequency info only needs to be looked up once */
+
+       /* Base clk is 100 MHz, the actual clk = 100M / (clk_divider + 1) */
+       pclk_spi0 = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE,
+                                                       MRST_CLK_SPI0_REG);
+       spi0_cdiv = ((*pclk_spi0) & 0xe00) >> 9;
+       freq = 100000000 / (spi0_cdiv + 1);
+
+       if (mrst_identify_cpu() == MRST_CPU_CHIP_PENWELL)
+               mrst_spi_paddr = MRST_REGBASE_SPI1;
+
+       pspi = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE,
+                                               mrst_spi_paddr);
+
+       /* Disable SPI controller */
+       dw_writel(pspi, ssienr, 0);
+
+       /* Set control param: 16-bit frames (DFS = 0xf), transmit only mode */
+       ctrlr0 = dw_readl(pspi, ctrl0);
+
+       ctrlr0 &= 0xfcc0;
+       ctrlr0 |= 0xf | (SPI_FRF_SPI << SPI_FRF_OFFSET)
+                     | (SPI_TMOD_TO << SPI_TMOD_OFFSET);
+       dw_writel(pspi, ctrl0, ctrlr0);
+
+       /*
+        * Change the spi0 clk to comply with 115200 bps, use 100000 to
+        * calculate the clk divider to make the clock a little slower
+        * than real baud rate.
+        */
+       dw_writel(pspi, baudr, freq/100000);
+
+       /* Disable all INT for early phase */
+       dw_writel(pspi, imr, 0x0);
+
+       /* Set the cs to spi-uart */
+       dw_writel(pspi, ser, 0x2);
+
+       /* Enable the HW, the last step for HW init */
+       dw_writel(pspi, ssienr, 0x1);
+
+       /* Set the default configuration */
+       max3110_write_config();
+
+       /* Register the kmsg dumper */
+       if (!dumper_registered) {
+               dw_dumper.dump = dw_kmsg_dump;
+               kmsg_dump_register(&dw_dumper);
+               dumper_registered = 1;
+       }
+}
+
+/* Slave select should be called in the read/write function */
+static void early_mrst_spi_putc(char c)
+{
+       unsigned int timeout;
+       u32 sr;
+
+       timeout = MRST_SPI_TIMEOUT;
+       /* Early putc needs to make sure the TX FIFO is not full */
+       while (--timeout) {
+               sr = dw_readl(pspi, sr);
+               if (!(sr & SR_TF_NOT_FULL))
+                       cpu_relax();
+               else
+                       break;
+       }
+
+       if (!timeout)
+               pr_warning("MRST earlycon: timed out\n");
+       else
+               max3110_write_data(c);
+}
+
+/* Early SPI only uses polling mode */
+static void early_mrst_spi_write(struct console *con, const char *str, unsigned n)
+{
+       int i;
+
+       for (i = 0; i < n && *str; i++) {
+               if (*str == '\n')
+                       early_mrst_spi_putc('\r');
+               early_mrst_spi_putc(*str);
+               str++;
+       }
+}
+
+struct console early_mrst_console = {
+       .name =         "earlymrst",
+       .write =        early_mrst_spi_write,
+       .flags =        CON_PRINTBUFFER,
+       .index =        -1,
+};
+
+/*
+ * Following is the early console based on Medfield HSU (High
+ * Speed UART) device.
+ */
+#define HSU_PORT2_PADDR                0xffa28180
+
+static void __iomem *phsu;
+
+void hsu_early_console_init(void)
+{
+       u8 lcr;
+
+       phsu = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE,
+                                                       HSU_PORT2_PADDR);
+
+       /* Disable FIFO */
+       writeb(0x0, phsu + UART_FCR);
+
+       /* Set to default 115200 bps, 8n1 */
+       lcr = readb(phsu + UART_LCR);
+       writeb((0x80 | lcr), phsu + UART_LCR);
+       writeb(0x18, phsu + UART_DLL);
+       writeb(lcr,  phsu + UART_LCR);
+       writel(0x3600, phsu + UART_MUL*4);
+
+       writeb(0x8, phsu + UART_MCR);
+       writeb(0x7, phsu + UART_FCR);
+       writeb(0x3, phsu + UART_LCR);
+
+       /* Clear IRQ status */
+       readb(phsu + UART_LSR);
+       readb(phsu + UART_RX);
+       readb(phsu + UART_IIR);
+       readb(phsu + UART_MSR);
+
+       /* Enable FIFO */
+       writeb(0x7, phsu + UART_FCR);
+}
+
+#define BOTH_EMPTY (UART_LSR_TEMT | UART_LSR_THRE)
+
+static void early_hsu_putc(char ch)
+{
+       unsigned int timeout = 10000; /* 10ms */
+       u8 status;
+
+       while (--timeout) {
+               status = readb(phsu + UART_LSR);
+               if (status & BOTH_EMPTY)
+                       break;
+               udelay(1);
+       }
+
+       /* Only write the char when there was no timeout */
+       if (timeout)
+               writeb(ch, phsu + UART_TX);
+}
+
+static void early_hsu_write(struct console *con, const char *str, unsigned n)
+{
+       int i;
+
+       for (i = 0; i < n && *str; i++) {
+               if (*str == '\n')
+                       early_hsu_putc('\r');
+               early_hsu_putc(*str);
+               str++;
+       }
+}
+
+struct console early_hsu_console = {
+       .name =         "earlyhsu",
+       .write =        early_hsu_write,
+       .flags =        CON_PRINTBUFFER,
+       .index =        -1,
+};
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S

index 227d009..9fb188d 100644 (file)
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -115,8 +115,7 @@
  
   /* unfortunately push/pop can't be no-op */
  .macro PUSH_GS
-       pushl $0
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi $0
  .endm
  .macro POP_GS pop=0
         addl $(4 + \pop), %esp
@@ -140,14 +139,12 @@
  #else  /* CONFIG_X86_32_LAZY_GS */
  
  .macro PUSH_GS
-       pushl %gs
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi %gs
         /*CFI_REL_OFFSET gs, 0*/
  .endm
  
  .macro POP_GS pop=0
-98:    popl %gs
-       CFI_ADJUST_CFA_OFFSET -4
+98:    popl_cfi %gs
         /*CFI_RESTORE gs*/
    .if \pop <> 0
         add $\pop, %esp
@@ -195,35 +192,25 @@
  .macro SAVE_ALL
         cld
         PUSH_GS
-       pushl %fs
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi %fs
         /*CFI_REL_OFFSET fs, 0;*/
-       pushl %es
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi %es
         /*CFI_REL_OFFSET es, 0;*/
-       pushl %ds
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi %ds
         /*CFI_REL_OFFSET ds, 0;*/
-       pushl %eax
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi %eax
         CFI_REL_OFFSET eax, 0
-       pushl %ebp
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi %ebp
         CFI_REL_OFFSET ebp, 0
-       pushl %edi
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi %edi
         CFI_REL_OFFSET edi, 0
-       pushl %esi
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi %esi
         CFI_REL_OFFSET esi, 0
-       pushl %edx
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi %edx
         CFI_REL_OFFSET edx, 0
-       pushl %ecx
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi %ecx
         CFI_REL_OFFSET ecx, 0
-       pushl %ebx
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi %ebx
         CFI_REL_OFFSET ebx, 0
         movl $(__USER_DS), %edx
         movl %edx, %ds
@@ -234,39 +221,29 @@
  .endm
  
  .macro RESTORE_INT_REGS
-       popl %ebx
-       CFI_ADJUST_CFA_OFFSET -4
+       popl_cfi %ebx
         CFI_RESTORE ebx
-       popl %ecx
-       CFI_ADJUST_CFA_OFFSET -4
+       popl_cfi %ecx
         CFI_RESTORE ecx
-       popl %edx
-       CFI_ADJUST_CFA_OFFSET -4
+       popl_cfi %edx
         CFI_RESTORE edx
-       popl %esi
-       CFI_ADJUST_CFA_OFFSET -4
+       popl_cfi %esi
         CFI_RESTORE esi
-       popl %edi
-       CFI_ADJUST_CFA_OFFSET -4
+       popl_cfi %edi
         CFI_RESTORE edi
-       popl %ebp
-       CFI_ADJUST_CFA_OFFSET -4
+       popl_cfi %ebp
         CFI_RESTORE ebp
-       popl %eax
-       CFI_ADJUST_CFA_OFFSET -4
+       popl_cfi %eax
         CFI_RESTORE eax
  .endm
  
  .macro RESTORE_REGS pop=0
         RESTORE_INT_REGS
-1:     popl %ds
-       CFI_ADJUST_CFA_OFFSET -4
+1:     popl_cfi %ds
         /*CFI_RESTORE ds;*/
-2:     popl %es
-       CFI_ADJUST_CFA_OFFSET -4
+2:     popl_cfi %es
         /*CFI_RESTORE es;*/
-3:     popl %fs
-       CFI_ADJUST_CFA_OFFSET -4
+3:     popl_cfi %fs
         /*CFI_RESTORE fs;*/
         POP_GS \pop
  .pushsection .fixup, "ax"
@@ -320,16 +297,12 @@
  
  ENTRY(ret_from_fork)
         CFI_STARTPROC
-       pushl %eax
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi %eax
         call schedule_tail
         GET_THREAD_INFO(%ebp)
-       popl %eax
-       CFI_ADJUST_CFA_OFFSET -4
-       pushl $0x0202                   # Reset kernel eflags
-       CFI_ADJUST_CFA_OFFSET 4
-       popfl
-       CFI_ADJUST_CFA_OFFSET -4
+       popl_cfi %eax
+       pushl_cfi $0x0202               # Reset kernel eflags
+       popfl_cfi
         jmp syscall_exit
         CFI_ENDPROC
  END(ret_from_fork)
@@ -409,29 +382,23 @@ sysenter_past_esp:
          * enough kernel state to call TRACE_IRQS_OFF can be called - but
          * we immediately enable interrupts at that point anyway.
          */
-       pushl $(__USER_DS)
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi $(__USER_DS)
         /*CFI_REL_OFFSET ss, 0*/
-       pushl %ebp
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi %ebp
         CFI_REL_OFFSET esp, 0
-       pushfl
+       pushfl_cfi
         orl $X86_EFLAGS_IF, (%esp)
-       CFI_ADJUST_CFA_OFFSET 4
-       pushl $(__USER_CS)
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi $(__USER_CS)
         /*CFI_REL_OFFSET cs, 0*/
         /*
          * Push current_thread_info()->sysenter_return to the stack.
          * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
          * pushed above; +8 corresponds to copy_thread's esp0 setting.
          */
-       pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
         CFI_REL_OFFSET eip, 0
  
-       pushl %eax
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi %eax
         SAVE_ALL
         ENABLE_INTERRUPTS(CLBR_NONE)
  
@@ -486,8 +453,7 @@ sysenter_audit:
         movl %eax,%edx                  /* 2nd arg: syscall number */
         movl $AUDIT_ARCH_I386,%eax      /* 1st arg: audit arch */
         call audit_syscall_entry
-       pushl %ebx
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi %ebx
         movl PT_EAX(%esp),%eax          /* reload syscall number */
         jmp sysenter_do_call
  
@@ -529,8 +495,7 @@ ENDPROC(ia32_sysenter_target)
         # system call handler stub
  ENTRY(system_call)
         RING0_INT_FRAME                 # can't unwind into user space anyway
-       pushl %eax                      # save orig_eax
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi %eax                  # save orig_eax
         SAVE_ALL
         GET_THREAD_INFO(%ebp)
                                         # system call tracing in operation / emulation
@@ -566,7 +531,6 @@ restore_all_notrace:
         je ldt_ss                       # returning to user-space with LDT SS
  restore_nocheck:
         RESTORE_REGS 4                  # skip orig_eax/error_code
-       CFI_ADJUST_CFA_OFFSET -4
  irq_return:
         INTERRUPT_RETURN
  .section .fixup,"ax"
@@ -619,10 +583,8 @@ ldt_ss:
         shr $16, %edx
         mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */
         mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */
-       pushl $__ESPFIX_SS
-       CFI_ADJUST_CFA_OFFSET 4
-       push %eax                       /* new kernel esp */
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi $__ESPFIX_SS
+       pushl_cfi %eax                  /* new kernel esp */
         /* Disable interrupts, but do not irqtrace this section: we
          * will soon execute iret and the tracer was already set to
          * the irqstate after the iret */
@@ -666,11 +628,9 @@ work_notifysig:                            # deal with pending signals and
  
         ALIGN
  work_notifysig_v86:
-       pushl %ecx                      # save ti_flags for do_notify_resume
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi %ecx                  # save ti_flags for do_notify_resume
         call save_v86_state             # %eax contains pt_regs pointer
-       popl %ecx
-       CFI_ADJUST_CFA_OFFSET -4
+       popl_cfi %ecx
         movl %eax, %esp
  #else
         movl %esp, %eax
@@ -750,14 +710,18 @@ ptregs_##name: \
  #define PTREGSCALL3(name) \
         ALIGN; \
  ptregs_##name: \
+       CFI_STARTPROC; \
         leal 4(%esp),%eax; \
-       pushl %eax; \
+       pushl_cfi %eax; \
         movl PT_EDX(%eax),%ecx; \
         movl PT_ECX(%eax),%edx; \
         movl PT_EBX(%eax),%eax; \
         call sys_##name; \
         addl $4,%esp; \
-       ret
+       CFI_ADJUST_CFA_OFFSET -4; \
+       ret; \
+       CFI_ENDPROC; \
+ENDPROC(ptregs_##name)
  
  PTREGSCALL1(iopl)
  PTREGSCALL0(fork)
@@ -772,15 +736,19 @@ PTREGSCALL1(vm86old)
  /* Clone is an oddball.  The 4th arg is in %edi */
         ALIGN;
  ptregs_clone:
+       CFI_STARTPROC
         leal 4(%esp),%eax
-       pushl %eax
-       pushl PT_EDI(%eax)
+       pushl_cfi %eax
+       pushl_cfi PT_EDI(%eax)
         movl PT_EDX(%eax),%ecx
         movl PT_ECX(%eax),%edx
         movl PT_EBX(%eax),%eax
         call sys_clone
         addl $8,%esp
+       CFI_ADJUST_CFA_OFFSET -8
         ret
+       CFI_ENDPROC
+ENDPROC(ptregs_clone)
  
  .macro FIXUP_ESPFIX_STACK
  /*
@@ -795,10 +763,8 @@ ptregs_clone:
         mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */
         shl $16, %eax
         addl %esp, %eax                 /* the adjusted stack pointer */
-       pushl $__KERNEL_DS
-       CFI_ADJUST_CFA_OFFSET 4
-       pushl %eax
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi $__KERNEL_DS
+       pushl_cfi %eax
         lss (%esp), %esp                /* switch to the normal stack segment */
         CFI_ADJUST_CFA_OFFSET -8
  .endm
@@ -835,8 +801,7 @@ vector=FIRST_EXTERNAL_VECTOR
        .if vector <> FIRST_EXTERNAL_VECTOR
         CFI_ADJUST_CFA_OFFSET -4
        .endif
-1:     pushl $(~vector+0x80)   /* Note: always in signed byte range */
-       CFI_ADJUST_CFA_OFFSET 4
+1:     pushl_cfi $(~vector+0x80)       /* Note: always in signed byte range */
        .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
         jmp 2f
        .endif
@@ -876,8 +841,7 @@ ENDPROC(common_interrupt)
  #define BUILD_INTERRUPT3(name, nr, fn) \
  ENTRY(name)                            \
         RING0_INT_FRAME;                \
-       pushl $~(nr);                   \
-       CFI_ADJUST_CFA_OFFSET 4;        \
+       pushl_cfi $~(nr);               \
         SAVE_ALL;                       \
         TRACE_IRQS_OFF                  \
         movl %esp,%eax;                 \
@@ -893,21 +857,18 @@ ENDPROC(name)
  
  ENTRY(coprocessor_error)
         RING0_INT_FRAME
-       pushl $0
-       CFI_ADJUST_CFA_OFFSET 4
-       pushl $do_coprocessor_error
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi $0
+       pushl_cfi $do_coprocessor_error
         jmp error_code
         CFI_ENDPROC
  END(coprocessor_error)
  
  ENTRY(simd_coprocessor_error)
         RING0_INT_FRAME
-       pushl $0
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi $0
  #ifdef CONFIG_X86_INVD_BUG
         /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
-661:   pushl $do_general_protection
+661:   pushl_cfi $do_general_protection
  662:
  .section .altinstructions,"a"
         .balign 4
@@ -922,19 +883,16 @@ ENTRY(simd_coprocessor_error)
  664:
  .previous
  #else
-       pushl $do_simd_coprocessor_error
+       pushl_cfi $do_simd_coprocessor_error
  #endif
-       CFI_ADJUST_CFA_OFFSET 4
         jmp error_code
         CFI_ENDPROC
  END(simd_coprocessor_error)
  
  ENTRY(device_not_available)
         RING0_INT_FRAME
-       pushl $-1                       # mark this as an int
-       CFI_ADJUST_CFA_OFFSET 4
-       pushl $do_device_not_available
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi $-1                   # mark this as an int
+       pushl_cfi $do_device_not_available
         jmp error_code
         CFI_ENDPROC
  END(device_not_available)
@@ -956,82 +914,68 @@ END(native_irq_enable_sysexit)
  
  ENTRY(overflow)
         RING0_INT_FRAME
-       pushl $0
-       CFI_ADJUST_CFA_OFFSET 4
-       pushl $do_overflow
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi $0
+       pushl_cfi $do_overflow
         jmp error_code
         CFI_ENDPROC
  END(overflow)
  
  ENTRY(bounds)
         RING0_INT_FRAME
-       pushl $0
-       CFI_ADJUST_CFA_OFFSET 4
-       pushl $do_bounds
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi $0
+       pushl_cfi $do_bounds
         jmp error_code
         CFI_ENDPROC
  END(bounds)
  
  ENTRY(invalid_op)
         RING0_INT_FRAME
-       pushl $0
-       CFI_ADJUST_CFA_OFFSET 4
-       pushl $do_invalid_op
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi $0
+       pushl_cfi $do_invalid_op
         jmp error_code
         CFI_ENDPROC
  END(invalid_op)
  
  ENTRY(coprocessor_segment_overrun)
         RING0_INT_FRAME
-       pushl $0
-       CFI_ADJUST_CFA_OFFSET 4
-       pushl $do_coprocessor_segment_overrun
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi $0
+       pushl_cfi $do_coprocessor_segment_overrun
         jmp error_code
         CFI_ENDPROC
  END(coprocessor_segment_overrun)
  
  ENTRY(invalid_TSS)
         RING0_EC_FRAME
-       pushl $do_invalid_TSS
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi $do_invalid_TSS
         jmp error_code
         CFI_ENDPROC
  END(invalid_TSS)
  
  ENTRY(segment_not_present)
         RING0_EC_FRAME
-       pushl $do_segment_not_present
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi $do_segment_not_present
         jmp error_code
         CFI_ENDPROC
  END(segment_not_present)
  
  ENTRY(stack_segment)
         RING0_EC_FRAME
-       pushl $do_stack_segment
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi $do_stack_segment
         jmp error_code
         CFI_ENDPROC
  END(stack_segment)
  
  ENTRY(alignment_check)
         RING0_EC_FRAME
-       pushl $do_alignment_check
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi $do_alignment_check
         jmp error_code
         CFI_ENDPROC
  END(alignment_check)
  
  ENTRY(divide_error)
         RING0_INT_FRAME
-       pushl $0                        # no error code
-       CFI_ADJUST_CFA_OFFSET 4
-       pushl $do_divide_error
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi $0                    # no error code
+       pushl_cfi $do_divide_error
         jmp error_code
         CFI_ENDPROC
  END(divide_error)
@@ -1039,10 +983,8 @@ END(divide_error)
  #ifdef CONFIG_X86_MCE
  ENTRY(machine_check)
         RING0_INT_FRAME
-       pushl $0
-       CFI_ADJUST_CFA_OFFSET 4
-       pushl machine_check_vector
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi $0
+       pushl_cfi machine_check_vector
         jmp error_code
         CFI_ENDPROC
  END(machine_check)
@@ -1050,10 +992,8 @@ END(machine_check)
  
  ENTRY(spurious_interrupt_bug)
         RING0_INT_FRAME
-       pushl $0
-       CFI_ADJUST_CFA_OFFSET 4
-       pushl $do_spurious_interrupt_bug
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi $0
+       pushl_cfi $do_spurious_interrupt_bug
         jmp error_code
         CFI_ENDPROC
  END(spurious_interrupt_bug)
@@ -1084,8 +1024,7 @@ ENTRY(xen_sysenter_target)
  
  ENTRY(xen_hypervisor_callback)
         CFI_STARTPROC
-       pushl $0
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi $0
         SAVE_ALL
         TRACE_IRQS_OFF
  
@@ -1121,23 +1060,20 @@ ENDPROC(xen_hypervisor_callback)
  # We distinguish between categories by maintaining a status value in EAX.
  ENTRY(xen_failsafe_callback)
         CFI_STARTPROC
-       pushl %eax
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi %eax
         movl $1,%eax
  1:     mov 4(%esp),%ds
  2:     mov 8(%esp),%es
  3:     mov 12(%esp),%fs
  4:     mov 16(%esp),%gs
         testl %eax,%eax
-       popl %eax
-       CFI_ADJUST_CFA_OFFSET -4
+       popl_cfi %eax
         lea 16(%esp),%esp
         CFI_ADJUST_CFA_OFFSET -16
         jz 5f
         addl $16,%esp
         jmp iret_exc            # EAX != 0 => Category 2 (Bad IRET)
-5:     pushl $0                # EAX == 0 => Category 1 (Bad segment)
-       CFI_ADJUST_CFA_OFFSET 4
+5:     pushl_cfi $0            # EAX == 0 => Category 1 (Bad segment)
         SAVE_ALL
         jmp ret_from_exception
         CFI_ENDPROC
@@ -1287,40 +1223,29 @@ syscall_table_size=(.-sys_call_table)
  
  ENTRY(page_fault)
         RING0_EC_FRAME
-       pushl $do_page_fault
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi $do_page_fault
         ALIGN
  error_code:
         /* the function address is in %gs's slot on the stack */
-       pushl %fs
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi %fs
         /*CFI_REL_OFFSET fs, 0*/
-       pushl %es
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi %es
         /*CFI_REL_OFFSET es, 0*/
-       pushl %ds
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi %ds
         /*CFI_REL_OFFSET ds, 0*/
-       pushl %eax
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi %eax
         CFI_REL_OFFSET eax, 0
-       pushl %ebp
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi %ebp
         CFI_REL_OFFSET ebp, 0
-       pushl %edi
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi %edi
         CFI_REL_OFFSET edi, 0
-       pushl %esi
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi %esi
         CFI_REL_OFFSET esi, 0
-       pushl %edx
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi %edx
         CFI_REL_OFFSET edx, 0
-       pushl %ecx
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi %ecx
         CFI_REL_OFFSET ecx, 0
-       pushl %ebx
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi %ebx
         CFI_REL_OFFSET ebx, 0
         cld
         movl $(__KERNEL_PERCPU), %ecx
@@ -1362,12 +1287,9 @@ END(page_fault)
         movl TSS_sysenter_sp0 + \offset(%esp), %esp
         CFI_DEF_CFA esp, 0
         CFI_UNDEFINED eip
-       pushfl
-       CFI_ADJUST_CFA_OFFSET 4
-       pushl $__KERNEL_CS
-       CFI_ADJUST_CFA_OFFSET 4
-       pushl $sysenter_past_esp
-       CFI_ADJUST_CFA_OFFSET 4
+       pushfl_cfi
+       pushl_cfi $__KERNEL_CS
+       pushl_cfi $sysenter_past_esp
         CFI_REL_OFFSET eip, 0
  .endm
  
@@ -1377,8 +1299,7 @@ ENTRY(debug)
         jne debug_stack_correct
         FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn
  debug_stack_correct:
-       pushl $-1                       # mark this as an int
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi $-1                   # mark this as an int
         SAVE_ALL
         TRACE_IRQS_OFF
         xorl %edx,%edx                  # error code 0
@@ -1398,32 +1319,27 @@ END(debug)
   */
  ENTRY(nmi)
         RING0_INT_FRAME
-       pushl %eax
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi %eax
         movl %ss, %eax
         cmpw $__ESPFIX_SS, %ax
-       popl %eax
-       CFI_ADJUST_CFA_OFFSET -4
+       popl_cfi %eax
         je nmi_espfix_stack
         cmpl $ia32_sysenter_target,(%esp)
         je nmi_stack_fixup
-       pushl %eax
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi %eax
         movl %esp,%eax
         /* Do not access memory above the end of our stack page,
          * it might not exist.
          */
         andl $(THREAD_SIZE-1),%eax
         cmpl $(THREAD_SIZE-20),%eax
-       popl %eax
-       CFI_ADJUST_CFA_OFFSET -4
+       popl_cfi %eax
         jae nmi_stack_correct
         cmpl $ia32_sysenter_target,12(%esp)
         je nmi_debug_stack_check
  nmi_stack_correct:
         /* We have a RING0_INT_FRAME here */
-       pushl %eax
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi %eax
         SAVE_ALL
         xorl %edx,%edx          # zero error code
         movl %esp,%eax          # pt_regs pointer
@@ -1452,18 +1368,14 @@ nmi_espfix_stack:
          *
          * create the pointer to lss back
          */
-       pushl %ss
-       CFI_ADJUST_CFA_OFFSET 4
-       pushl %esp
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi %ss
+       pushl_cfi %esp
         addl $4, (%esp)
         /* copy the iret frame of 12 bytes */
         .rept 3
-       pushl 16(%esp)
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi 16(%esp)
         .endr
-       pushl %eax
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi %eax
         SAVE_ALL
         FIXUP_ESPFIX_STACK              # %eax == %esp
         xorl %edx,%edx                  # zero error code
@@ -1477,8 +1389,7 @@ END(nmi)
  
  ENTRY(int3)
         RING0_INT_FRAME
-       pushl $-1                       # mark this as an int
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi $-1                   # mark this as an int
         SAVE_ALL
         TRACE_IRQS_OFF
         xorl %edx,%edx          # zero error code
@@ -1490,8 +1401,7 @@ END(int3)
  
  ENTRY(general_protection)
         RING0_EC_FRAME
-       pushl $do_general_protection
-       CFI_ADJUST_CFA_OFFSET 4
+       pushl_cfi $do_general_protection
         jmp error_code
         CFI_ENDPROC
  END(general_protection)
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S

index c375c79..a7ae7fd 100644 (file)
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -213,23 +213,17 @@ ENDPROC(native_usergs_sysret64)
         .macro FAKE_STACK_FRAME child_rip
         /* push in order ss, rsp, eflags, cs, rip */
         xorl %eax, %eax
-       pushq $__KERNEL_DS /* ss */
-       CFI_ADJUST_CFA_OFFSET   8
+       pushq_cfi $__KERNEL_DS /* ss */
         /*CFI_REL_OFFSET        ss,0*/
-       pushq %rax /* rsp */
-       CFI_ADJUST_CFA_OFFSET   8
+       pushq_cfi %rax /* rsp */
         CFI_REL_OFFSET  rsp,0
-       pushq $X86_EFLAGS_IF /* eflags - interrupts on */
-       CFI_ADJUST_CFA_OFFSET   8
+       pushq_cfi $X86_EFLAGS_IF /* eflags - interrupts on */
         /*CFI_REL_OFFSET        rflags,0*/
-       pushq $__KERNEL_CS /* cs */
-       CFI_ADJUST_CFA_OFFSET   8
+       pushq_cfi $__KERNEL_CS /* cs */
         /*CFI_REL_OFFSET        cs,0*/
-       pushq \child_rip /* rip */
-       CFI_ADJUST_CFA_OFFSET   8
+       pushq_cfi \child_rip /* rip */
         CFI_REL_OFFSET  rip,0
-       pushq   %rax /* orig rax */
-       CFI_ADJUST_CFA_OFFSET   8
+       pushq_cfi %rax /* orig rax */
         .endm
  
         .macro UNFAKE_STACK_FRAME
@@ -398,10 +392,8 @@ ENTRY(ret_from_fork)
  
         LOCK ; btr $TIF_FORK,TI_flags(%r8)
  
-       push kernel_eflags(%rip)
-       CFI_ADJUST_CFA_OFFSET 8
-       popf                                    # reset kernel eflags
-       CFI_ADJUST_CFA_OFFSET -8
+       pushq_cfi kernel_eflags(%rip)
+       popfq_cfi                               # reset kernel eflags
  
         call schedule_tail                      # rdi: 'prev' task parameter
  
@@ -521,11 +513,9 @@ sysret_careful:
         jnc sysret_signal
         TRACE_IRQS_ON
         ENABLE_INTERRUPTS(CLBR_NONE)
-       pushq %rdi
-       CFI_ADJUST_CFA_OFFSET 8
+       pushq_cfi %rdi
         call schedule
-       popq  %rdi
-       CFI_ADJUST_CFA_OFFSET -8
+       popq_cfi %rdi
         jmp sysret_check
  
         /* Handle a signal */
@@ -634,11 +624,9 @@ int_careful:
         jnc  int_very_careful
         TRACE_IRQS_ON
         ENABLE_INTERRUPTS(CLBR_NONE)
-       pushq %rdi
-       CFI_ADJUST_CFA_OFFSET 8
+       pushq_cfi %rdi
         call schedule
-       popq %rdi
-       CFI_ADJUST_CFA_OFFSET -8
+       popq_cfi %rdi
         DISABLE_INTERRUPTS(CLBR_NONE)
         TRACE_IRQS_OFF
         jmp int_with_check
@@ -652,12 +640,10 @@ int_check_syscall_exit_work:
         /* Check for syscall exit trace */
         testl $_TIF_WORK_SYSCALL_EXIT,%edx
         jz int_signal
-       pushq %rdi
-       CFI_ADJUST_CFA_OFFSET 8
+       pushq_cfi %rdi
         leaq 8(%rsp),%rdi       # &ptregs -> arg1
         call syscall_trace_leave
-       popq %rdi
-       CFI_ADJUST_CFA_OFFSET -8
+       popq_cfi %rdi
         andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
         jmp int_restore_rest
  
@@ -714,9 +700,8 @@ END(ptregscall_common)
  
  ENTRY(stub_execve)
         CFI_STARTPROC
-       popq %r11
-       CFI_ADJUST_CFA_OFFSET -8
-       CFI_REGISTER rip, r11
+       addq $8, %rsp
+       PARTIAL_FRAME 0
         SAVE_REST
         FIXUP_TOP_OF_STACK %r11
         movq %rsp, %rcx
@@ -735,7 +720,7 @@ END(stub_execve)
  ENTRY(stub_rt_sigreturn)
         CFI_STARTPROC
         addq $8, %rsp
-       CFI_ADJUST_CFA_OFFSET   -8
+       PARTIAL_FRAME 0
         SAVE_REST
         movq %rsp,%rdi
         FIXUP_TOP_OF_STACK %r11
@@ -766,8 +751,7 @@ vector=FIRST_EXTERNAL_VECTOR
        .if vector <> FIRST_EXTERNAL_VECTOR
         CFI_ADJUST_CFA_OFFSET -8
        .endif
-1:     pushq $(~vector+0x80)   /* Note: always in signed byte range */
-       CFI_ADJUST_CFA_OFFSET 8
+1:     pushq_cfi $(~vector+0x80)       /* Note: always in signed byte range */
        .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
         jmp 2f
        .endif
@@ -796,8 +780,8 @@ END(interrupt)
  
  /* 0(%rsp): ~(interrupt number) */
         .macro interrupt func
-       subq $10*8, %rsp
-       CFI_ADJUST_CFA_OFFSET 10*8
+       subq $ORIG_RAX-ARGOFFSET+8, %rsp
+       CFI_ADJUST_CFA_OFFSET ORIG_RAX-ARGOFFSET+8
         call save_args
         PARTIAL_FRAME 0
         call \func
@@ -822,6 +806,7 @@ ret_from_intr:
         TRACE_IRQS_OFF
         decl PER_CPU_VAR(irq_count)
         leaveq
+       CFI_RESTORE             rbp
         CFI_DEF_CFA_REGISTER    rsp
         CFI_ADJUST_CFA_OFFSET   -8
  exit_intr:
@@ -903,11 +888,9 @@ retint_careful:
         jnc   retint_signal
         TRACE_IRQS_ON
         ENABLE_INTERRUPTS(CLBR_NONE)
-       pushq %rdi
-       CFI_ADJUST_CFA_OFFSET   8
+       pushq_cfi %rdi
         call  schedule
-       popq %rdi
-       CFI_ADJUST_CFA_OFFSET   -8
+       popq_cfi %rdi
         GET_THREAD_INFO(%rcx)
         DISABLE_INTERRUPTS(CLBR_NONE)
         TRACE_IRQS_OFF
@@ -956,8 +939,7 @@ END(common_interrupt)
  .macro apicinterrupt num sym do_sym
  ENTRY(\sym)
         INTR_FRAME
-       pushq $~(\num)
-       CFI_ADJUST_CFA_OFFSET 8
+       pushq_cfi $~(\num)
         interrupt \do_sym
         jmp ret_from_intr
         CFI_ENDPROC
@@ -1036,8 +1018,8 @@ ENTRY(\sym)
         INTR_FRAME
         PARAVIRT_ADJUST_EXCEPTION_FRAME
         pushq_cfi $-1           /* ORIG_RAX: no syscall to restart */
-       subq $15*8,%rsp
-       CFI_ADJUST_CFA_OFFSET 15*8
+       subq $ORIG_RAX-R15, %rsp
+       CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
         call error_entry
         DEFAULT_FRAME 0
         movq %rsp,%rdi          /* pt_regs pointer */
@@ -1052,9 +1034,9 @@ END(\sym)
  ENTRY(\sym)
         INTR_FRAME
         PARAVIRT_ADJUST_EXCEPTION_FRAME
-       pushq $-1               /* ORIG_RAX: no syscall to restart */
-       CFI_ADJUST_CFA_OFFSET 8
-       subq $15*8, %rsp
+       pushq_cfi $-1           /* ORIG_RAX: no syscall to restart */
+       subq $ORIG_RAX-R15, %rsp
+       CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
         call save_paranoid
         TRACE_IRQS_OFF
         movq %rsp,%rdi          /* pt_regs pointer */
@@ -1070,9 +1052,9 @@ END(\sym)
  ENTRY(\sym)
         INTR_FRAME
         PARAVIRT_ADJUST_EXCEPTION_FRAME
-       pushq $-1               /* ORIG_RAX: no syscall to restart */
-       CFI_ADJUST_CFA_OFFSET 8
-       subq $15*8, %rsp
+       pushq_cfi $-1           /* ORIG_RAX: no syscall to restart */
+       subq $ORIG_RAX-R15, %rsp
+       CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
         call save_paranoid
         TRACE_IRQS_OFF
         movq %rsp,%rdi          /* pt_regs pointer */
@@ -1089,8 +1071,8 @@ END(\sym)
  ENTRY(\sym)
         XCPT_FRAME
         PARAVIRT_ADJUST_EXCEPTION_FRAME
-       subq $15*8,%rsp
-       CFI_ADJUST_CFA_OFFSET 15*8
+       subq $ORIG_RAX-R15, %rsp
+       CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
         call error_entry
         DEFAULT_FRAME 0
         movq %rsp,%rdi                  /* pt_regs pointer */
@@ -1107,8 +1089,8 @@ END(\sym)
  ENTRY(\sym)
         XCPT_FRAME
         PARAVIRT_ADJUST_EXCEPTION_FRAME
-       subq $15*8,%rsp
-       CFI_ADJUST_CFA_OFFSET 15*8
+       subq $ORIG_RAX-R15, %rsp
+       CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
         call save_paranoid
         DEFAULT_FRAME 0
         TRACE_IRQS_OFF
@@ -1139,16 +1121,14 @@ zeroentry simd_coprocessor_error do_simd_coprocessor_error
         /* edi:  new selector */
  ENTRY(native_load_gs_index)
         CFI_STARTPROC
-       pushf
-       CFI_ADJUST_CFA_OFFSET 8
+       pushfq_cfi
         DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
         SWAPGS
  gs_change:
         movl %edi,%gs
  2:     mfence          /* workaround */
         SWAPGS
-       popf
-       CFI_ADJUST_CFA_OFFSET -8
+       popfq_cfi
         ret
         CFI_ENDPROC
  END(native_load_gs_index)
@@ -1215,8 +1195,7 @@ END(kernel_execve)
  /* Call softirq on interrupt stack. Interrupts are off. */
  ENTRY(call_softirq)
         CFI_STARTPROC
-       push %rbp
-       CFI_ADJUST_CFA_OFFSET   8
+       pushq_cfi %rbp
         CFI_REL_OFFSET rbp,0
         mov  %rsp,%rbp
         CFI_DEF_CFA_REGISTER rbp
@@ -1225,6 +1204,7 @@ ENTRY(call_softirq)
         push  %rbp                      # backlink for old unwinder
         call __do_softirq
         leaveq
+       CFI_RESTORE             rbp
         CFI_DEF_CFA_REGISTER    rsp
         CFI_ADJUST_CFA_OFFSET   -8
         decl PER_CPU_VAR(irq_count)
@@ -1368,7 +1348,7 @@ paranoidzeroentry machine_check *machine_check_vector(%rip)
  
         /* ebx: no swapgs flag */
  ENTRY(paranoid_exit)
-       INTR_FRAME
+       DEFAULT_FRAME
         DISABLE_INTERRUPTS(CLBR_NONE)
         TRACE_IRQS_OFF
         testl %ebx,%ebx                         /* swapgs needed? */
@@ -1445,7 +1425,6 @@ error_swapgs:
  error_sti:
         TRACE_IRQS_OFF
         ret
-       CFI_ENDPROC
  
  /*
   * There are two places in the kernel that can potentially fault with
@@ -1470,6 +1449,7 @@ bstep_iret:
         /* Fix truncated RIP */
         movq %rcx,RIP+8(%rsp)
         jmp error_swapgs
+       CFI_ENDPROC
  END(error_entry)
  
  
@@ -1498,8 +1478,8 @@ ENTRY(nmi)
         INTR_FRAME
         PARAVIRT_ADJUST_EXCEPTION_FRAME
         pushq_cfi $-1
-       subq $15*8, %rsp
-       CFI_ADJUST_CFA_OFFSET 15*8
+       subq $ORIG_RAX-R15, %rsp
+       CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
         call save_paranoid
         DEFAULT_FRAME 0
         /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c

index 7494999..efaf906 100644 (file)
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -440,9 +440,9 @@ static int hpet_legacy_next_event(unsigned long delta,
  static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev);
  static struct hpet_dev *hpet_devs;
  
-void hpet_msi_unmask(unsigned int irq)
+void hpet_msi_unmask(struct irq_data *data)
  {
-       struct hpet_dev *hdev = get_irq_data(irq);
+       struct hpet_dev *hdev = data->handler_data;
         unsigned int cfg;
  
         /* unmask it */
@@ -451,10 +451,10 @@ void hpet_msi_unmask(unsigned int irq)
         hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
  }
  
-void hpet_msi_mask(unsigned int irq)
+void hpet_msi_mask(struct irq_data *data)
  {
+       struct hpet_dev *hdev = data->handler_data;
         unsigned int cfg;
-       struct hpet_dev *hdev = get_irq_data(irq);
  
         /* mask it */
         cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
@@ -462,18 +462,14 @@ void hpet_msi_mask(unsigned int irq)
         hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
  }
  
-void hpet_msi_write(unsigned int irq, struct msi_msg *msg)
+void hpet_msi_write(struct hpet_dev *hdev, struct msi_msg *msg)
  {
-       struct hpet_dev *hdev = get_irq_data(irq);
-
         hpet_writel(msg->data, HPET_Tn_ROUTE(hdev->num));
         hpet_writel(msg->address_lo, HPET_Tn_ROUTE(hdev->num) + 4);
  }
  
-void hpet_msi_read(unsigned int irq, struct msi_msg *msg)
+void hpet_msi_read(struct hpet_dev *hdev, struct msi_msg *msg)
  {
-       struct hpet_dev *hdev = get_irq_data(irq);
-
         msg->data = hpet_readl(HPET_Tn_ROUTE(hdev->num));
         msg->address_lo = hpet_readl(HPET_Tn_ROUTE(hdev->num) + 4);
         msg->address_hi = 0;
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c

index a46cb35..58bb239 100644 (file)
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -68,19 +68,22 @@ static void __cpuinit init_thread_xstate(void)
          */
  
         if (!HAVE_HWFP) {
+               /*
+                * Disable xsave as we do not support it if i387
+                * emulation is enabled.
+                */
+               setup_clear_cpu_cap(X86_FEATURE_XSAVE);
+               setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
                 xstate_size = sizeof(struct i387_soft_struct);
                 return;
         }
  
         if (cpu_has_fxsr)
                 xstate_size = sizeof(struct i387_fxsave_struct);
-#ifdef CONFIG_X86_32
         else
                 xstate_size = sizeof(struct i387_fsave_struct);
-#endif
  }
  
-#ifdef CONFIG_X86_64
  /*
   * Called at bootup to set up the initial FPU state that is later cloned
   * into all processes.
@@ -88,12 +91,21 @@ static void __cpuinit init_thread_xstate(void)
  
  void __cpuinit fpu_init(void)
  {
-       unsigned long oldcr0 = read_cr0();
-
-       set_in_cr4(X86_CR4_OSFXSR);
-       set_in_cr4(X86_CR4_OSXMMEXCPT);
+       unsigned long cr0;
+       unsigned long cr4_mask = 0;
  
-       write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */
+       if (cpu_has_fxsr)
+               cr4_mask |= X86_CR4_OSFXSR;
+       if (cpu_has_xmm)
+               cr4_mask |= X86_CR4_OSXMMEXCPT;
+       if (cr4_mask)
+               set_in_cr4(cr4_mask);
+
+       cr0 = read_cr0();
+       cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */
+       if (!HAVE_HWFP)
+               cr0 |= X86_CR0_EM;
+       write_cr0(cr0);
  
         if (!smp_processor_id())
                 init_thread_xstate();
@@ -104,24 +116,12 @@ void __cpuinit fpu_init(void)
         clear_used_math();
  }
  
-#else  /* CONFIG_X86_64 */
-
-void __cpuinit fpu_init(void)
-{
-       if (!smp_processor_id())
-               init_thread_xstate();
-}
-
-#endif /* CONFIG_X86_32 */
-
  void fpu_finit(struct fpu *fpu)
  {
-#ifdef CONFIG_X86_32
         if (!HAVE_HWFP) {
                 finit_soft_fpu(&fpu->state->soft);
                 return;
         }
-#endif
  
         if (cpu_has_fxsr) {
                 struct i387_fxsave_struct *fx = &fpu->state->fxsave;
@@ -386,19 +386,17 @@ convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
  #ifdef CONFIG_X86_64
         env->fip = fxsave->rip;
         env->foo = fxsave->rdp;
+       /*
+        * should be actually ds/cs at fpu exception time, but
+        * that information is not available in 64bit mode.
+        */
+       env->fcs = task_pt_regs(tsk)->cs;
         if (tsk == current) {
-               /*
-                * should be actually ds/cs at fpu exception time, but
-                * that information is not available in 64bit mode.
-                */
-               asm("mov %%ds, %[fos]" : [fos] "=r" (env->fos));
-               asm("mov %%cs, %[fcs]" : [fcs] "=r" (env->fcs));
+               savesegment(ds, env->fos);
         } else {
-               struct pt_regs *regs = task_pt_regs(tsk);
-
-               env->fos = 0xffff0000 | tsk->thread.ds;
-               env->fcs = regs->cs;
+               env->fos = tsk->thread.ds;
         }
+       env->fos |= 0xffff0000;
  #else
         env->fip = fxsave->fip;
         env->fcs = (u16) fxsave->fcs | ((u32) fxsave->fop << 16);
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c

index cafa7c8..20757cb 100644 (file)
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -29,24 +29,10 @@
   * plus some generic x86 specific things if generic specifics makes
   * any sense at all.
   */
+static void init_8259A(int auto_eoi);
  
  static int i8259A_auto_eoi;
  DEFINE_RAW_SPINLOCK(i8259A_lock);
-static void mask_and_ack_8259A(unsigned int);
-static void mask_8259A(void);
-static void unmask_8259A(void);
-static void disable_8259A_irq(unsigned int irq);
-static void enable_8259A_irq(unsigned int irq);
-static void init_8259A(int auto_eoi);
-static int i8259A_irq_pending(unsigned int irq);
-
-struct irq_chip i8259A_chip = {
-       .name           = "XT-PIC",
-       .mask           = disable_8259A_irq,
-       .disable        = disable_8259A_irq,
-       .unmask         = enable_8259A_irq,
-       .mask_ack       = mask_and_ack_8259A,
-};
  
  /*
   * 8259A PIC functions to handle ISA devices:
@@ -68,7 +54,7 @@ unsigned int cached_irq_mask = 0xffff;
   */
  unsigned long io_apic_irqs;
  
-static void disable_8259A_irq(unsigned int irq)
+static void mask_8259A_irq(unsigned int irq)
  {
         unsigned int mask = 1 << irq;
         unsigned long flags;
@@ -82,7 +68,12 @@ static void disable_8259A_irq(unsigned int irq)
         raw_spin_unlock_irqrestore(&i8259A_lock, flags);
  }
  
-static void enable_8259A_irq(unsigned int irq)
+static void disable_8259A_irq(struct irq_data *data)
+{
+       mask_8259A_irq(data->irq);
+}
+
+static void unmask_8259A_irq(unsigned int irq)
  {
         unsigned int mask = ~(1 << irq);
         unsigned long flags;
@@ -96,6 +87,11 @@ static void enable_8259A_irq(unsigned int irq)
         raw_spin_unlock_irqrestore(&i8259A_lock, flags);
  }
  
+static void enable_8259A_irq(struct irq_data *data)
+{
+       unmask_8259A_irq(data->irq);
+}
+
  static int i8259A_irq_pending(unsigned int irq)
  {
         unsigned int mask = 1<<irq;
@@ -117,7 +113,7 @@ static void make_8259A_irq(unsigned int irq)
         disable_irq_nosync(irq);
         io_apic_irqs &= ~(1<<irq);
         set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq,
-                                     "XT");
+                                     i8259A_chip.name);
         enable_irq(irq);
  }
  
@@ -150,8 +146,9 @@ static inline int i8259A_irq_real(unsigned int irq)
   * first, _then_ send the EOI, and the order of EOI
   * to the two 8259s is important!
   */
-static void mask_and_ack_8259A(unsigned int irq)
+static void mask_and_ack_8259A(struct irq_data *data)
  {
+       unsigned int irq = data->irq;
         unsigned int irqmask = 1 << irq;
         unsigned long flags;
  
@@ -223,6 +220,14 @@ spurious_8259A_irq:
         }
  }
  
+struct irq_chip i8259A_chip = {
+       .name           = "XT-PIC",
+       .irq_mask       = disable_8259A_irq,
+       .irq_disable    = disable_8259A_irq,
+       .irq_unmask     = enable_8259A_irq,
+       .irq_mask_ack   = mask_and_ack_8259A,
+};
+
  static char irq_trigger[2];
  /**
   * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ
@@ -342,9 +347,9 @@ static void init_8259A(int auto_eoi)
                  * In AEOI mode we just have to mask the interrupt
                  * when acking.
                  */
-               i8259A_chip.mask_ack = disable_8259A_irq;
+               i8259A_chip.irq_mask_ack = disable_8259A_irq;
         else
-               i8259A_chip.mask_ack = mask_and_ack_8259A;
+               i8259A_chip.irq_mask_ack = mask_and_ack_8259A;
  
         udelay(100);            /* wait for 8259A to initialize */
  
@@ -363,14 +368,6 @@ static void init_8259A(int auto_eoi)
  static void legacy_pic_noop(void) { };
  static void legacy_pic_uint_noop(unsigned int unused) { };
  static void legacy_pic_int_noop(int unused) { };
-
-static struct irq_chip dummy_pic_chip  = {
-       .name = "dummy pic",
-       .mask = legacy_pic_uint_noop,
-       .unmask = legacy_pic_uint_noop,
-       .disable = legacy_pic_uint_noop,
-       .mask_ack = legacy_pic_uint_noop,
-};
  static int legacy_pic_irq_pending_noop(unsigned int irq)
  {
         return 0;
@@ -378,7 +375,9 @@ static int legacy_pic_irq_pending_noop(unsigned int irq)
  
  struct legacy_pic null_legacy_pic = {
         .nr_legacy_irqs = 0,
-       .chip = &dummy_pic_chip,
+       .chip = &dummy_irq_chip,
+       .mask = legacy_pic_uint_noop,
+       .unmask = legacy_pic_uint_noop,
         .mask_all = legacy_pic_noop,
         .restore_mask = legacy_pic_noop,
         .init = legacy_pic_int_noop,
@@ -389,7 +388,9 @@ struct legacy_pic null_legacy_pic = {
  struct legacy_pic default_legacy_pic = {
         .nr_legacy_irqs = NR_IRQS_LEGACY,
         .chip  = &i8259A_chip,
-       .mask_all  = mask_8259A,
+       .mask = mask_8259A_irq,
+       .unmask = unmask_8259A_irq,
+       .mask_all = mask_8259A,
         .restore_mask = unmask_8259A,
         .init = init_8259A,
         .irq_pending = i8259A_irq_pending,
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c

index 44edb03..83ec017 100644 (file)
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -159,7 +159,7 @@ int show_interrupts(struct seq_file *p, void *v)
         seq_printf(p, "%*d: ", prec, i);
         for_each_online_cpu(j)
                 seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
-       seq_printf(p, " %8s", desc->chip->name);
+       seq_printf(p, " %8s", desc->irq_data.chip->name);
         seq_printf(p, "-%-8s", desc->name);
  
         if (action) {
@@ -282,6 +282,7 @@ void fixup_irqs(void)
         unsigned int irq, vector;
         static int warned;
         struct irq_desc *desc;
+       struct irq_data *data;
  
         for_each_irq_desc(irq, desc) {
                 int break_affinity = 0;
@@ -296,7 +297,8 @@ void fixup_irqs(void)
                 /* interrupt's are disabled at this point */
                 raw_spin_lock(&desc->lock);
  
-               affinity = desc->affinity;
+               data = &desc->irq_data;
+               affinity = data->affinity;
                 if (!irq_has_action(irq) ||
                     cpumask_equal(affinity, cpu_online_mask)) {
                         raw_spin_unlock(&desc->lock);
@@ -315,16 +317,16 @@ void fixup_irqs(void)
                         affinity = cpu_all_mask;
                 }
  
-               if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->mask)
-                       desc->chip->mask(irq);
+               if (!(desc->status & IRQ_MOVE_PCNTXT) && data->chip->irq_mask)
+                       data->chip->irq_mask(data);
  
-               if (desc->chip->set_affinity)
-                       desc->chip->set_affinity(irq, affinity);
+               if (data->chip->irq_set_affinity)
+                       data->chip->irq_set_affinity(data, affinity, true);
                 else if (!(warned++))
                         set_affinity = 0;
  
-               if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->unmask)
-                       desc->chip->unmask(irq);
+               if (!(desc->status & IRQ_MOVE_PCNTXT) && data->chip->irq_unmask)
+                       data->chip->irq_unmask(data);
  
                 raw_spin_unlock(&desc->lock);
  
@@ -355,10 +357,12 @@ void fixup_irqs(void)
                 if (irr  & (1 << (vector % 32))) {
                         irq = __get_cpu_var(vector_irq)[vector];
  
                         desc = irq_to_desc(irq);
+                       /* use the descriptor that belongs to this irq */
+                       data = &desc->irq_data;
                         raw_spin_lock(&desc->lock);
-                       if (desc->chip->retrigger)
-                               desc->chip->retrigger(irq);
+                       if (data->chip->irq_retrigger)
+                               data->chip->irq_retrigger(data);
                         raw_spin_unlock(&desc->lock);
                 }
         }
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c

index 713969b..c752e97 100644 (file)
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -100,6 +100,8 @@ int vector_used_by_percpu_irq(unsigned int vector)
  
  void __init init_ISA_irqs(void)
  {
+       struct irq_chip *chip = legacy_pic->chip;
+       const char *name = chip->name;
         int i;
  
  #if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
@@ -107,19 +109,8 @@ void __init init_ISA_irqs(void)
  #endif
         legacy_pic->init(0);
  
-       /*
-        * 16 old-style INTA-cycle interrupts:
-        */
-       for (i = 0; i < legacy_pic->nr_legacy_irqs; i++) {
-               struct irq_desc *desc = irq_to_desc(i);
-
-               desc->status = IRQ_DISABLED;
-               desc->action = NULL;
-               desc->depth = 1;
-
-               set_irq_chip_and_handler_name(i, &i8259A_chip,
-                                             handle_level_irq, "XT");
-       }
+       for (i = 0; i < legacy_pic->nr_legacy_irqs; i++)
+               set_irq_chip_and_handler_name(i, chip, handle_level_irq, name);
  }
  
  void __init init_IRQ(void)
diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/k8.c

deleted file mode 100644 (file)

index 0f7bc20..0000000
--- a/arch/x86/kernel/k8.c
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- * Shared support code for AMD K8 northbridges and derivates.
- * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2.
- */
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/errno.h>
-#include <linux/module.h>
-#include <linux/spinlock.h>
-#include <asm/k8.h>
-
-int num_k8_northbridges;
-EXPORT_SYMBOL(num_k8_northbridges);
-
-static u32 *flush_words;
-
-struct pci_device_id k8_nb_ids[] = {
-       { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
-       { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
-       {}
-};
-EXPORT_SYMBOL(k8_nb_ids);
-
-struct pci_dev **k8_northbridges;
-EXPORT_SYMBOL(k8_northbridges);
-
-static struct pci_dev *next_k8_northbridge(struct pci_dev *dev)
-{
-       do {
-               dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
-               if (!dev)
-                       break;
-       } while (!pci_match_id(&k8_nb_ids[0], dev));
-       return dev;
-}
-
-int cache_k8_northbridges(void)
-{
-       int i;
-       struct pci_dev *dev;
-
-       if (num_k8_northbridges)
-               return 0;
-
-       dev = NULL;
-       while ((dev = next_k8_northbridge(dev)) != NULL)
-               num_k8_northbridges++;
-
-       k8_northbridges = kmalloc((num_k8_northbridges + 1) * sizeof(void *),
-                                 GFP_KERNEL);
-       if (!k8_northbridges)
-               return -ENOMEM;
-
-       if (!num_k8_northbridges) {
-               k8_northbridges[0] = NULL;
-               return 0;
-       }
-
-       flush_words = kmalloc(num_k8_northbridges * sizeof(u32), GFP_KERNEL);
-       if (!flush_words) {
-               kfree(k8_northbridges);
-               return -ENOMEM;
-       }
-
-       dev = NULL;
-       i = 0;
-       while ((dev = next_k8_northbridge(dev)) != NULL) {
-               k8_northbridges[i] = dev;
-               pci_read_config_dword(dev, 0x9c, &flush_words[i++]);
-       }
-       k8_northbridges[i] = NULL;
-       return 0;
-}
-EXPORT_SYMBOL_GPL(cache_k8_northbridges);
-
-/* Ignores subdevice/subvendor but as far as I can figure out
-   they're useless anyways */
-int __init early_is_k8_nb(u32 device)
-{
-       struct pci_device_id *id;
-       u32 vendor = device & 0xffff;
-       device >>= 16;
-       for (id = k8_nb_ids; id->vendor; id++)
-               if (vendor == id->vendor && device == id->device)
-                       return 1;
-       return 0;
-}
-
-void k8_flush_garts(void)
-{
-       int flushed, i;
-       unsigned long flags;
-       static DEFINE_SPINLOCK(gart_lock);
-
-       /* Avoid races between AGP and IOMMU. In theory it's not needed
-          but I'm not sure if the hardware won't lose flush requests
-          when another is pending. This whole thing is so expensive anyways
-          that it doesn't matter to serialize more. -AK */
-       spin_lock_irqsave(&gart_lock, flags);
-       flushed = 0;
-       for (i = 0; i < num_k8_northbridges; i++) {
-               pci_write_config_dword(k8_northbridges[i], 0x9c,
-                                      flush_words[i]|1);
-               flushed++;
-       }
-       for (i = 0; i < num_k8_northbridges; i++) {
-               u32 w;
-               /* Make sure the hardware actually executed the flush*/
-               for (;;) {
-                       pci_read_config_dword(k8_northbridges[i],
-                                             0x9c, &w);
-                       if (!(w & 1))
-                               break;
-                       cpu_relax();
-               }
-       }
-       spin_unlock_irqrestore(&gart_lock, flags);
-       if (!flushed)
-               printk("nothing to flush?\n");
-}
-EXPORT_SYMBOL_GPL(k8_flush_garts);
-
-static __init int init_k8_nbs(void)
-{
-       int err = 0;
-
-       err = cache_k8_northbridges();
-
-       if (err < 0)
-               printk(KERN_NOTICE "K8 NB: Cannot enumerate AMD northbridges.\n");
-
-       return err;
-}
-
-/* This has to go after the PCI subsystem */
-fs_initcall(init_k8_nbs);
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c

index 035c8c5..b3ea9db 100644 (file)
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -36,7 +36,7 @@ static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
                 if (!page)
                         goto out;
                 pud = (pud_t *)page_address(page);
-               memset(pud, 0, PAGE_SIZE);
+               clear_page(pud);
                 set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
         }
         pud = pud_offset(pgd, addr);
@@ -45,7 +45,7 @@ static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
                 if (!page)
                         goto out;
                 pmd = (pmd_t *)page_address(page);
-               memset(pmd, 0, PAGE_SIZE);
+               clear_page(pmd);
                 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
         }
         pmd = pmd_offset(pud, addr);
diff --git a/arch/x86/kernel/olpc-xo1.c b/arch/x86/kernel/olpc-xo1.c

new file mode 100644 (file)

index 0000000..f5442c0
--- /dev/null
+++ b/arch/x86/kernel/olpc-xo1.c
@@ -0,0 +1,140 @@
+/*
+ * Support for features of the OLPC XO-1 laptop
+ *
+ * Copyright (C) 2010 One Laptop per Child
+ * Copyright (C) 2006 Red Hat, Inc.
+ * Copyright (C) 2006 Advanced Micro Devices, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/pci_ids.h>
+#include <linux/platform_device.h>
+#include <linux/pm.h>
+
+#include <asm/io.h>
+#include <asm/olpc.h>
+
+#define DRV_NAME "olpc-xo1"
+
+#define PMS_BAR                4
+#define ACPI_BAR       5
+
+/* PMC registers (PMS block) */
+#define PM_SCLK                0x10
+#define PM_IN_SLPCTL   0x20
+#define PM_WKXD                0x34
+#define PM_WKD         0x30
+#define PM_SSC         0x54
+
+/* PM registers (ACPI block) */
+#define PM1_CNT                0x08
+#define PM_GPE0_STS    0x18
+
+static unsigned long acpi_base;
+static unsigned long pms_base;
+
+static void xo1_power_off(void)
+{
+       printk(KERN_INFO "OLPC XO-1 power off sequence...\n");
+
+       /* Enable all of these controls with 0 delay */
+       outl(0x40000000, pms_base + PM_SCLK);
+       outl(0x40000000, pms_base + PM_IN_SLPCTL);
+       outl(0x40000000, pms_base + PM_WKXD);
+       outl(0x40000000, pms_base + PM_WKD);
+
+       /* Clear status bits (possibly unnecessary) */
+       outl(0x0002ffff, pms_base  + PM_SSC);
+       outl(0xffffffff, acpi_base + PM_GPE0_STS);
+
+       /* Write SLP_EN bit to start the machinery */
+       outl(0x00002000, acpi_base + PM1_CNT);
+}
+
+/* Read the base addresses from the PCI BAR info */
+static int __devinit setup_bases(struct pci_dev *pdev)
+{
+       int r;
+
+       r = pci_enable_device_io(pdev);
+       if (r) {
+               dev_err(&pdev->dev, "can't enable device IO\n");
+               return r;
+       }
+
+       r = pci_request_region(pdev, ACPI_BAR, DRV_NAME);
+       if (r) {
+               dev_err(&pdev->dev, "can't alloc PCI BAR #%d\n", ACPI_BAR);
+               return r;
+       }
+
+       r = pci_request_region(pdev, PMS_BAR, DRV_NAME);
+       if (r) {
+               dev_err(&pdev->dev, "can't alloc PCI BAR #%d\n", PMS_BAR);
+               pci_release_region(pdev, ACPI_BAR);
+               return r;
+       }
+
+       acpi_base = pci_resource_start(pdev, ACPI_BAR);
+       pms_base = pci_resource_start(pdev, PMS_BAR);
+
+       return 0;
+}
+
+static int __devinit olpc_xo1_probe(struct platform_device *pdev)
+{
+       struct pci_dev *pcidev;
+       int r;
+
+       pcidev = pci_get_device(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA,
+                               NULL);
+       if (!pcidev)    /* validate the PCI lookup, not the platform device */
+               return -ENODEV;
+
+       r = setup_bases(pcidev);
+       if (r)
+               return r;
+       /* acpi_base/pms_base are set now, so the power-off hook is usable */
+       pm_power_off = xo1_power_off;
+
+       printk(KERN_INFO "OLPC XO-1 support registered\n");
+       return 0;
+}
+
+static int __devexit olpc_xo1_remove(struct platform_device *pdev)
+{
+       pm_power_off = NULL;
+       return 0;
+}
+
+static struct platform_driver olpc_xo1_driver = {
+       .driver = {
+               .name = DRV_NAME,
+               .owner = THIS_MODULE,
+       },
+       .probe = olpc_xo1_probe,
+       .remove = __devexit_p(olpc_xo1_remove),
+};
+
+static int __init olpc_xo1_init(void)
+{
+       return platform_driver_register(&olpc_xo1_driver);
+}
+
+static void __exit olpc_xo1_exit(void)
+{
+       platform_driver_unregister(&olpc_xo1_driver);
+}
+
+MODULE_AUTHOR("Daniel Drake <dsd@laptop.org>");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:olpc-xo1");
+
+module_init(olpc_xo1_init);
+module_exit(olpc_xo1_exit);
diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c

index 0e0cdde..edaf3fe 100644 (file)
--- a/arch/x86/kernel/olpc.c
+++ b/arch/x86/kernel/olpc.c
@@ -17,6 +17,7 @@
  #include <linux/spinlock.h>
  #include <linux/io.h>
  #include <linux/string.h>
+#include <linux/platform_device.h>
  
  #include <asm/geode.h>
  #include <asm/setup.h>
@@ -114,6 +115,7 @@ int olpc_ec_cmd(unsigned char cmd, unsigned char *inbuf, size_t inlen,
         unsigned long flags;
         int ret = -EIO;
         int i;
+       int restarts = 0;
  
         spin_lock_irqsave(&ec_lock, flags);
  
@@ -169,7 +171,9 @@ restart:
                         if (wait_on_obf(0x6c, 1)) {
                                 printk(KERN_ERR "olpc-ec:  timeout waiting for"
                                                 " EC to provide data!\n");
-                               goto restart;
+                               if (restarts++ < 10)
+                                       goto restart;
+                               goto err;
                         }
                         outbuf[i] = inb(0x68);
                         pr_devel("olpc-ec:  received 0x%x\n", outbuf[i]);
@@ -183,8 +187,21 @@ err:
  }
  EXPORT_SYMBOL_GPL(olpc_ec_cmd);
  
-#ifdef CONFIG_OLPC_OPENFIRMWARE
-static void __init platform_detect(void)
+static bool __init check_ofw_architecture(void)
+{
+       size_t propsize;
+       char olpc_arch[5];
+       const void *args[] = { NULL, "architecture", olpc_arch, (void *)5 };
+       void *res[] = { &propsize };
+
+       if (olpc_ofw("getprop", args, res)) {
+               printk(KERN_ERR "ofw: getprop call failed!\n");
+               return false;
+       }
+       return propsize == 5 && strncmp("OLPC", olpc_arch, 5) == 0;
+}
+
+static u32 __init get_board_revision(void)
  {
         size_t propsize;
         __be32 rev;
@@ -193,45 +210,43 @@ static void __init platform_detect(void)
  
         if (olpc_ofw("getprop", args, res) || propsize != 4) {
                 printk(KERN_ERR "ofw: getprop call failed!\n");
-               rev = cpu_to_be32(0);
+               return cpu_to_be32(0);
         }
-       olpc_platform_info.boardrev = be32_to_cpu(rev);
+       return be32_to_cpu(rev);
  }
-#else
-static void __init platform_detect(void)
+
+static bool __init platform_detect(void)
  {
-       /* stopgap until OFW support is added to the kernel */
-       olpc_platform_info.boardrev = olpc_board(0xc2);
+       if (!check_ofw_architecture())
+               return false;
+       olpc_platform_info.flags |= OLPC_F_PRESENT;
+       olpc_platform_info.boardrev = get_board_revision();
+       return true;
  }
-#endif
  
-static int __init olpc_init(void)
+static int __init add_xo1_platform_devices(void)
  {
-       unsigned char *romsig;
+       struct platform_device *pdev;
  
-       /* The ioremap check is dangerous; limit what we run it on */
-       if (!is_geode() || cs5535_has_vsa2())
-               return 0;
+       pdev = platform_device_register_simple("xo1-rfkill", -1, NULL, 0);
+       if (IS_ERR(pdev))
+               return PTR_ERR(pdev);
  
-       spin_lock_init(&ec_lock);
+       pdev = platform_device_register_simple("olpc-xo1", -1, NULL, 0);
+       if (IS_ERR(pdev))
+               return PTR_ERR(pdev);
  
-       romsig = ioremap(0xffffffc0, 16);
-       if (!romsig)
-               return 0;
+       return 0;
+}
  
-       if (strncmp(romsig, "CL1   Q", 7))
-               goto unmap;
-       if (strncmp(romsig+6, romsig+13, 3)) {
-               printk(KERN_INFO "OLPC BIOS signature looks invalid.  "
-                               "Assuming not OLPC\n");
-               goto unmap;
-       }
+static int __init olpc_init(void)
+{
+       int r = 0;
  
-       printk(KERN_INFO "OLPC board with OpenFirmware %.16s\n", romsig);
-       olpc_platform_info.flags |= OLPC_F_PRESENT;
+       if (!olpc_ofw_present() || !platform_detect())
+               return 0;
  
-       /* get the platform revision */
-       platform_detect();
+       spin_lock_init(&ec_lock);
  
         /* assume B1 and above models always have a DCON */
         if (olpc_board_at_least(olpc_board(0xb1)))
@@ -242,8 +257,10 @@ static int __init olpc_init(void)
                         (unsigned char *) &olpc_platform_info.ecver, 1);
  
  #ifdef CONFIG_PCI_OLPC
-       /* If the VSA exists let it emulate PCI, if not emulate in kernel */
-       if (!cs5535_has_vsa2())
+       /* If the VSA exists let it emulate PCI, if not emulate in kernel.
+        * XO-1 only. */
+       if (olpc_platform_info.boardrev < olpc_board_pre(0xd0) &&
+                       !cs5535_has_vsa2())
                 x86_init.pci.arch_init = pci_olpc_init;
  #endif
  
@@ -252,8 +269,12 @@ static int __init olpc_init(void)
                         olpc_platform_info.boardrev >> 4,
                         olpc_platform_info.ecver);
  
-unmap:
-       iounmap(romsig);
+       if (olpc_platform_info.boardrev < olpc_board_pre(0xd0)) { /* XO-1 */
+               r = add_xo1_platform_devices();
+               if (r)
+                       return r;
+       }
+
         return 0;
  }
  
diff --git a/arch/x86/kernel/olpc_ofw.c b/arch/x86/kernel/olpc_ofw.c

index 3218aa7..7873204 100644 (file)
--- a/arch/x86/kernel/olpc_ofw.c
+++ b/arch/x86/kernel/olpc_ofw.c
@@ -74,6 +74,12 @@ int __olpc_ofw(const char *name, int nr_args, const void **args, int nr_res,
  }
  EXPORT_SYMBOL_GPL(__olpc_ofw);
  
+bool olpc_ofw_present(void)
+{
+       return olpc_ofw_cif != NULL;
+}
+EXPORT_SYMBOL_GPL(olpc_ofw_present);
+
  /* OFW cif _should_ be above this address */
  #define OFW_MIN 0xff000000
  
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c

index 1db183e..c5b2500 100644 (file)
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -413,7 +413,6 @@ struct pv_mmu_ops pv_mmu_ops = {
  
         .alloc_pte = paravirt_nop,
         .alloc_pmd = paravirt_nop,
-       .alloc_pmd_clone = paravirt_nop,
         .alloc_pud = paravirt_nop,
         .release_pte = paravirt_nop,
         .release_pmd = paravirt_nop,
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c

index 0f7f130..c562207 100644 (file)
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -39,7 +39,7 @@
  #include <asm/cacheflush.h>
  #include <asm/swiotlb.h>
  #include <asm/dma.h>
-#include <asm/k8.h>
+#include <asm/amd_nb.h>
  #include <asm/x86_init.h>
  
  static unsigned long iommu_bus_base;   /* GART remapping area (physical) */
@@ -560,8 +560,11 @@ static void enable_gart_translations(void)
  {
         int i;
  
-       for (i = 0; i < num_k8_northbridges; i++) {
-               struct pci_dev *dev = k8_northbridges[i];
+       if (!k8_northbridges.gart_supported)
+               return;
+
+       for (i = 0; i < k8_northbridges.num; i++) {
+               struct pci_dev *dev = k8_northbridges.nb_misc[i];
  
                 enable_gart_translation(dev, __pa(agp_gatt_table));
         }
@@ -592,16 +595,19 @@ static void gart_fixup_northbridges(struct sys_device *dev)
         if (!fix_up_north_bridges)
                 return;
  
+       if (!k8_northbridges.gart_supported)
+               return;
+
         pr_info("PCI-DMA: Restoring GART aperture settings\n");
  
-       for (i = 0; i < num_k8_northbridges; i++) {
-               struct pci_dev *dev = k8_northbridges[i];
+       for (i = 0; i < k8_northbridges.num; i++) {
+               struct pci_dev *dev = k8_northbridges.nb_misc[i];
  
                 /*
                  * Don't enable translations just yet.  That is the next
                  * step.  Restore the pre-suspend aperture settings.
                  */
-               pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, aperture_order << 1);
+               gart_set_size_and_enable(dev, aperture_order);
                 pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25);
         }
  }
@@ -649,8 +655,8 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
  
         aper_size = aper_base = info->aper_size = 0;
         dev = NULL;
-       for (i = 0; i < num_k8_northbridges; i++) {
-               dev = k8_northbridges[i];
+       for (i = 0; i < k8_northbridges.num; i++) {
+               dev = k8_northbridges.nb_misc[i];
                 new_aper_base = read_aperture(dev, &new_aper_size);
                 if (!new_aper_base)
                         goto nommu;
@@ -718,10 +724,13 @@ static void gart_iommu_shutdown(void)
         if (!no_agp)
                 return;
  
-       for (i = 0; i < num_k8_northbridges; i++) {
+       if (!k8_northbridges.gart_supported)
+               return;
+
+       for (i = 0; i < k8_northbridges.num; i++) {
                 u32 ctl;
  
-               dev = k8_northbridges[i];
+               dev = k8_northbridges.nb_misc[i];
                 pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl);
  
                 ctl &= ~GARTEN;
@@ -739,7 +748,7 @@ int __init gart_iommu_init(void)
         unsigned long scratch;
         long i;
  
-       if (num_k8_northbridges == 0)
+       if (!k8_northbridges.gart_supported)
                 return 0;
  
  #ifndef CONFIG_AGP_AMD64
diff --git a/arch/x86/kernel/pmtimer_64.c b/arch/x86/kernel/pmtimer_64.c

deleted file mode 100644 (file)

index b112406..0000000
--- a/arch/x86/kernel/pmtimer_64.c
+++ /dev/null
@@ -1,69 +0,0 @@
-/* Ported over from i386 by AK, original copyright was:
- *
- * (C) Dominik Brodowski <linux@brodo.de> 2003
- *
- * Driver to use the Power Management Timer (PMTMR) available in some
- * southbridges as primary timing source for the Linux kernel.
- *
- * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c,
- * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4.
- *
- * This file is licensed under the GPL v2.
- *
- * Dropped all the hardware bug workarounds for now. Hopefully they
- * are not needed on 64bit chipsets.
- */
-
-#include <linux/jiffies.h>
-#include <linux/kernel.h>
-#include <linux/time.h>
-#include <linux/init.h>
-#include <linux/cpumask.h>
-#include <linux/acpi_pmtmr.h>
-
-#include <asm/io.h>
-#include <asm/proto.h>
-#include <asm/msr.h>
-#include <asm/vsyscall.h>
-
-static inline u32 cyc2us(u32 cycles)
-{
-       /* The Power Management Timer ticks at 3.579545 ticks per microsecond.
-        * 1 / PM_TIMER_FREQUENCY == 0.27936511 =~ 286/1024 [error: 0.024%]
-        *
-        * Even with HZ = 100, delta is at maximum 35796 ticks, so it can
-        * easily be multiplied with 286 (=0x11E) without having to fear
-        * u32 overflows.
-        */
-       cycles *= 286;
-       return (cycles >> 10);
-}
-
-static unsigned pmtimer_wait_tick(void)
-{
-       u32 a, b;
-       for (a = b = inl(pmtmr_ioport) & ACPI_PM_MASK;
-            a == b;
-            b = inl(pmtmr_ioport) & ACPI_PM_MASK)
-               cpu_relax();
-       return b;
-}
-
-/* note: wait time is rounded up to one tick */
-void pmtimer_wait(unsigned us)
-{
-       u32 a, b;
-       a = pmtimer_wait_tick();
-       do {
-               b = inl(pmtmr_ioport);
-               cpu_relax();
-       } while (cyc2us(b - a) < us);
-}
-
-static int __init nopmtimer_setup(char *s)
-{
-       pmtmr_ioport = 0;
-       return 1;
-}
-
-__setup("nopmtimer", nopmtimer_setup);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c

index 3d9ea53..b3d7a3a 100644 (file)
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -424,7 +424,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
         load_TLS(next, cpu);
  
         /* Must be after DS reload */
-       unlazy_fpu(prev_p);
+       __unlazy_fpu(prev_p);
  
         /* Make sure cpu is ready for new context */
         if (preload_fpu)
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c

index e3af342..7a4cf14 100644 (file)
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -84,7 +84,7 @@ static int __init reboot_setup(char *str)
                         }
                                 /* we will leave sorting out the final value
                                    when we are ready to reboot, since we might not
-                                  have set up boot_cpu_id or smp_num_cpu */
+                                  have detected BSP APIC ID or smp_num_cpu */
                         break;
  #endif /* CONFIG_SMP */
  
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c

index 00e1678..a59f6a6 100644 (file)
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -83,7 +83,6 @@
  #include <asm/dmi.h>
  #include <asm/io_apic.h>
  #include <asm/ist.h>
-#include <asm/vmi.h>
  #include <asm/setup_arch.h>
  #include <asm/bios_ebda.h>
  #include <asm/cacheflush.h>
@@ -107,7 +106,7 @@
  #include <asm/percpu.h>
  #include <asm/topology.h>
  #include <asm/apicdef.h>
-#include <asm/k8.h>
+#include <asm/amd_nb.h>
  #ifdef CONFIG_X86_64
  #include <asm/numa_64.h>
  #endif
@@ -126,7 +125,6 @@ unsigned long max_pfn_mapped;
  RESERVE_BRK(dmi_alloc, 65536);
  #endif
  
-unsigned int boot_cpu_id __read_mostly;
  
  static __initdata unsigned long _brk_start = (unsigned long)__brk_base;
  unsigned long _brk_end = (unsigned long)__brk_base;
@@ -619,79 +617,7 @@ static __init void reserve_ibft_region(void)
                 reserve_early_overlap_ok(addr, addr + size, "ibft");
  }
  
-#ifdef CONFIG_X86_RESERVE_LOW_64K
-static int __init dmi_low_memory_corruption(const struct dmi_system_id *d)
-{
-       printk(KERN_NOTICE
-               "%s detected: BIOS may corrupt low RAM, working around it.\n",
-               d->ident);
-
-       e820_update_range(0, 0x10000, E820_RAM, E820_RESERVED);
-       sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
-
-       return 0;
-}
-#endif
-
-/* List of systems that have known low memory corruption BIOS problems */
-static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
-#ifdef CONFIG_X86_RESERVE_LOW_64K
-       {
-               .callback = dmi_low_memory_corruption,
-               .ident = "AMI BIOS",
-               .matches = {
-                       DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."),
-               },
-       },
-       {
-               .callback = dmi_low_memory_corruption,
-               .ident = "Phoenix BIOS",
-               .matches = {
-                       DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies"),
-               },
-       },
-       {
-               .callback = dmi_low_memory_corruption,
-               .ident = "Phoenix/MSC BIOS",
-               .matches = {
-                       DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix/MSC"),
-               },
-       },
-       /*
-        * AMI BIOS with low memory corruption was found on Intel DG45ID and
-        * DG45FC boards.
-        * It has a different DMI_BIOS_VENDOR = "Intel Corp.", for now we will
-        * match only DMI_BOARD_NAME and see if there is more bad products
-        * with this vendor.
-        */
-       {
-               .callback = dmi_low_memory_corruption,
-               .ident = "AMI BIOS",
-               .matches = {
-                       DMI_MATCH(DMI_BOARD_NAME, "DG45ID"),
-               },
-       },
-       {
-               .callback = dmi_low_memory_corruption,
-               .ident = "AMI BIOS",
-               .matches = {
-                       DMI_MATCH(DMI_BOARD_NAME, "DG45FC"),
-               },
-       },
-       /*
-        * The Dell Inspiron Mini 1012 has DMI_BIOS_VENDOR = "Dell Inc.", so
-        * match on the product name.
-        */
-       {
-               .callback = dmi_low_memory_corruption,
-               .ident = "Phoenix BIOS",
-               .matches = {
-                       DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 1012"),
-               },
-       },
-#endif
-       {}
-};
+static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10;
  
  static void __init trim_bios_range(void)
  {
@@ -699,8 +625,14 @@ static void __init trim_bios_range(void)
          * A special case is the first 4Kb of memory;
          * This is a BIOS owned area, not kernel ram, but generally
          * not listed as such in the E820 table.
+        *
+        * This typically reserves additional memory (64KiB by default)
+        * since some BIOSes are known to corrupt low memory.  See the
+        * Kconfig help text for X86_RESERVE_LOW.
          */
-       e820_update_range(0, PAGE_SIZE, E820_RAM, E820_RESERVED);
+       e820_update_range(0, ALIGN(reserve_low, PAGE_SIZE),
+                         E820_RAM, E820_RESERVED);
+
         /*
          * special case: Some BIOSen report the PC BIOS
          * area (640->1Mb) as ram even though it is not.
@@ -710,6 +642,28 @@ static void __init trim_bios_range(void)
         sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
  }
  
+static int __init parse_reservelow(char *p)
+{
+       unsigned long long size;
+
+       if (!p)
+               return -EINVAL;
+
+       size = memparse(p, &p);
+
+       if (size < 4096)
+               size = 4096;
+
+       if (size > 640*1024)
+               size = 640*1024;
+
+       reserve_low = size;
+
+       return 0;
+}
+
+early_param("reservelow", parse_reservelow);
+
  /*
   * Determine if we were loaded by an EFI loader.  If so, then we have also been
   * passed the efi memmap, systab, etc., so we should use these data structures
@@ -736,10 +690,10 @@ void __init setup_arch(char **cmdline_p)
         printk(KERN_INFO "Command line: %s\n", boot_command_line);
  #endif
  
-       /* VMI may relocate the fixmap; do this before touching ioremap area */
-       vmi_init();
-
-       /* OFW also may relocate the fixmap */
+       /*
+        * If we have OLPC OFW, we might end up relocating the fixmap due to
+        * reserve_top(), so do this before touching the ioremap area.
+        */
         olpc_ofw_detect();
  
         early_trap_init();
@@ -840,9 +794,6 @@ void __init setup_arch(char **cmdline_p)
  
         x86_report_nx();
  
-       /* Must be before kernel pagetables are setup */
-       vmi_activate();
-
         /* after early param, so could get panic from serial */
         reserve_early_setup_data();
  
@@ -865,8 +816,6 @@ void __init setup_arch(char **cmdline_p)
  
         dmi_scan_machine();
  
-       dmi_check_system(bad_bios_dmi_table);
-
         /*
          * VMware detection requires dmi to be available, so this
          * needs to be done after dmi_scan_machine, for the BP.
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c

index a60df9a..2335c15 100644 (file)
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -253,7 +253,7 @@ void __init setup_per_cpu_areas(void)
                  * Up to this point, the boot CPU has been using .init.data
                  * area.  Reload any changed state for the boot CPU.
                  */
-               if (cpu == boot_cpu_id)
+               if (!cpu)
                         switch_to_new_gdt(cpu);
         }
  
diff --git a/arch/x86/kernel/sfi.c b/arch/x86/kernel/sfi.c

index cb22acf..dd4c281 100644 (file)
--- a/arch/x86/kernel/sfi.c
+++ b/arch/x86/kernel/sfi.c
@@ -34,7 +34,7 @@
  #ifdef CONFIG_X86_LOCAL_APIC
  static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
  
-void __init mp_sfi_register_lapic_address(unsigned long address)
+static void __init mp_sfi_register_lapic_address(unsigned long address)
  {
         mp_lapic_addr = address;
  
@@ -46,7 +46,7 @@ void __init mp_sfi_register_lapic_address(unsigned long address)
  }
  
  /* All CPUs enumerated by SFI must be present and enabled */
-void __cpuinit mp_sfi_register_lapic(u8 id)
+static void __cpuinit mp_sfi_register_lapic(u8 id)
  {
         if (MAX_APICS - id <= 0) {
                 pr_warning("Processor #%d invalid (max %d)\n",
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c

index 8b3bfc4..dfb5089 100644 (file)
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -62,7 +62,7 @@
  #include <asm/pgtable.h>
  #include <asm/tlbflush.h>
  #include <asm/mtrr.h>
-#include <asm/vmi.h>
+#include <asm/mwait.h>
  #include <asm/apic.h>
  #include <asm/setup.h>
  #include <asm/uv/uv.h>
@@ -311,7 +311,6 @@ notrace static void __cpuinit start_secondary(void *unused)
         __flush_tlb_all();
  #endif
  
-       vmi_bringup();
         cpu_init();
         preempt_disable();
         smp_callin();
@@ -324,9 +323,9 @@ notrace static void __cpuinit start_secondary(void *unused)
         check_tsc_sync_target();
  
         if (nmi_watchdog == NMI_IO_APIC) {
-               legacy_pic->chip->mask(0);
+               legacy_pic->mask(0);
                 enable_NMI_through_LVT0();
-               legacy_pic->chip->unmask(0);
+               legacy_pic->unmask(0);
         }
  
         /* This must be done before setting cpu_online_mask */
@@ -397,6 +396,19 @@ void __cpuinit smp_store_cpu_info(int id)
                 identify_secondary_cpu(c);
  }
  
+static void __cpuinit link_thread_siblings(int cpu1, int cpu2)
+{
+       struct cpuinfo_x86 *c1 = &cpu_data(cpu1);
+       struct cpuinfo_x86 *c2 = &cpu_data(cpu2);
+
+       cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2));
+       cpumask_set_cpu(cpu2, cpu_sibling_mask(cpu1));
+       cpumask_set_cpu(cpu1, cpu_core_mask(cpu2));
+       cpumask_set_cpu(cpu2, cpu_core_mask(cpu1));
+       cpumask_set_cpu(cpu1, c2->llc_shared_map);
+       cpumask_set_cpu(cpu2, c1->llc_shared_map);
+}
+
  
  void __cpuinit set_cpu_sibling_map(int cpu)
  {
@@ -409,14 +421,13 @@ void __cpuinit set_cpu_sibling_map(int cpu)
                 for_each_cpu(i, cpu_sibling_setup_mask) {
                         struct cpuinfo_x86 *o = &cpu_data(i);
  
-                       if (c->phys_proc_id == o->phys_proc_id &&
-                           c->cpu_core_id == o->cpu_core_id) {
-                               cpumask_set_cpu(i, cpu_sibling_mask(cpu));
-                               cpumask_set_cpu(cpu, cpu_sibling_mask(i));
-                               cpumask_set_cpu(i, cpu_core_mask(cpu));
-                               cpumask_set_cpu(cpu, cpu_core_mask(i));
-                               cpumask_set_cpu(i, c->llc_shared_map);
-                               cpumask_set_cpu(cpu, o->llc_shared_map);
+                       if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
+                               if (c->phys_proc_id == o->phys_proc_id &&
+                                   c->compute_unit_id == o->compute_unit_id)
+                                       link_thread_siblings(cpu, i);
+                       } else if (c->phys_proc_id == o->phys_proc_id &&
+                                  c->cpu_core_id == o->cpu_core_id) {
+                               link_thread_siblings(cpu, i);
                         }
                 }
         } else {
@@ -1109,8 +1120,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
         }
         set_cpu_sibling_map(0);
  
-       enable_IR_x2apic();
-       default_setup_apic_routing();
  
         if (smp_sanity_check(max_cpus) < 0) {
                 printk(KERN_INFO "SMP disabled\n");
@@ -1118,6 +1127,8 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
                 goto out;
         }
  
+       default_setup_apic_routing();
+
         preempt_disable();
         if (read_apic_id() != boot_cpu_physical_apicid) {
                 panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
@@ -1383,11 +1394,88 @@ void play_dead_common(void)
         local_irq_disable();
  }
  
+/*
+ * We need to flush the caches before going to sleep, lest we have
+ * dirty data in our caches when we come back up.
+ */
+static inline void mwait_play_dead(void)
+{
+       unsigned int eax, ebx, ecx, edx;
+       unsigned int highest_cstate = 0;
+       unsigned int highest_subcstate = 0;
+       int i;
+       void *mwait_ptr;
+
+       if (!cpu_has(&current_cpu_data, X86_FEATURE_MWAIT))
+               return;
+       if (!cpu_has(&current_cpu_data, X86_FEATURE_CLFLSH))
+               return;
+       if (current_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
+               return;
+
+       eax = CPUID_MWAIT_LEAF;
+       ecx = 0;
+       native_cpuid(&eax, &ebx, &ecx, &edx);
+
+       /*
+        * eax will be 0 if EDX enumeration is not valid.
+        * Initialized below to cstate, sub_cstate value when EDX is valid.
+        */
+       if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) {
+               eax = 0;
+       } else {
+               edx >>= MWAIT_SUBSTATE_SIZE;
+               for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
+                       if (edx & MWAIT_SUBSTATE_MASK) {
+                               highest_cstate = i;
+                               highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
+                       }
+               }
+               eax = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
+                       (highest_subcstate - 1);
+       }
+
+       /*
+        * This should be a memory location in a cache line which is
+        * unlikely to be touched by other processors.  The actual
+        * content is immaterial as it is not actually modified in any way.
+        */
+       mwait_ptr = &current_thread_info()->flags;
+
+       wbinvd();
+
+       while (1) {
+               /*
+                * The CLFLUSH is a workaround for erratum AAI65 for
+                * the Xeon 7400 series.  It's not clear it is actually
+                * needed, but it should be harmless in either case.
+                * The WBINVD is insufficient due to the spurious-wakeup
+                * case where we return around the loop.
+                */
+               clflush(mwait_ptr);
+               __monitor(mwait_ptr, 0, 0);
+               mb();
+               __mwait(eax, 0);
+       }
+}
+
+static inline void hlt_play_dead(void)
+{
+       if (current_cpu_data.x86 >= 4)
+               wbinvd();
+
+       while (1) {
+               native_halt();
+       }
+}
+
  void native_play_dead(void)
  {
         play_dead_common();
         tboot_shutdown(TB_SHUTDOWN_WFS);
-       wbinvd_halt();
+
+       mwait_play_dead();      /* Only returns on failure */
+       hlt_play_dead();
  }
  
  #else /* ... !CONFIG_HOTPLUG_CPU */
diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c

index d5e0662..0b0cb5f 100644 (file)
--- a/arch/x86/kernel/sys_i386_32.c
+++ b/arch/x86/kernel/sys_i386_32.c
@@ -33,8 +33,8 @@ int kernel_execve(const char *filename,
                   const char *const envp[])
  {
         long __res;
-       asm volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx"
+       asm volatile ("int $0x80"
         : "=a" (__res)
-       : "0" (__NR_execve), "ri" (filename), "c" (argv), "d" (envp) : "memory");
+       : "0" (__NR_execve), "b" (filename), "c" (argv), "d" (envp) : "memory");
         return __res;
  }
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c

index 60788de..d439685 100644 (file)
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -776,21 +776,10 @@ asmlinkage void math_state_restore(void)
  }
  EXPORT_SYMBOL_GPL(math_state_restore);
  
-#ifndef CONFIG_MATH_EMULATION
-void math_emulate(struct math_emu_info *info)
-{
-       printk(KERN_EMERG
-               "math-emulation not enabled and no coprocessor found.\n");
-       printk(KERN_EMERG "killing %s.\n", current->comm);
-       force_sig(SIGFPE, current);
-       schedule();
-}
-#endif /* CONFIG_MATH_EMULATION */
-
  dotraplinkage void __kprobes
  do_device_not_available(struct pt_regs *regs, long error_code)
  {
-#ifdef CONFIG_X86_32
+#ifdef CONFIG_MATH_EMULATION
         if (read_cr0() & X86_CR0_EM) {
                 struct math_emu_info info = { };
  
@@ -798,12 +787,12 @@ do_device_not_available(struct pt_regs *regs, long error_code)
  
                 info.regs = regs;
                 math_emulate(&info);
-       } else {
-               math_state_restore(); /* interrupts still off */
-               conditional_sti(regs);
+               return;
         }
-#else
-       math_state_restore();
+#endif
+       math_state_restore(); /* interrupts still off */
+#ifdef CONFIG_X86_32
+       conditional_sti(regs);
  #endif
  }
  
@@ -881,18 +870,6 @@ void __init trap_init(void)
  #endif
  
  #ifdef CONFIG_X86_32
-       if (cpu_has_fxsr) {
-               printk(KERN_INFO "Enabling fast FPU save and restore... ");
-               set_in_cr4(X86_CR4_OSFXSR);
-               printk("done.\n");
-       }
-       if (cpu_has_xmm) {
-               printk(KERN_INFO
-                       "Enabling unmasked SIMD FPU exception support... ");
-               set_in_cr4(X86_CR4_OSXMMEXCPT);
-               printk("done.\n");
-       }
-
         set_system_trap_gate(SYSCALL_VECTOR, &system_call);
         set_bit(SYSCALL_VECTOR, used_vectors);
  #endif
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c

index 26a863a..0c40d8b 100644 (file)
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -104,10 +104,14 @@ int __init notsc_setup(char *str)
  
  __setup("notsc", notsc_setup);
  
+static int no_sched_irq_time;
+
  static int __init tsc_setup(char *str)
  {
         if (!strcmp(str, "reliable"))
                 tsc_clocksource_reliable = 1;
+       if (!strncmp(str, "noirqtime", 9))
+               no_sched_irq_time = 1;
         return 1;
  }
  
@@ -801,6 +805,7 @@ void mark_tsc_unstable(char *reason)
         if (!tsc_unstable) {
                 tsc_unstable = 1;
                 sched_clock_stable = 0;
+               disable_sched_clock_irqtime();
                 printk(KERN_INFO "Marking TSC unstable due to %s\n", reason);
                 /* Change only the rating, when not registered */
                 if (clocksource_tsc.mult)
@@ -892,60 +897,6 @@ static void __init init_tsc_clocksource(void)
         clocksource_register_khz(&clocksource_tsc, tsc_khz);
  }
  
-#ifdef CONFIG_X86_64
-/*
- * calibrate_cpu is used on systems with fixed rate TSCs to determine
- * processor frequency
- */
-#define TICK_COUNT 100000000
-static unsigned long __init calibrate_cpu(void)
-{
-       int tsc_start, tsc_now;
-       int i, no_ctr_free;
-       unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0;
-       unsigned long flags;
-
-       for (i = 0; i < 4; i++)
-               if (avail_to_resrv_perfctr_nmi_bit(i))
-                       break;
-       no_ctr_free = (i == 4);
-       if (no_ctr_free) {
-               WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... "
-                    "cpu_khz value may be incorrect.\n");
-               i = 3;
-               rdmsrl(MSR_K7_EVNTSEL3, evntsel3);
-               wrmsrl(MSR_K7_EVNTSEL3, 0);
-               rdmsrl(MSR_K7_PERFCTR3, pmc3);
-       } else {
-               reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i);
-               reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
-       }
-       local_irq_save(flags);
-       /* start measuring cycles, incrementing from 0 */
-       wrmsrl(MSR_K7_PERFCTR0 + i, 0);
-       wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76);
-       rdtscl(tsc_start);
-       do {
-               rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now);
-               tsc_now = get_cycles();
-       } while ((tsc_now - tsc_start) < TICK_COUNT);
-
-       local_irq_restore(flags);
-       if (no_ctr_free) {
-               wrmsrl(MSR_K7_EVNTSEL3, 0);
-               wrmsrl(MSR_K7_PERFCTR3, pmc3);
-               wrmsrl(MSR_K7_EVNTSEL3, evntsel3);
-       } else {
-               release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
-               release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
-       }
-
-       return pmc_now * tsc_khz / (tsc_now - tsc_start);
-}
-#else
-static inline unsigned long calibrate_cpu(void) { return cpu_khz; }
-#endif
-
  void __init tsc_init(void)
  {
         u64 lpj;
@@ -964,10 +915,6 @@ void __init tsc_init(void)
                 return;
         }
  
-       if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) &&
-                       (boot_cpu_data.x86_vendor == X86_VENDOR_AMD))
-               cpu_khz = calibrate_cpu();
-
         printk("Detected %lu.%03lu MHz processor.\n",
                         (unsigned long)cpu_khz / 1000,
                         (unsigned long)cpu_khz % 1000);
@@ -987,6 +934,9 @@ void __init tsc_init(void)
         /* now allow native_sched_clock() to use rdtsc */
         tsc_disabled = 0;
  
+       if (!no_sched_irq_time)
+               enable_sched_clock_irqtime();
+
         lpj = ((u64)tsc_khz * 1000);
         do_div(lpj, HZ);
         lpj_fine = lpj;
diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c

index 1132129..7b24460 100644 (file)
--- a/arch/x86/kernel/uv_irq.c
+++ b/arch/x86/kernel/uv_irq.c
@@ -28,34 +28,21 @@ struct uv_irq_2_mmr_pnode{
  static spinlock_t              uv_irq_lock;
  static struct rb_root          uv_irq_root;
  
-static int uv_set_irq_affinity(unsigned int, const struct cpumask *);
+static int uv_set_irq_affinity(struct irq_data *, const struct cpumask *, bool);
  
-static void uv_noop(unsigned int irq)
-{
-}
-
-static unsigned int uv_noop_ret(unsigned int irq)
-{
-       return 0;
-}
+static void uv_noop(struct irq_data *data) { }
  
-static void uv_ack_apic(unsigned int irq)
+static void uv_ack_apic(struct irq_data *data)
  {
         ack_APIC_irq();
  }
  
  static struct irq_chip uv_irq_chip = {
-       .name           = "UV-CORE",
-       .startup        = uv_noop_ret,
-       .shutdown       = uv_noop,
-       .enable         = uv_noop,
-       .disable        = uv_noop,
-       .ack            = uv_noop,
-       .mask           = uv_noop,
-       .unmask         = uv_noop,
-       .eoi            = uv_ack_apic,
-       .end            = uv_noop,
-       .set_affinity   = uv_set_irq_affinity,
+       .name                   = "UV-CORE",
+       .irq_mask               = uv_noop,
+       .irq_unmask             = uv_noop,
+       .irq_eoi                = uv_ack_apic,
+       .irq_set_affinity       = uv_set_irq_affinity,
  };
  
  /*
@@ -144,26 +131,22 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
                        unsigned long mmr_offset, int limit)
  {
         const struct cpumask *eligible_cpu = cpumask_of(cpu);
-       struct irq_desc *desc = irq_to_desc(irq);
-       struct irq_cfg *cfg;
-       int mmr_pnode;
+       struct irq_cfg *cfg = get_irq_chip_data(irq);
         unsigned long mmr_value;
         struct uv_IO_APIC_route_entry *entry;
-       int err;
+       int mmr_pnode, err;
  
         BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) !=
                         sizeof(unsigned long));
  
-       cfg = irq_cfg(irq);
-
         err = assign_irq_vector(irq, cfg, eligible_cpu);
         if (err != 0)
                 return err;
  
         if (limit == UV_AFFINITY_CPU)
-               desc->status |= IRQ_NO_BALANCING;
+               irq_set_status_flags(irq, IRQ_NO_BALANCING);
         else
-               desc->status |= IRQ_MOVE_PCNTXT;
+               irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
  
         set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq,
                                       irq_name);
@@ -206,17 +189,17 @@ static void arch_disable_uv_irq(int mmr_pnode, unsigned long mmr_offset)
         uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
  }
  
-static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask)
+static int
+uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask,
+                   bool force)
  {
-       struct irq_desc *desc = irq_to_desc(irq);
-       struct irq_cfg *cfg = desc->chip_data;
+       struct irq_cfg *cfg = data->chip_data;
         unsigned int dest;
-       unsigned long mmr_value;
+       unsigned long mmr_value, mmr_offset;
         struct uv_IO_APIC_route_entry *entry;
-       unsigned long mmr_offset;
         int mmr_pnode;
  
-       if (set_desc_affinity(desc, mask, &dest))
+       if (__ioapic_set_affinity(data, mask, &dest))
                 return -1;
  
         mmr_value = 0;
@@ -231,7 +214,7 @@ static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask)
         entry->dest             = dest;
  
         /* Get previously stored MMR and pnode of hub sourcing interrupts */
-       if (uv_irq_2_mmr_info(irq, &mmr_offset, &mmr_pnode))
+       if (uv_irq_2_mmr_info(data->irq, &mmr_offset, &mmr_pnode))
                 return -1;
  
         uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c

index e680ea5..3371bd0 100644 (file)
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -66,10 +66,7 @@ static void __init visws_time_init(void)
  }
  
  /* Replaces the default init_ISA_irqs in the generic setup */
-static void __init visws_pre_intr_init(void)
-{
-       init_VISWS_APIC_irqs();
-}
+static void __init visws_pre_intr_init(void);
  
  /* Quirk for machine specific memory setup. */
  
@@ -429,67 +426,34 @@ static int is_co_apic(unsigned int irq)
  /*
   * This is the SGI Cobalt (IO-)APIC:
   */
-
-static void enable_cobalt_irq(unsigned int irq)
+static void enable_cobalt_irq(struct irq_data *data)
  {
-       co_apic_set(is_co_apic(irq), irq);
+       co_apic_set(is_co_apic(data->irq), data->irq);
  }
  
-static void disable_cobalt_irq(unsigned int irq)
+static void disable_cobalt_irq(struct irq_data *data)
  {
-       int entry = is_co_apic(irq);
+       int entry = is_co_apic(data->irq);
  
         co_apic_write(CO_APIC_LO(entry), CO_APIC_MASK);
         co_apic_read(CO_APIC_LO(entry));
  }
  
-/*
- * "irq" really just serves to identify the device.  Here is where we
- * map this to the Cobalt APIC entry where it's physically wired.
- * This is called via request_irq -> setup_irq -> irq_desc->startup()
- */
-static unsigned int startup_cobalt_irq(unsigned int irq)
+static void ack_cobalt_irq(struct irq_data *data)
  {
         unsigned long flags;
-       struct irq_desc *desc = irq_to_desc(irq);
  
         spin_lock_irqsave(&cobalt_lock, flags);
-       if ((desc->status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING)))
-               desc->status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING);
-       enable_cobalt_irq(irq);
-       spin_unlock_irqrestore(&cobalt_lock, flags);
-       return 0;
-}
-
-static void ack_cobalt_irq(unsigned int irq)
-{
-       unsigned long flags;
-
-       spin_lock_irqsave(&cobalt_lock, flags);
-       disable_cobalt_irq(irq);
+       disable_cobalt_irq(data);
         apic_write(APIC_EOI, APIC_EIO_ACK);
         spin_unlock_irqrestore(&cobalt_lock, flags);
  }
  
-static void end_cobalt_irq(unsigned int irq)
-{
-       unsigned long flags;
-       struct irq_desc *desc = irq_to_desc(irq);
-
-       spin_lock_irqsave(&cobalt_lock, flags);
-       if (!(desc->status & (IRQ_DISABLED | IRQ_INPROGRESS)))
-               enable_cobalt_irq(irq);
-       spin_unlock_irqrestore(&cobalt_lock, flags);
-}
-
  static struct irq_chip cobalt_irq_type = {
-       .name =         "Cobalt-APIC",
-       .startup =      startup_cobalt_irq,
-       .shutdown =     disable_cobalt_irq,
-       .enable =       enable_cobalt_irq,
-       .disable =      disable_cobalt_irq,
-       .ack =          ack_cobalt_irq,
-       .end =          end_cobalt_irq,
+       .name           = "Cobalt-APIC",
+       .irq_enable     = enable_cobalt_irq,
+       .irq_disable    = disable_cobalt_irq,
+       .irq_ack        = ack_cobalt_irq,
  };
  
  
@@ -503,35 +467,34 @@ static struct irq_chip cobalt_irq_type = {
   * interrupt controller type, and through a special virtual interrupt-
   * controller. Device drivers only see the virtual interrupt sources.
   */
-static unsigned int startup_piix4_master_irq(unsigned int irq)
+static unsigned int startup_piix4_master_irq(struct irq_data *data)
  {
         legacy_pic->init(0);
-
-       return startup_cobalt_irq(irq);
+       enable_cobalt_irq(data);
  }
  
-static void end_piix4_master_irq(unsigned int irq)
+static void end_piix4_master_irq(struct irq_data *data)
  {
         unsigned long flags;
  
         spin_lock_irqsave(&cobalt_lock, flags);
-       enable_cobalt_irq(irq);
+       enable_cobalt_irq(data);
         spin_unlock_irqrestore(&cobalt_lock, flags);
  }
  
  static struct irq_chip piix4_master_irq_type = {
-       .name =         "PIIX4-master",
-       .startup =      startup_piix4_master_irq,
-       .ack =          ack_cobalt_irq,
-       .end =          end_piix4_master_irq,
+       .name           = "PIIX4-master",
+       .irq_startup    = startup_piix4_master_irq,
+       .irq_ack        = ack_cobalt_irq,
  };
  
+static void pii4_mask(struct irq_data *data) { }
  
  static struct irq_chip piix4_virtual_irq_type = {
-       .name =         "PIIX4-virtual",
+       .name           = "PIIX4-virtual",
+       .irq_mask       = pii4_mask,
  };
  
-
  /*
   * PIIX4-8259 master/virtual functions to handle interrupt requests
   * from legacy devices: floppy, parallel, serial, rtc.
@@ -549,9 +512,8 @@ static struct irq_chip piix4_virtual_irq_type = {
   */
  static irqreturn_t piix4_master_intr(int irq, void *dev_id)
  {
-       int realirq;
-       struct irq_desc *desc;
         unsigned long flags;
+       int realirq;
  
         raw_spin_lock_irqsave(&i8259A_lock, flags);
  
@@ -592,18 +554,10 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id)
  
         raw_spin_unlock_irqrestore(&i8259A_lock, flags);
  
-       desc = irq_to_desc(realirq);
-
         /*
          * handle this 'virtual interrupt' as a Cobalt one now.
          */
-       kstat_incr_irqs_this_cpu(realirq, desc);
-
-       if (likely(desc->action != NULL))
-               handle_IRQ_event(realirq, desc->action);
-
-       if (!(desc->status & IRQ_DISABLED))
-               legacy_pic->chip->unmask(realirq);
+       generic_handle_irq(realirq);
  
         return IRQ_HANDLED;
  
@@ -624,41 +578,35 @@ static struct irqaction cascade_action = {
  
  static inline void set_piix4_virtual_irq_type(void)
  {
-       piix4_virtual_irq_type.shutdown = i8259A_chip.mask;
         piix4_virtual_irq_type.enable = i8259A_chip.unmask;
         piix4_virtual_irq_type.disable = i8259A_chip.mask;
+       piix4_virtual_irq_type.unmask = i8259A_chip.unmask;
  }
  
-void init_VISWS_APIC_irqs(void)
+static void __init visws_pre_intr_init(void)
  {
         int i;
  
-       for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) {
-               struct irq_desc *desc = irq_to_desc(i);
-
-               desc->status = IRQ_DISABLED;
-               desc->action = 0;
-               desc->depth = 1;
+       set_piix4_virtual_irq_type();
  
-               if (i == 0) {
-                       desc->chip = &cobalt_irq_type;
-               }
-               else if (i == CO_IRQ_IDE0) {
-                       desc->chip = &cobalt_irq_type;
-               }
-               else if (i == CO_IRQ_IDE1) {
-                       desc->chip = &cobalt_irq_type;
-               }
-               else if (i == CO_IRQ_8259) {
-                       desc->chip = &piix4_master_irq_type;
-               }
-               else if (i < CO_IRQ_APIC0) {
-                       set_piix4_virtual_irq_type();
-                       desc->chip = &piix4_virtual_irq_type;
-               }
-               else if (IS_CO_APIC(i)) {
-                       desc->chip = &cobalt_irq_type;
-               }
+       for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) {
+               struct irq_chip *chip = NULL;
+
+               if (i == 0)
+                       chip = &cobalt_irq_type;
+               else if (i == CO_IRQ_IDE0)
+                       chip = &cobalt_irq_type;
+               else if (i == CO_IRQ_IDE1)
+                       chip = &cobalt_irq_type;
+               else if (i == CO_IRQ_8259)
+                       chip = &piix4_master_irq_type;
+               else if (i < CO_IRQ_APIC0)
+                       chip = &piix4_virtual_irq_type;
+               else if (IS_CO_APIC(i))
+                       chip = &cobalt_irq_type;
+
+               if (chip)
+                       set_irq_chip(i, chip);
         }
  
         setup_irq(CO_IRQ_8259, &master_action);
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c

deleted file mode 100644 (file)

index ce9fbac..0000000
--- a/arch/x86/kernel/vmi_32.c
+++ /dev/null
@@ -1,893 +0,0 @@
-/*
- * VMI specific paravirt-ops implementation
- *
- * Copyright (C) 2005, VMware, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send feedback to zach@vmware.com
- *
- */
-
-#include <linux/module.h>
-#include <linux/cpu.h>
-#include <linux/bootmem.h>
-#include <linux/mm.h>
-#include <linux/highmem.h>
-#include <linux/sched.h>
-#include <linux/gfp.h>
-#include <asm/vmi.h>
-#include <asm/io.h>
-#include <asm/fixmap.h>
-#include <asm/apicdef.h>
-#include <asm/apic.h>
-#include <asm/pgalloc.h>
-#include <asm/processor.h>
-#include <asm/timer.h>
-#include <asm/vmi_time.h>
-#include <asm/kmap_types.h>
-#include <asm/setup.h>
-
-/* Convenient for calling VMI functions indirectly in the ROM */
-typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void);
-typedef u64 __attribute__((regparm(2))) (VROMLONGFUNC)(int);
-
-#define call_vrom_func(rom,func) \
-   (((VROMFUNC *)(rom->func))())
-
-#define call_vrom_long_func(rom,func,arg) \
-   (((VROMLONGFUNC *)(rom->func)) (arg))
-
-static struct vrom_header *vmi_rom;
-static int disable_pge;
-static int disable_pse;
-static int disable_sep;
-static int disable_tsc;
-static int disable_mtrr;
-static int disable_noidle;
-static int disable_vmi_timer;
-
-/* Cached VMI operations */
-static struct {
-       void (*cpuid)(void /* non-c */);
-       void (*_set_ldt)(u32 selector);
-       void (*set_tr)(u32 selector);
-       void (*write_idt_entry)(struct desc_struct *, int, u32, u32);
-       void (*write_gdt_entry)(struct desc_struct *, int, u32, u32);
-       void (*write_ldt_entry)(struct desc_struct *, int, u32, u32);
-       void (*set_kernel_stack)(u32 selector, u32 sp0);
-       void (*allocate_page)(u32, u32, u32, u32, u32);
-       void (*release_page)(u32, u32);
-       void (*set_pte)(pte_t, pte_t *, unsigned);
-       void (*update_pte)(pte_t *, unsigned);
-       void (*set_linear_mapping)(int, void *, u32, u32);
-       void (*_flush_tlb)(int);
-       void (*set_initial_ap_state)(int, int);
-       void (*halt)(void);
-       void (*set_lazy_mode)(int mode);
-} vmi_ops;
-
-/* Cached VMI operations */
-struct vmi_timer_ops vmi_timer_ops;
-
-/*
- * VMI patching routines.
- */
-#define MNEM_CALL 0xe8
-#define MNEM_JMP  0xe9
-#define MNEM_RET  0xc3
-
-#define IRQ_PATCH_INT_MASK 0
-#define IRQ_PATCH_DISABLE  5
-
-static inline void patch_offset(void *insnbuf,
-                               unsigned long ip, unsigned long dest)
-{
-        *(unsigned long *)(insnbuf+1) = dest-ip-5;
-}
-
-static unsigned patch_internal(int call, unsigned len, void *insnbuf,
-                              unsigned long ip)
-{
-       u64 reloc;
-       struct vmi_relocation_info *const rel = (struct vmi_relocation_info *)&reloc;
-       reloc = call_vrom_long_func(vmi_rom, get_reloc, call);
-       switch(rel->type) {
-               case VMI_RELOCATION_CALL_REL:
-                       BUG_ON(len < 5);
-                       *(char *)insnbuf = MNEM_CALL;
-                       patch_offset(insnbuf, ip, (unsigned long)rel->eip);
-                       return 5;
-
-               case VMI_RELOCATION_JUMP_REL:
-                       BUG_ON(len < 5);
-                       *(char *)insnbuf = MNEM_JMP;
-                       patch_offset(insnbuf, ip, (unsigned long)rel->eip);
-                       return 5;
-
-               case VMI_RELOCATION_NOP:
-                       /* obliterate the whole thing */
-                       return 0;
-
-               case VMI_RELOCATION_NONE:
-                       /* leave native code in place */
-                       break;
-
-               default:
-                       BUG();
-       }
-       return len;
-}
-
-/*
- * Apply patch if appropriate, return length of new instruction
- * sequence.  The callee does nop padding for us.
- */
-static unsigned vmi_patch(u8 type, u16 clobbers, void *insns,
-                         unsigned long ip, unsigned len)
-{
-       switch (type) {
-               case PARAVIRT_PATCH(pv_irq_ops.irq_disable):
-                       return patch_internal(VMI_CALL_DisableInterrupts, len,
-                                             insns, ip);
-               case PARAVIRT_PATCH(pv_irq_ops.irq_enable):
-                       return patch_internal(VMI_CALL_EnableInterrupts, len,
-                                             insns, ip);
-               case PARAVIRT_PATCH(pv_irq_ops.restore_fl):
-                       return patch_internal(VMI_CALL_SetInterruptMask, len,
-                                             insns, ip);
-               case PARAVIRT_PATCH(pv_irq_ops.save_fl):
-                       return patch_internal(VMI_CALL_GetInterruptMask, len,
-                                             insns, ip);
-               case PARAVIRT_PATCH(pv_cpu_ops.iret):
-                       return patch_internal(VMI_CALL_IRET, len, insns, ip);
-               case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit):
-                       return patch_internal(VMI_CALL_SYSEXIT, len, insns, ip);
-               default:
-                       break;
-       }
-       return len;
-}
-
-/* CPUID has non-C semantics, and paravirt-ops API doesn't match hardware ISA */
-static void vmi_cpuid(unsigned int *ax, unsigned int *bx,
-                               unsigned int *cx, unsigned int *dx)
-{
-       int override = 0;
-       if (*ax == 1)
-               override = 1;
-        asm volatile ("call *%6"
-                      : "=a" (*ax),
-                        "=b" (*bx),
-                        "=c" (*cx),
-                        "=d" (*dx)
-                      : "0" (*ax), "2" (*cx), "r" (vmi_ops.cpuid));
-       if (override) {
-               if (disable_pse)
-                       *dx &= ~X86_FEATURE_PSE;
-               if (disable_pge)
-                       *dx &= ~X86_FEATURE_PGE;
-               if (disable_sep)
-                       *dx &= ~X86_FEATURE_SEP;
-               if (disable_tsc)
-                       *dx &= ~X86_FEATURE_TSC;
-               if (disable_mtrr)
-                       *dx &= ~X86_FEATURE_MTRR;
-       }
-}
-
-static inline void vmi_maybe_load_tls(struct desc_struct *gdt, int nr, struct desc_struct *new)
-{
-       if (gdt[nr].a != new->a || gdt[nr].b != new->b)
-               write_gdt_entry(gdt, nr, new, 0);
-}
-
-static void vmi_load_tls(struct thread_struct *t, unsigned int cpu)
-{
-       struct desc_struct *gdt = get_cpu_gdt_table(cpu);
-       vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 0, &t->tls_array[0]);
-       vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 1, &t->tls_array[1]);
-       vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 2, &t->tls_array[2]);
-}
-
-static void vmi_set_ldt(const void *addr, unsigned entries)
-{
-       unsigned cpu = smp_processor_id();
-       struct desc_struct desc;
-
-       pack_descriptor(&desc, (unsigned long)addr,
-                       entries * sizeof(struct desc_struct) - 1,
-                       DESC_LDT, 0);
-       write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, &desc, DESC_LDT);
-       vmi_ops._set_ldt(entries ? GDT_ENTRY_LDT*sizeof(struct desc_struct) : 0);
-}
-
-static void vmi_set_tr(void)
-{
-       vmi_ops.set_tr(GDT_ENTRY_TSS*sizeof(struct desc_struct));
-}
-
-static void vmi_write_idt_entry(gate_desc *dt, int entry, const gate_desc *g)
-{
-       u32 *idt_entry = (u32 *)g;
-       vmi_ops.write_idt_entry(dt, entry, idt_entry[0], idt_entry[1]);
-}
-
-static void vmi_write_gdt_entry(struct desc_struct *dt, int entry,
-                               const void *desc, int type)
-{
-       u32 *gdt_entry = (u32 *)desc;
-       vmi_ops.write_gdt_entry(dt, entry, gdt_entry[0], gdt_entry[1]);
-}
-
-static void vmi_write_ldt_entry(struct desc_struct *dt, int entry,
-                               const void *desc)
-{
-       u32 *ldt_entry = (u32 *)desc;
-       vmi_ops.write_ldt_entry(dt, entry, ldt_entry[0], ldt_entry[1]);
-}
-
-static void vmi_load_sp0(struct tss_struct *tss,
-                                  struct thread_struct *thread)
-{
-       tss->x86_tss.sp0 = thread->sp0;
-
-       /* This can only happen when SEP is enabled, no need to test "SEP"arately */
-       if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
-               tss->x86_tss.ss1 = thread->sysenter_cs;
-               wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
-       }
-       vmi_ops.set_kernel_stack(__KERNEL_DS, tss->x86_tss.sp0);
-}
-
-static void vmi_flush_tlb_user(void)
-{
-       vmi_ops._flush_tlb(VMI_FLUSH_TLB);
-}
-
-static void vmi_flush_tlb_kernel(void)
-{
-       vmi_ops._flush_tlb(VMI_FLUSH_TLB | VMI_FLUSH_GLOBAL);
-}
-
-/* Stub to do nothing at all; used for delays and unimplemented calls */
-static void vmi_nop(void)
-{
-}
-
-static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn)
-{
-       vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
-}
-
-static void vmi_allocate_pmd(struct mm_struct *mm, unsigned long pfn)
-{
-       /*
-        * This call comes in very early, before mem_map is setup.
-        * It is called only for swapper_pg_dir, which already has
-        * data on it.
-        */
-       vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
-}
-
-static void vmi_allocate_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count)
-{
-       vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count);
-}
-
-static void vmi_release_pte(unsigned long pfn)
-{
-       vmi_ops.release_page(pfn, VMI_PAGE_L1);
-}
-
-static void vmi_release_pmd(unsigned long pfn)
-{
-       vmi_ops.release_page(pfn, VMI_PAGE_L2);
-}
-
-/*
- * We use the pgd_free hook for releasing the pgd page:
- */
-static void vmi_pgd_free(struct mm_struct *mm, pgd_t *pgd)
-{
-       unsigned long pfn = __pa(pgd) >> PAGE_SHIFT;
-
-       vmi_ops.release_page(pfn, VMI_PAGE_L2);
-}
-
-/*
- * Helper macros for MMU update flags.  We can defer updates until a flush
- * or page invalidation only if the update is to the current address space
- * (otherwise, there is no flush).  We must check against init_mm, since
- * this could be a kernel update, which usually passes init_mm, although
- * sometimes this check can be skipped if we know the particular function
- * is only called on user mode PTEs.  We could change the kernel to pass
- * current->active_mm here, but in particular, I was unsure if changing
- * mm/highmem.c to do this would still be correct on other architectures.
- */
-#define is_current_as(mm, mustbeuser) ((mm) == current->active_mm ||    \
-                                       (!mustbeuser && (mm) == &init_mm))
-#define vmi_flags_addr(mm, addr, level, user)                           \
-        ((level) | (is_current_as(mm, user) ?                           \
-                (VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
-#define vmi_flags_addr_defer(mm, addr, level, user)                     \
-        ((level) | (is_current_as(mm, user) ?                           \
-                (VMI_PAGE_DEFER | VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
-
-static void vmi_update_pte(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-{
-       vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
-}
-
-static void vmi_update_pte_defer(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-{
-       vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0));
-}
-
-static void vmi_set_pte(pte_t *ptep, pte_t pte)
-{
-       /* XXX because of set_pmd_pte, this can be called on PT or PD layers */
-       vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT);
-}
-
-static void vmi_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
-{
-       vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
-}
-
-static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval)
-{
-#ifdef CONFIG_X86_PAE
-       const pte_t pte = { .pte = pmdval.pmd };
-#else
-       const pte_t pte = { pmdval.pud.pgd.pgd };
-#endif
-       vmi_ops.set_pte(pte, (pte_t *)pmdp, VMI_PAGE_PD);
-}
-
-#ifdef CONFIG_X86_PAE
-
-static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval)
-{
-       /*
-        * XXX This is called from set_pmd_pte, but at both PT
-        * and PD layers so the VMI_PAGE_PT flag is wrong.  But
-        * it is only called for large page mapping changes,
-        * the Xen backend, doesn't support large pages, and the
-        * ESX backend doesn't depend on the flag.
-        */
-       set_64bit((unsigned long long *)ptep,pte_val(pteval));
-       vmi_ops.update_pte(ptep, VMI_PAGE_PT);
-}
-
-static void vmi_set_pud(pud_t *pudp, pud_t pudval)
-{
-       /* Um, eww */
-       const pte_t pte = { .pte = pudval.pgd.pgd };
-       vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP);
-}
-
-static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-{
-       const pte_t pte = { .pte = 0 };
-       vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
-}
-
-static void vmi_pmd_clear(pmd_t *pmd)
-{
-       const pte_t pte = { .pte = 0 };
-       vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD);
-}
-#endif
-
-#ifdef CONFIG_SMP
-static void __devinit
-vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
-                    unsigned long start_esp)
-{
-       struct vmi_ap_state ap;
-
-       /* Default everything to zero.  This is fine for most GPRs. */
-       memset(&ap, 0, sizeof(struct vmi_ap_state));
-
-       ap.gdtr_limit = GDT_SIZE - 1;
-       ap.gdtr_base = (unsigned long) get_cpu_gdt_table(phys_apicid);
-
-       ap.idtr_limit = IDT_ENTRIES * 8 - 1;
-       ap.idtr_base = (unsigned long) idt_table;
-
-       ap.ldtr = 0;
-
-       ap.cs = __KERNEL_CS;
-       ap.eip = (unsigned long) start_eip;
-       ap.ss = __KERNEL_DS;
-       ap.esp = (unsigned long) start_esp;
-
-       ap.ds = __USER_DS;
-       ap.es = __USER_DS;
-       ap.fs = __KERNEL_PERCPU;
-       ap.gs = __KERNEL_STACK_CANARY;
-
-       ap.eflags = 0;
-
-#ifdef CONFIG_X86_PAE
-       /* efer should match BSP efer. */
-       if (cpu_has_nx) {
-               unsigned l, h;
-               rdmsr(MSR_EFER, l, h);
-               ap.efer = (unsigned long long) h << 32 | l;
-       }
-#endif
-
-       ap.cr3 = __pa(swapper_pg_dir);
-       /* Protected mode, paging, AM, WP, NE, MP. */
-       ap.cr0 = 0x80050023;
-       ap.cr4 = mmu_cr4_features;
-       vmi_ops.set_initial_ap_state((u32)&ap, phys_apicid);
-}
-#endif
-
-static void vmi_start_context_switch(struct task_struct *prev)
-{
-       paravirt_start_context_switch(prev);
-       vmi_ops.set_lazy_mode(2);
-}
-
-static void vmi_end_context_switch(struct task_struct *next)
-{
-       vmi_ops.set_lazy_mode(0);
-       paravirt_end_context_switch(next);
-}
-
-static void vmi_enter_lazy_mmu(void)
-{
-       paravirt_enter_lazy_mmu();
-       vmi_ops.set_lazy_mode(1);
-}
-
-static void vmi_leave_lazy_mmu(void)
-{
-       vmi_ops.set_lazy_mode(0);
-       paravirt_leave_lazy_mmu();
-}
-
-static inline int __init check_vmi_rom(struct vrom_header *rom)
-{
-       struct pci_header *pci;
-       struct pnp_header *pnp;
-       const char *manufacturer = "UNKNOWN";
-       const char *product = "UNKNOWN";
-       const char *license = "unspecified";
-
-       if (rom->rom_signature != 0xaa55)
-               return 0;
-       if (rom->vrom_signature != VMI_SIGNATURE)
-               return 0;
-       if (rom->api_version_maj != VMI_API_REV_MAJOR ||
-           rom->api_version_min+1 < VMI_API_REV_MINOR+1) {
-               printk(KERN_WARNING "VMI: Found mismatched rom version %d.%d\n",
-                               rom->api_version_maj,
-                               rom->api_version_min);
-               return 0;
-       }
-
-       /*
-        * Relying on the VMI_SIGNATURE field is not 100% safe, so check
-        * the PCI header and device type to make sure this is really a
-        * VMI device.
-        */
-       if (!rom->pci_header_offs) {
-               printk(KERN_WARNING "VMI: ROM does not contain PCI header.\n");
-               return 0;
-       }
-
-       pci = (struct pci_header *)((char *)rom+rom->pci_header_offs);
-       if (pci->vendorID != PCI_VENDOR_ID_VMWARE ||
-           pci->deviceID != PCI_DEVICE_ID_VMWARE_VMI) {
-               /* Allow it to run... anyways, but warn */
-               printk(KERN_WARNING "VMI: ROM from unknown manufacturer\n");
-       }
-
-       if (rom->pnp_header_offs) {
-               pnp = (struct pnp_header *)((char *)rom+rom->pnp_header_offs);
-               if (pnp->manufacturer_offset)
-                       manufacturer = (const char *)rom+pnp->manufacturer_offset;
-               if (pnp->product_offset)
-                       product = (const char *)rom+pnp->product_offset;
-       }
-
-       if (rom->license_offs)
-               license = (char *)rom+rom->license_offs;
-
-       printk(KERN_INFO "VMI: Found %s %s, API version %d.%d, ROM version %d.%d\n",
-               manufacturer, product,
-               rom->api_version_maj, rom->api_version_min,
-               pci->rom_version_maj, pci->rom_version_min);
-
-       /* Don't allow BSD/MIT here for now because we don't want to end up
-          with any binary only shim layers */
-       if (strcmp(license, "GPL") && strcmp(license, "GPL v2")) {
-               printk(KERN_WARNING "VMI: Non GPL license `%s' found for ROM. Not used.\n",
-                       license);
-               return 0;
-       }
-
-       return 1;
-}
-
-/*
- * Probe for the VMI option ROM
- */
-static inline int __init probe_vmi_rom(void)
-{
-       unsigned long base;
-
-       /* VMI ROM is in option ROM area, check signature */
-       for (base = 0xC0000; base < 0xE0000; base += 2048) {
-               struct vrom_header *romstart;
-               romstart = (struct vrom_header *)isa_bus_to_virt(base);
-               if (check_vmi_rom(romstart)) {
-                       vmi_rom = romstart;
-                       return 1;
-               }
-       }
-       return 0;
-}
-
-/*
- * VMI setup common to all processors
- */
-void vmi_bringup(void)
-{
-       /* We must establish the lowmem mapping for MMU ops to work */
-       if (vmi_ops.set_linear_mapping)
-               vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, MAXMEM_PFN, 0);
-}
-
-/*
- * Return a pointer to a VMI function or NULL if unimplemented
- */
-static void *vmi_get_function(int vmicall)
-{
-       u64 reloc;
-       const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
-       reloc = call_vrom_long_func(vmi_rom, get_reloc, vmicall);
-       BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL);
-       if (rel->type == VMI_RELOCATION_CALL_REL)
-               return (void *)rel->eip;
-       else
-               return NULL;
-}
-
-/*
- * Helper macro for making the VMI paravirt-ops fill code readable.
- * For unimplemented operations, fall back to default, unless nop
- * is returned by the ROM.
- */
-#define para_fill(opname, vmicall)                             \
-do {                                                           \
-       reloc = call_vrom_long_func(vmi_rom, get_reloc,         \
-                                   VMI_CALL_##vmicall);        \
-       if (rel->type == VMI_RELOCATION_CALL_REL)               \
-               opname = (void *)rel->eip;                      \
-       else if (rel->type == VMI_RELOCATION_NOP)               \
-               opname = (void *)vmi_nop;                       \
-       else if (rel->type != VMI_RELOCATION_NONE)              \
-               printk(KERN_WARNING "VMI: Unknown relocation "  \
-                                   "type %d for " #vmicall"\n",\
-                                       rel->type);             \
-} while (0)
-
-/*
- * Helper macro for making the VMI paravirt-ops fill code readable.
- * For cached operations which do not match the VMI ROM ABI and must
- * go through a tranlation stub.  Ignore NOPs, since it is not clear
- * a NOP * VMI function corresponds to a NOP paravirt-op when the
- * functions are not in 1-1 correspondence.
- */
-#define para_wrap(opname, wrapper, cache, vmicall)             \
-do {                                                           \
-       reloc = call_vrom_long_func(vmi_rom, get_reloc,         \
-                                   VMI_CALL_##vmicall);        \
-       BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL);           \
-       if (rel->type == VMI_RELOCATION_CALL_REL) {             \
-               opname = wrapper;                               \
-               vmi_ops.cache = (void *)rel->eip;               \
-       }                                                       \
-} while (0)
-
-/*
- * Activate the VMI interface and switch into paravirtualized mode
- */
-static inline int __init activate_vmi(void)
-{
-       short kernel_cs;
-       u64 reloc;
-       const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
-
-       /*
-        * Prevent page tables from being allocated in highmem, even if
-        * CONFIG_HIGHPTE is enabled.
-        */
-       __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
-
-       if (call_vrom_func(vmi_rom, vmi_init) != 0) {
-               printk(KERN_ERR "VMI ROM failed to initialize!");
-               return 0;
-       }
-       savesegment(cs, kernel_cs);
-
-       pv_info.paravirt_enabled = 1;
-       pv_info.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK;
-       pv_info.name = "vmi [deprecated]";
-
-       pv_init_ops.patch = vmi_patch;
-
-       /*
-        * Many of these operations are ABI compatible with VMI.
-        * This means we can fill in the paravirt-ops with direct
-        * pointers into the VMI ROM.  If the calling convention for
-        * these operations changes, this code needs to be updated.
-        *
-        * Exceptions
-        *  CPUID paravirt-op uses pointers, not the native ISA
-        *  halt has no VMI equivalent; all VMI halts are "safe"
-        *  no MSR support yet - just trap and emulate.  VMI uses the
-        *    same ABI as the native ISA, but Linux wants exceptions
-        *    from bogus MSR read / write handled
-        *  rdpmc is not yet used in Linux
-        */
-
-       /* CPUID is special, so very special it gets wrapped like a present */
-       para_wrap(pv_cpu_ops.cpuid, vmi_cpuid, cpuid, CPUID);
-
-       para_fill(pv_cpu_ops.clts, CLTS);
-       para_fill(pv_cpu_ops.get_debugreg, GetDR);
-       para_fill(pv_cpu_ops.set_debugreg, SetDR);
-       para_fill(pv_cpu_ops.read_cr0, GetCR0);
-       para_fill(pv_mmu_ops.read_cr2, GetCR2);
-       para_fill(pv_mmu_ops.read_cr3, GetCR3);
-       para_fill(pv_cpu_ops.read_cr4, GetCR4);
-       para_fill(pv_cpu_ops.write_cr0, SetCR0);
-       para_fill(pv_mmu_ops.write_cr2, SetCR2);
-       para_fill(pv_mmu_ops.write_cr3, SetCR3);
-       para_fill(pv_cpu_ops.write_cr4, SetCR4);
-
-       para_fill(pv_irq_ops.save_fl.func, GetInterruptMask);
-       para_fill(pv_irq_ops.restore_fl.func, SetInterruptMask);
-       para_fill(pv_irq_ops.irq_disable.func, DisableInterrupts);
-       para_fill(pv_irq_ops.irq_enable.func, EnableInterrupts);
-
-       para_fill(pv_cpu_ops.wbinvd, WBINVD);
-       para_fill(pv_cpu_ops.read_tsc, RDTSC);
-
-       /* The following we emulate with trap and emulate for now */
-       /* paravirt_ops.read_msr = vmi_rdmsr */
-       /* paravirt_ops.write_msr = vmi_wrmsr */
-       /* paravirt_ops.rdpmc = vmi_rdpmc */
-
-       /* TR interface doesn't pass TR value, wrap */
-       para_wrap(pv_cpu_ops.load_tr_desc, vmi_set_tr, set_tr, SetTR);
-
-       /* LDT is special, too */
-       para_wrap(pv_cpu_ops.set_ldt, vmi_set_ldt, _set_ldt, SetLDT);
-
-       para_fill(pv_cpu_ops.load_gdt, SetGDT);
-       para_fill(pv_cpu_ops.load_idt, SetIDT);
-       para_fill(pv_cpu_ops.store_gdt, GetGDT);
-       para_fill(pv_cpu_ops.store_idt, GetIDT);
-       para_fill(pv_cpu_ops.store_tr, GetTR);
-       pv_cpu_ops.load_tls = vmi_load_tls;
-       para_wrap(pv_cpu_ops.write_ldt_entry, vmi_write_ldt_entry,
-                 write_ldt_entry, WriteLDTEntry);
-       para_wrap(pv_cpu_ops.write_gdt_entry, vmi_write_gdt_entry,
-                 write_gdt_entry, WriteGDTEntry);
-       para_wrap(pv_cpu_ops.write_idt_entry, vmi_write_idt_entry,
-                 write_idt_entry, WriteIDTEntry);
-       para_wrap(pv_cpu_ops.load_sp0, vmi_load_sp0, set_kernel_stack, UpdateKernelStack);
-       para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask);
-       para_fill(pv_cpu_ops.io_delay, IODelay);
-
-       para_wrap(pv_cpu_ops.start_context_switch, vmi_start_context_switch,
-                 set_lazy_mode, SetLazyMode);
-       para_wrap(pv_cpu_ops.end_context_switch, vmi_end_context_switch,
-                 set_lazy_mode, SetLazyMode);
-
-       para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu,
-                 set_lazy_mode, SetLazyMode);
-       para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy_mmu,
-                 set_lazy_mode, SetLazyMode);
-
-       /* user and kernel flush are just handled with different flags to FlushTLB */
-       para_wrap(pv_mmu_ops.flush_tlb_user, vmi_flush_tlb_user, _flush_tlb, FlushTLB);
-       para_wrap(pv_mmu_ops.flush_tlb_kernel, vmi_flush_tlb_kernel, _flush_tlb, FlushTLB);
-       para_fill(pv_mmu_ops.flush_tlb_single, InvalPage);
-
-       /*
-        * Until a standard flag format can be agreed on, we need to
-        * implement these as wrappers in Linux.  Get the VMI ROM
-        * function pointers for the two backend calls.
-        */
-#ifdef CONFIG_X86_PAE
-       vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxELong);
-       vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxELong);
-#else
-       vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxE);
-       vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxE);
-#endif
-
-       if (vmi_ops.set_pte) {
-               pv_mmu_ops.set_pte = vmi_set_pte;
-               pv_mmu_ops.set_pte_at = vmi_set_pte_at;
-               pv_mmu_ops.set_pmd = vmi_set_pmd;
-#ifdef CONFIG_X86_PAE
-               pv_mmu_ops.set_pte_atomic = vmi_set_pte_atomic;
-               pv_mmu_ops.set_pud = vmi_set_pud;
-               pv_mmu_ops.pte_clear = vmi_pte_clear;
-               pv_mmu_ops.pmd_clear = vmi_pmd_clear;
-#endif
-       }
-
-       if (vmi_ops.update_pte) {
-               pv_mmu_ops.pte_update = vmi_update_pte;
-               pv_mmu_ops.pte_update_defer = vmi_update_pte_defer;
-       }
-
-       vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage);
-       if (vmi_ops.allocate_page) {
-               pv_mmu_ops.alloc_pte = vmi_allocate_pte;
-               pv_mmu_ops.alloc_pmd = vmi_allocate_pmd;
-               pv_mmu_ops.alloc_pmd_clone = vmi_allocate_pmd_clone;
-       }
-
-       vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage);
-       if (vmi_ops.release_page) {
-               pv_mmu_ops.release_pte = vmi_release_pte;
-               pv_mmu_ops.release_pmd = vmi_release_pmd;
-               pv_mmu_ops.pgd_free = vmi_pgd_free;
-       }
-
-       /* Set linear is needed in all cases */
-       vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping);
-
-       /*
-        * These MUST always be patched.  Don't support indirect jumps
-        * through these operations, as the VMI interface may use either
-        * a jump or a call to get to these operations, depending on
-        * the backend.  They are performance critical anyway, so requiring
-        * a patch is not a big problem.
-        */
-       pv_cpu_ops.irq_enable_sysexit = (void *)0xfeedbab0;
-       pv_cpu_ops.iret = (void *)0xbadbab0;
-
-#ifdef CONFIG_SMP
-       para_wrap(pv_apic_ops.startup_ipi_hook, vmi_startup_ipi_hook, set_initial_ap_state, SetInitialAPState);
-#endif
-
-#ifdef CONFIG_X86_LOCAL_APIC
-       para_fill(apic->read, APICRead);
-       para_fill(apic->write, APICWrite);
-#endif
-
-       /*
-        * Check for VMI timer functionality by probing for a cycle frequency method
-        */
-       reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_GetCycleFrequency);
-       if (!disable_vmi_timer && rel->type != VMI_RELOCATION_NONE) {
-               vmi_timer_ops.get_cycle_frequency = (void *)rel->eip;
-               vmi_timer_ops.get_cycle_counter =
-                       vmi_get_function(VMI_CALL_GetCycleCounter);
-               vmi_timer_ops.get_wallclock =
-                       vmi_get_function(VMI_CALL_GetWallclockTime);
-               vmi_timer_ops.wallclock_updated =
-                       vmi_get_function(VMI_CALL_WallclockUpdated);
-               vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm);
-               vmi_timer_ops.cancel_alarm =
-                        vmi_get_function(VMI_CALL_CancelAlarm);
-               x86_init.timers.timer_init = vmi_time_init;
-#ifdef CONFIG_X86_LOCAL_APIC
-               x86_init.timers.setup_percpu_clockev = vmi_time_bsp_init;
-               x86_cpuinit.setup_percpu_clockev = vmi_time_ap_init;
-#endif
-               pv_time_ops.sched_clock = vmi_sched_clock;
-               x86_platform.calibrate_tsc = vmi_tsc_khz;
-               x86_platform.get_wallclock = vmi_get_wallclock;
-               x86_platform.set_wallclock = vmi_set_wallclock;
-
-               /* We have true wallclock functions; disable CMOS clock sync */
-               no_sync_cmos_clock = 1;
-       } else {
-               disable_noidle = 1;
-               disable_vmi_timer = 1;
-       }
-
-       para_fill(pv_irq_ops.safe_halt, Halt);
-
-       /*
-        * Alternative instruction rewriting doesn't happen soon enough
-        * to convert VMI_IRET to a call instead of a jump; so we have
-        * to do this before IRQs get reenabled.  Fortunately, it is
-        * idempotent.
-        */
-       apply_paravirt(__parainstructions, __parainstructions_end);
-
-       vmi_bringup();
-
-       return 1;
-}
-
-#undef para_fill
-
-void __init vmi_init(void)
-{
-       if (!vmi_rom)
-               probe_vmi_rom();
-       else
-               check_vmi_rom(vmi_rom);
-
-       /* In case probing for or validating the ROM failed, basil */
-       if (!vmi_rom)
-               return;
-
-       reserve_top_address(-vmi_rom->virtual_top);
-
-#ifdef CONFIG_X86_IO_APIC
-       /* This is virtual hardware; timer routing is wired correctly */
-       no_timer_check = 1;
-#endif
-}
-
-void __init vmi_activate(void)
-{
-       unsigned long flags;
-
-       if (!vmi_rom)
-               return;
-
-       local_irq_save(flags);
-       activate_vmi();
-       local_irq_restore(flags & X86_EFLAGS_IF);
-}
-
-static int __init parse_vmi(char *arg)
-{
-       if (!arg)
-               return -EINVAL;
-
-       if (!strcmp(arg, "disable_pge")) {
-               clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE);
-               disable_pge = 1;
-       } else if (!strcmp(arg, "disable_pse")) {
-               clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PSE);
-               disable_pse = 1;
-       } else if (!strcmp(arg, "disable_sep")) {
-               clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP);
-               disable_sep = 1;
-       } else if (!strcmp(arg, "disable_tsc")) {
-               clear_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC);
-               disable_tsc = 1;
-       } else if (!strcmp(arg, "disable_mtrr")) {
-               clear_cpu_cap(&boot_cpu_data, X86_FEATURE_MTRR);
-               disable_mtrr = 1;
-       } else if (!strcmp(arg, "disable_timer")) {
-               disable_vmi_timer = 1;
-               disable_noidle = 1;
-       } else if (!strcmp(arg, "disable_noidle"))
-               disable_noidle = 1;
-       return 0;
-}
-
-early_param("vmi", parse_vmi);
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c

deleted file mode 100644 (file)

index 5e1ff66..0000000
--- a/arch/x86/kernel/vmiclock_32.c
+++ /dev/null
@@ -1,317 +0,0 @@
-/*
- * VMI paravirtual timer support routines.
- *
- * Copyright (C) 2007, VMware, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- */
-
-#include <linux/smp.h>
-#include <linux/interrupt.h>
-#include <linux/cpumask.h>
-#include <linux/clocksource.h>
-#include <linux/clockchips.h>
-
-#include <asm/vmi.h>
-#include <asm/vmi_time.h>
-#include <asm/apicdef.h>
-#include <asm/apic.h>
-#include <asm/timer.h>
-#include <asm/i8253.h>
-#include <asm/irq_vectors.h>
-
-#define VMI_ONESHOT  (VMI_ALARM_IS_ONESHOT  | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
-#define VMI_PERIODIC (VMI_ALARM_IS_PERIODIC | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
-
-static DEFINE_PER_CPU(struct clock_event_device, local_events);
-
-static inline u32 vmi_counter(u32 flags)
-{
-       /* Given VMI_ONESHOT or VMI_PERIODIC, return the corresponding
-        * cycle counter. */
-       return flags & VMI_ALARM_COUNTER_MASK;
-}
-
-/* paravirt_ops.get_wallclock = vmi_get_wallclock */
-unsigned long vmi_get_wallclock(void)
-{
-       unsigned long long wallclock;
-       wallclock = vmi_timer_ops.get_wallclock(); // nsec
-       (void)do_div(wallclock, 1000000000);       // sec
-
-       return wallclock;
-}
-
-/* paravirt_ops.set_wallclock = vmi_set_wallclock */
-int vmi_set_wallclock(unsigned long now)
-{
-       return 0;
-}
-
-/* paravirt_ops.sched_clock = vmi_sched_clock */
-unsigned long long vmi_sched_clock(void)
-{
-       return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE));
-}
-
-/* x86_platform.calibrate_tsc = vmi_tsc_khz */
-unsigned long vmi_tsc_khz(void)
-{
-       unsigned long long khz;
-       khz = vmi_timer_ops.get_cycle_frequency();
-       (void)do_div(khz, 1000);
-       return khz;
-}
-
-static inline unsigned int vmi_get_timer_vector(void)
-{
-       return IRQ0_VECTOR;
-}
-
-/** vmi clockchip */
-#ifdef CONFIG_X86_LOCAL_APIC
-static unsigned int startup_timer_irq(unsigned int irq)
-{
-       unsigned long val = apic_read(APIC_LVTT);
-       apic_write(APIC_LVTT, vmi_get_timer_vector());
-
-       return (val & APIC_SEND_PENDING);
-}
-
-static void mask_timer_irq(unsigned int irq)
-{
-       unsigned long val = apic_read(APIC_LVTT);
-       apic_write(APIC_LVTT, val | APIC_LVT_MASKED);
-}
-
-static void unmask_timer_irq(unsigned int irq)
-{
-       unsigned long val = apic_read(APIC_LVTT);
-       apic_write(APIC_LVTT, val & ~APIC_LVT_MASKED);
-}
-
-static void ack_timer_irq(unsigned int irq)
-{
-       ack_APIC_irq();
-}
-
-static struct irq_chip vmi_chip __read_mostly = {
-       .name           = "VMI-LOCAL",
-       .startup        = startup_timer_irq,
-       .mask           = mask_timer_irq,
-       .unmask         = unmask_timer_irq,
-       .ack            = ack_timer_irq
-};
-#endif
-
-/** vmi clockevent */
-#define VMI_ALARM_WIRED_IRQ0    0x00000000
-#define VMI_ALARM_WIRED_LVTT    0x00010000
-static int vmi_wiring = VMI_ALARM_WIRED_IRQ0;
-
-static inline int vmi_get_alarm_wiring(void)
-{
-       return vmi_wiring;
-}
-
-static void vmi_timer_set_mode(enum clock_event_mode mode,
-                              struct clock_event_device *evt)
-{
-       cycle_t now, cycles_per_hz;
-       BUG_ON(!irqs_disabled());
-
-       switch (mode) {
-       case CLOCK_EVT_MODE_ONESHOT:
-       case CLOCK_EVT_MODE_RESUME:
-               break;
-       case CLOCK_EVT_MODE_PERIODIC:
-               cycles_per_hz = vmi_timer_ops.get_cycle_frequency();
-               (void)do_div(cycles_per_hz, HZ);
-               now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_PERIODIC));
-               vmi_timer_ops.set_alarm(VMI_PERIODIC, now, cycles_per_hz);
-               break;
-       case CLOCK_EVT_MODE_UNUSED:
-       case CLOCK_EVT_MODE_SHUTDOWN:
-               switch (evt->mode) {
-               case CLOCK_EVT_MODE_ONESHOT:
-                       vmi_timer_ops.cancel_alarm(VMI_ONESHOT);
-                       break;
-               case CLOCK_EVT_MODE_PERIODIC:
-                       vmi_timer_ops.cancel_alarm(VMI_PERIODIC);
-                       break;
-               default:
-                       break;
-               }
-               break;
-       default:
-               break;
-       }
-}
-
-static int vmi_timer_next_event(unsigned long delta,
-                               struct clock_event_device *evt)
-{
-       /* Unfortunately, set_next_event interface only passes relative
-        * expiry, but we want absolute expiry.  It'd be better if were
-        * were passed an absolute expiry, since a bunch of time may
-        * have been stolen between the time the delta is computed and
-        * when we set the alarm below. */
-       cycle_t now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_ONESHOT));
-
-       BUG_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
-       vmi_timer_ops.set_alarm(VMI_ONESHOT, now + delta, 0);
-       return 0;
-}
-
-static struct clock_event_device vmi_clockevent = {
-       .name           = "vmi-timer",
-       .features       = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
-       .shift          = 22,
-       .set_mode       = vmi_timer_set_mode,
-       .set_next_event = vmi_timer_next_event,
-       .rating         = 1000,
-       .irq            = 0,
-};
-
-static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id)
-{
-       struct clock_event_device *evt = &__get_cpu_var(local_events);
-       evt->event_handler(evt);
-       return IRQ_HANDLED;
-}
-
-static struct irqaction vmi_clock_action  = {
-       .name           = "vmi-timer",
-       .handler        = vmi_timer_interrupt,
-       .flags          = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TIMER,
-};
-
-static void __devinit vmi_time_init_clockevent(void)
-{
-       cycle_t cycles_per_msec;
-       struct clock_event_device *evt;
-
-       int cpu = smp_processor_id();
-       evt = &__get_cpu_var(local_events);
-
-       /* Use cycles_per_msec since div_sc params are 32-bits. */
-       cycles_per_msec = vmi_timer_ops.get_cycle_frequency();
-       (void)do_div(cycles_per_msec, 1000);
-
-       memcpy(evt, &vmi_clockevent, sizeof(*evt));
-       /* Must pick .shift such that .mult fits in 32-bits.  Choosing
-        * .shift to be 22 allows 2^(32-22) cycles per nano-seconds
-        * before overflow. */
-       evt->mult = div_sc(cycles_per_msec, NSEC_PER_MSEC, evt->shift);
-       /* Upper bound is clockevent's use of ulong for cycle deltas. */
-       evt->max_delta_ns = clockevent_delta2ns(ULONG_MAX, evt);
-       evt->min_delta_ns = clockevent_delta2ns(1, evt);
-       evt->cpumask = cpumask_of(cpu);
-
-       printk(KERN_WARNING "vmi: registering clock event %s. mult=%u shift=%u\n",
-              evt->name, evt->mult, evt->shift);
-       clockevents_register_device(evt);
-}
-
-void __init vmi_time_init(void)
-{
-       unsigned int cpu;
-       /* Disable PIT: BIOSes start PIT CH0 with 18.2hz peridic. */
-       outb_pit(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */
-
-       vmi_time_init_clockevent();
-       setup_irq(0, &vmi_clock_action);
-       for_each_possible_cpu(cpu)
-               per_cpu(vector_irq, cpu)[vmi_get_timer_vector()] = 0;
-}
-
-#ifdef CONFIG_X86_LOCAL_APIC
-void __devinit vmi_time_bsp_init(void)
-{
-       /*
-        * On APIC systems, we want local timers to fire on each cpu.  We do
-        * this by programming LVTT to deliver timer events to the IRQ handler
-        * for IRQ-0, since we can't re-use the APIC local timer handler
-        * without interfering with that code.
-        */
-       clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
-       local_irq_disable();
-#ifdef CONFIG_SMP
-       /*
-        * XXX handle_percpu_irq only defined for SMP; we need to switch over
-        * to using it, since this is a local interrupt, which each CPU must
-        * handle individually without locking out or dropping simultaneous
-        * local timers on other CPUs.  We also don't want to trigger the
-        * quirk workaround code for interrupts which gets invoked from
-        * handle_percpu_irq via eoi, so we use our own IRQ chip.
-        */
-       set_irq_chip_and_handler_name(0, &vmi_chip, handle_percpu_irq, "lvtt");
-#else
-       set_irq_chip_and_handler_name(0, &vmi_chip, handle_edge_irq, "lvtt");
-#endif
-       vmi_wiring = VMI_ALARM_WIRED_LVTT;
-       apic_write(APIC_LVTT, vmi_get_timer_vector());
-       local_irq_enable();
-       clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
-}
-
-void __devinit vmi_time_ap_init(void)
-{
-       vmi_time_init_clockevent();
-       apic_write(APIC_LVTT, vmi_get_timer_vector());
-}
-#endif
-
-/** vmi clocksource */
-static struct clocksource clocksource_vmi;
-
-static cycle_t read_real_cycles(struct clocksource *cs)
-{
-       cycle_t ret = (cycle_t)vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL);
-       return max(ret, clocksource_vmi.cycle_last);
-}
-
-static struct clocksource clocksource_vmi = {
-       .name                   = "vmi-timer",
-       .rating                 = 450,
-       .read                   = read_real_cycles,
-       .mask                   = CLOCKSOURCE_MASK(64),
-       .mult                   = 0, /* to be set */
-       .shift                  = 22,
-       .flags                  = CLOCK_SOURCE_IS_CONTINUOUS,
-};
-
-static int __init init_vmi_clocksource(void)
-{
-       cycle_t cycles_per_msec;
-
-       if (!vmi_timer_ops.get_cycle_frequency)
-               return 0;
-       /* Use khz2mult rather than hz2mult since hz arg is only 32-bits. */
-       cycles_per_msec = vmi_timer_ops.get_cycle_frequency();
-       (void)do_div(cycles_per_msec, 1000);
-
-       /* Note that clocksource.{mult, shift} converts in the opposite direction
-        * as clockevents.  */
-       clocksource_vmi.mult = clocksource_khz2mult(cycles_per_msec,
-                                                   clocksource_vmi.shift);
-
-       printk(KERN_WARNING "vmi: registering clock source khz=%lld\n", cycles_per_msec);
-       return clocksource_register(&clocksource_vmi);
-
-}
-module_init(init_vmi_clocksource);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c

index 77d8c0f..22b06f7 100644 (file)
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1056,14 +1056,13 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
  
         vcpu->arch.apic = apic;
  
-       apic->regs_page = alloc_page(GFP_KERNEL);
+       apic->regs_page = alloc_page(GFP_KERNEL|__GFP_ZERO);
         if (apic->regs_page == NULL) {
                 printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
                        vcpu->vcpu_id);
                 goto nomem_free_apic;
         }
         apic->regs = page_address(apic->regs_page);
-       memset(apic->regs, 0, PAGE_SIZE);
         apic->vcpu = vcpu;
  
         hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c

index 3a09c62..6c2ecf0 100644 (file)
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1991,13 +1991,14 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
                 0 /* Reserved, DCA */ | F(XMM4_1) |
                 F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
-               0 /* Reserved, AES */ | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX);
+               0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
+               F(F16C);
         /* cpuid 0x80000001.ecx */
         const u32 kvm_supported_word6_x86_features =
                 F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ |
                 F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
-               F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) |
-               0 /* SKINIT */ | 0 /* WDT */;
+               F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) |
+               0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
  
         /* all calls to cpuid_count() should be made on the same cpu */
         get_cpu();
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c

index 9d5f558..73b1e1a 100644 (file)
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -791,22 +791,22 @@ static void lguest_flush_tlb_kernel(void)
   * simple as setting a bit.  We don't actually "ack" interrupts as such, we
   * just mask and unmask them.  I wonder if we should be cleverer?
   */
-static void disable_lguest_irq(unsigned int irq)
+static void disable_lguest_irq(struct irq_data *data)
  {
-       set_bit(irq, lguest_data.blocked_interrupts);
+       set_bit(data->irq, lguest_data.blocked_interrupts);
  }
  
-static void enable_lguest_irq(unsigned int irq)
+static void enable_lguest_irq(struct irq_data *data)
  {
-       clear_bit(irq, lguest_data.blocked_interrupts);
+       clear_bit(data->irq, lguest_data.blocked_interrupts);
  }
  
  /* This structure describes the lguest IRQ controller. */
  static struct irq_chip lguest_irq_controller = {
         .name           = "lguest",
-       .mask           = disable_lguest_irq,
-       .mask_ack       = disable_lguest_irq,
-       .unmask         = enable_lguest_irq,
+       .irq_mask       = disable_lguest_irq,
+       .irq_mask_ack   = disable_lguest_irq,
+       .irq_unmask     = enable_lguest_irq,
  };
  
  /*
@@ -838,12 +838,12 @@ static void __init lguest_init_IRQ(void)
   * rather than set them in lguest_init_IRQ we are called here every time an
   * lguest device needs an interrupt.
   *
- * FIXME: irq_to_desc_alloc_node() can fail due to lack of memory, we should
+ * FIXME: irq_alloc_desc_at() can fail due to lack of memory, we should
   * pass that up!
   */
  void lguest_setup_irq(unsigned int irq)
  {
-       irq_to_desc_alloc_node(irq, 0);
+       irq_alloc_desc_at(irq, 0);
         set_irq_chip_and_handler_name(irq, &lguest_irq_controller,
                                       handle_level_irq, "level");
  }
diff --git a/arch/x86/lib/memcpy_32.c b/arch/x86/lib/memcpy_32.c

index 5415a9d..b908a59 100644 (file)
--- a/arch/x86/lib/memcpy_32.c
+++ b/arch/x86/lib/memcpy_32.c
@@ -22,22 +22,187 @@ EXPORT_SYMBOL(memset);
  
  void *memmove(void *dest, const void *src, size_t n)
  {
-       int d0, d1, d2;
-
-       if (dest < src) {
-               memcpy(dest, src, n);
-       } else {
-               __asm__ __volatile__(
-                       "std\n\t"
-                       "rep\n\t"
-                       "movsb\n\t"
-                       "cld"
-                       : "=&c" (d0), "=&S" (d1), "=&D" (d2)
-                       :"0" (n),
-                        "1" (n-1+src),
-                        "2" (n-1+dest)
-                       :"memory");
-       }
-       return dest;
+       int d0,d1,d2,d3,d4,d5;
+       char *ret = dest;
+
+       __asm__ __volatile__(
+               /* Handle 16 bytes or more in the loop */
+               "cmp $0x10, %0\n\t"
+               "jb     1f\n\t"
+
+               /* Decide forward/backward copy mode */
+               "cmp %2, %1\n\t"
+               "jb     2f\n\t"
+
+               /*
+                * The movs instruction has a high startup latency,
+                * so we handle small sizes with general registers.
+                */
+               "cmp  $680, %0\n\t"
+               "jb 3f\n\t"
+               /*
+                * The movs instruction is only good for the aligned case.
+                */
+               "mov %1, %3\n\t"
+               "xor %2, %3\n\t"
+               "and $0xff, %3\n\t"
+               "jz 4f\n\t"
+               "3:\n\t"
+               "sub $0x10, %0\n\t"
+
+               /*
+                * We gobble 16 bytes forward in each loop iteration.
+                */
+               "3:\n\t"
+               "sub $0x10, %0\n\t"
+               "mov 0*4(%1), %3\n\t"
+               "mov 1*4(%1), %4\n\t"
+               "mov  %3, 0*4(%2)\n\t"
+               "mov  %4, 1*4(%2)\n\t"
+               "mov 2*4(%1), %3\n\t"
+               "mov 3*4(%1), %4\n\t"
+               "mov  %3, 2*4(%2)\n\t"
+               "mov  %4, 3*4(%2)\n\t"
+               "lea  0x10(%1), %1\n\t"
+               "lea  0x10(%2), %2\n\t"
+               "jae 3b\n\t"
+               "add $0x10, %0\n\t"
+               "jmp 1f\n\t"
+
+               /*
+                * Handle data forward by movs.
+                */
+               ".p2align 4\n\t"
+               "4:\n\t"
+               "mov -4(%1, %0), %3\n\t"
+               "lea -4(%2, %0), %4\n\t"
+               "shr $2, %0\n\t"
+               "rep movsl\n\t"
+               "mov %3, (%4)\n\t"
+               "jmp 11f\n\t"
+               /*
+                * Handle data backward by movs.
+                */
+               ".p2align 4\n\t"
+               "6:\n\t"
+               "mov (%1), %3\n\t"
+               "mov %2, %4\n\t"
+               "lea -4(%1, %0), %1\n\t"
+               "lea -4(%2, %0), %2\n\t"
+               "shr $2, %0\n\t"
+               "std\n\t"
+               "rep movsl\n\t"
+               "mov %3,(%4)\n\t"
+               "cld\n\t"
+               "jmp 11f\n\t"
+
+               /*
+                * Start to prepare for backward copy.
+                */
+               ".p2align 4\n\t"
+               "2:\n\t"
+               "cmp  $680, %0\n\t"
+               "jb 5f\n\t"
+               "mov %1, %3\n\t"
+               "xor %2, %3\n\t"
+               "and $0xff, %3\n\t"
+               "jz 6b\n\t"
+
+               /*
+                * Calculate copy position to tail.
+                */
+               "5:\n\t"
+               "add %0, %1\n\t"
+               "add %0, %2\n\t"
+               "sub $0x10, %0\n\t"
+
+               /*
+                * We gobble 16 bytes backward in each loop iteration.
+                */
+               "7:\n\t"
+               "sub $0x10, %0\n\t"
+
+               "mov -1*4(%1), %3\n\t"
+               "mov -2*4(%1), %4\n\t"
+               "mov  %3, -1*4(%2)\n\t"
+               "mov  %4, -2*4(%2)\n\t"
+               "mov -3*4(%1), %3\n\t"
+               "mov -4*4(%1), %4\n\t"
+               "mov  %3, -3*4(%2)\n\t"
+               "mov  %4, -4*4(%2)\n\t"
+               "lea  -0x10(%1), %1\n\t"
+               "lea  -0x10(%2), %2\n\t"
+               "jae 7b\n\t"
+               /*
+                * Calculate copy position to head.
+                */
+               "add $0x10, %0\n\t"
+               "sub %0, %1\n\t"
+               "sub %0, %2\n\t"
+
+               /*
+                * Move data from 8 bytes to 15 bytes.
+                */
+               ".p2align 4\n\t"
+               "1:\n\t"
+               "cmp $8, %0\n\t"
+               "jb 8f\n\t"
+               "mov 0*4(%1), %3\n\t"
+               "mov 1*4(%1), %4\n\t"
+               "mov -2*4(%1, %0), %5\n\t"
+               "mov -1*4(%1, %0), %1\n\t"
+
+               "mov  %3, 0*4(%2)\n\t"
+               "mov  %4, 1*4(%2)\n\t"
+               "mov  %5, -2*4(%2, %0)\n\t"
+               "mov  %1, -1*4(%2, %0)\n\t"
+               "jmp 11f\n\t"
+
+               /*
+                * Move data from 4 bytes to 7 bytes.
+                */
+               ".p2align 4\n\t"
+               "8:\n\t"
+               "cmp $4, %0\n\t"
+               "jb 9f\n\t"
+               "mov 0*4(%1), %3\n\t"
+               "mov -1*4(%1, %0), %4\n\t"
+               "mov  %3, 0*4(%2)\n\t"
+               "mov  %4, -1*4(%2, %0)\n\t"
+               "jmp 11f\n\t"
+
+               /*
+                * Move data from 2 bytes to 3 bytes.
+                */
+               ".p2align 4\n\t"
+               "9:\n\t"
+               "cmp $2, %0\n\t"
+               "jb 10f\n\t"
+               "movw 0*2(%1), %%dx\n\t"
+               "movw -1*2(%1, %0), %%bx\n\t"
+               "movw %%dx, 0*2(%2)\n\t"
+               "movw %%bx, -1*2(%2, %0)\n\t"
+               "jmp 11f\n\t"
+
+               /*
+                * Move data for 1 byte.
+                */
+               ".p2align 4\n\t"
+               "10:\n\t"
+               "cmp $1, %0\n\t"
+               "jb 11f\n\t"
+               "movb (%1), %%cl\n\t"
+               "movb %%cl, (%2)\n\t"
+               ".p2align 4\n\t"
+               "11:"
+               : "=&c" (d0), "=&S" (d1), "=&D" (d2),
+                 "=r" (d3),"=r" (d4), "=r"(d5)
+               :"0" (n),
+                "1" (src),
+                "2" (dest)
+               :"memory");
+
+       return ret;
+
  }
  EXPORT_SYMBOL(memmove);
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S

index bcbcd1e..75ef61e 100644 (file)
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -40,84 +40,132 @@
  ENTRY(__memcpy)
  ENTRY(memcpy)
         CFI_STARTPROC
+       movq %rdi, %rax
  
         /*
-        * Put the number of full 64-byte blocks into %ecx.
-        * Tail portion is handled at the end:
+        * Use 32bit CMP here to avoid long NOP padding.
          */
-       movq %rdi, %rax
-       movl %edx, %ecx
-       shrl   $6, %ecx
-       jz .Lhandle_tail
+       cmp  $0x20, %edx
+       jb .Lhandle_tail
  
-       .p2align 4
-.Lloop_64:
         /*
-        * We decrement the loop index here - and the zero-flag is
-        * checked at the end of the loop (instructions inbetween do
-        * not change the zero flag):
+        * We check whether memory false dependence could occur,
+        * then jump to corresponding copy mode.
          */
-       decl %ecx
+       cmp  %dil, %sil
+       jl .Lcopy_backward
+       subl $0x20, %edx
+.Lcopy_forward_loop:
+       subq $0x20,     %rdx
  
         /*
-        * Move in blocks of 4x16 bytes:
+        * Move in blocks of 4x8 bytes:
          */
-       movq 0*8(%rsi),         %r11
-       movq 1*8(%rsi),         %r8
-       movq %r11,              0*8(%rdi)
-       movq %r8,               1*8(%rdi)
-
-       movq 2*8(%rsi),         %r9
-       movq 3*8(%rsi),         %r10
-       movq %r9,               2*8(%rdi)
-       movq %r10,              3*8(%rdi)
-
-       movq 4*8(%rsi),         %r11
-       movq 5*8(%rsi),         %r8
-       movq %r11,              4*8(%rdi)
-       movq %r8,               5*8(%rdi)
-
-       movq 6*8(%rsi),         %r9
-       movq 7*8(%rsi),         %r10
-       movq %r9,               6*8(%rdi)
-       movq %r10,              7*8(%rdi)
-
-       leaq 64(%rsi), %rsi
-       leaq 64(%rdi), %rdi
-
-       jnz  .Lloop_64
+       movq 0*8(%rsi), %r8
+       movq 1*8(%rsi), %r9
+       movq 2*8(%rsi), %r10
+       movq 3*8(%rsi), %r11
+       leaq 4*8(%rsi), %rsi
+
+       movq %r8,       0*8(%rdi)
+       movq %r9,       1*8(%rdi)
+       movq %r10,      2*8(%rdi)
+       movq %r11,      3*8(%rdi)
+       leaq 4*8(%rdi), %rdi
+       jae  .Lcopy_forward_loop
+       addq $0x20,     %rdx
+       jmp  .Lhandle_tail
+
+.Lcopy_backward:
+       /*
+        * Calculate copy position to tail.
+        */
+       addq %rdx,      %rsi
+       addq %rdx,      %rdi
+       subq $0x20,     %rdx
+       /*
+        * At most 3 ALU operations in one cycle,
+        * so append NOPs in the same 16-byte chunk.
+        */
+       .p2align 4
+.Lcopy_backward_loop:
+       subq $0x20,     %rdx
+       movq -1*8(%rsi),        %r8
+       movq -2*8(%rsi),        %r9
+       movq -3*8(%rsi),        %r10
+       movq -4*8(%rsi),        %r11
+       leaq -4*8(%rsi),        %rsi
+       movq %r8,               -1*8(%rdi)
+       movq %r9,               -2*8(%rdi)
+       movq %r10,              -3*8(%rdi)
+       movq %r11,              -4*8(%rdi)
+       leaq -4*8(%rdi),        %rdi
+       jae  .Lcopy_backward_loop
  
+       /*
+        * Calculate copy position to head.
+        */
+       addq $0x20,     %rdx
+       subq %rdx,      %rsi
+       subq %rdx,      %rdi
  .Lhandle_tail:
-       movl %edx, %ecx
-       andl  $63, %ecx
-       shrl   $3, %ecx
-       jz   .Lhandle_7
+       cmpq $16,       %rdx
+       jb   .Lless_16bytes
  
+       /*
+        * Move data from 16 bytes to 31 bytes.
+        */
+       movq 0*8(%rsi), %r8
+       movq 1*8(%rsi), %r9
+       movq -2*8(%rsi, %rdx),  %r10
+       movq -1*8(%rsi, %rdx),  %r11
+       movq %r8,       0*8(%rdi)
+       movq %r9,       1*8(%rdi)
+       movq %r10,      -2*8(%rdi, %rdx)
+       movq %r11,      -1*8(%rdi, %rdx)
+       retq
         .p2align 4
-.Lloop_8:
-       decl %ecx
-       movq (%rsi),            %r8
-       movq %r8,               (%rdi)
-       leaq 8(%rdi),           %rdi
-       leaq 8(%rsi),           %rsi
-       jnz  .Lloop_8
-
-.Lhandle_7:
-       movl %edx, %ecx
-       andl $7, %ecx
-       jz .Lend
+.Lless_16bytes:
+       cmpq $8,        %rdx
+       jb   .Lless_8bytes
+       /*
+        * Move data from 8 bytes to 15 bytes.
+        */
+       movq 0*8(%rsi), %r8
+       movq -1*8(%rsi, %rdx),  %r9
+       movq %r8,       0*8(%rdi)
+       movq %r9,       -1*8(%rdi, %rdx)
+       retq
+       .p2align 4
+.Lless_8bytes:
+       cmpq $4,        %rdx
+       jb   .Lless_3bytes
  
+       /*
+        * Move data from 4 bytes to 7 bytes.
+        */
+       movl (%rsi), %ecx
+       movl -4(%rsi, %rdx), %r8d
+       movl %ecx, (%rdi)
+       movl %r8d, -4(%rdi, %rdx)
+       retq
         .p2align 4
+.Lless_3bytes:
+       cmpl $0, %edx
+       je .Lend
+       /*
+        * Move data from 1 bytes to 3 bytes.
+        */
  .Lloop_1:
         movb (%rsi), %r8b
         movb %r8b, (%rdi)
         incq %rdi
         incq %rsi
-       decl %ecx
+       decl %edx
         jnz .Lloop_1
  
  .Lend:
-       ret
+       retq
         CFI_ENDPROC
  ENDPROC(memcpy)
  ENDPROC(__memcpy)
diff --git a/arch/x86/lib/memmove_64.c b/arch/x86/lib/memmove_64.c

index 0a33909..6d0f0ec 100644 (file)
--- a/arch/x86/lib/memmove_64.c
+++ b/arch/x86/lib/memmove_64.c
@@ -8,14 +8,185 @@
  #undef memmove
  void *memmove(void *dest, const void *src, size_t count)
  {
-       if (dest < src) {
-               return memcpy(dest, src, count);
-       } else {
-               char *p = dest + count;
-               const char *s = src + count;
-               while (count--)
-                       *--p = *--s;
-       }
-       return dest;
+       unsigned long d0,d1,d2,d3,d4,d5,d6,d7;
+       char *ret;
+
+       __asm__ __volatile__(
+               /* Handle more 32bytes in loop */
+               "mov %2, %3\n\t"
+               "cmp $0x20, %0\n\t"
+               "jb     1f\n\t"
+
+               /* Decide forward/backward copy mode */
+               "cmp %2, %1\n\t"
+               "jb     2f\n\t"
+
+               /*
+                * movsq instruction have many startup latency
+                * so we handle small size by general register.
+                */
+               "cmp  $680, %0\n\t"
+               "jb 3f\n\t"
+               /*
+                * movsq instruction is only good for aligned case.
+                */
+               "cmpb %%dil, %%sil\n\t"
+               "je 4f\n\t"
+               "3:\n\t"
+               "sub $0x20, %0\n\t"
+               /*
+                * We gobble 32byts forward in each loop.
+                */
+               "5:\n\t"
+               "sub $0x20, %0\n\t"
+               "movq 0*8(%1), %4\n\t"
+               "movq 1*8(%1), %5\n\t"
+               "movq 2*8(%1), %6\n\t"
+               "movq 3*8(%1), %7\n\t"
+               "leaq 4*8(%1), %1\n\t"
+
+               "movq %4, 0*8(%2)\n\t"
+               "movq %5, 1*8(%2)\n\t"
+               "movq %6, 2*8(%2)\n\t"
+               "movq %7, 3*8(%2)\n\t"
+               "leaq 4*8(%2), %2\n\t"
+               "jae 5b\n\t"
+               "addq $0x20, %0\n\t"
+               "jmp 1f\n\t"
+               /*
+                * Handle data forward by movsq.
+                */
+               ".p2align 4\n\t"
+               "4:\n\t"
+               "movq %0, %8\n\t"
+               "movq -8(%1, %0), %4\n\t"
+               "lea -8(%2, %0), %5\n\t"
+               "shrq $3, %8\n\t"
+               "rep movsq\n\t"
+               "movq %4, (%5)\n\t"
+               "jmp 13f\n\t"
+               /*
+                * Handle data backward by movsq.
+                */
+               ".p2align 4\n\t"
+               "7:\n\t"
+               "movq %0, %8\n\t"
+               "movq (%1), %4\n\t"
+               "movq %2, %5\n\t"
+               "leaq -8(%1, %0), %1\n\t"
+               "leaq -8(%2, %0), %2\n\t"
+               "shrq $3, %8\n\t"
+               "std\n\t"
+               "rep movsq\n\t"
+               "cld\n\t"
+               "movq %4, (%5)\n\t"
+               "jmp 13f\n\t"
+
+               /*
+                * Start to prepare for backward copy.
+                */
+               ".p2align 4\n\t"
+               "2:\n\t"
+               "cmp $680, %0\n\t"
+               "jb 6f \n\t"
+               "cmp %%dil, %%sil\n\t"
+               "je 7b \n\t"
+               "6:\n\t"
+               /*
+                * Calculate copy position to tail.
+                */
+               "addq %0, %1\n\t"
+               "addq %0, %2\n\t"
+               "subq $0x20, %0\n\t"
+               /*
+                * We gobble 32byts backward in each loop.
+                */
+               "8:\n\t"
+               "subq $0x20, %0\n\t"
+               "movq -1*8(%1), %4\n\t"
+               "movq -2*8(%1), %5\n\t"
+               "movq -3*8(%1), %6\n\t"
+               "movq -4*8(%1), %7\n\t"
+               "leaq -4*8(%1), %1\n\t"
+
+               "movq %4, -1*8(%2)\n\t"
+               "movq %5, -2*8(%2)\n\t"
+               "movq %6, -3*8(%2)\n\t"
+               "movq %7, -4*8(%2)\n\t"
+               "leaq -4*8(%2), %2\n\t"
+               "jae 8b\n\t"
+               /*
+                * Calculate copy position to head.
+                */
+               "addq $0x20, %0\n\t"
+               "subq %0, %1\n\t"
+               "subq %0, %2\n\t"
+               "1:\n\t"
+               "cmpq $16, %0\n\t"
+               "jb 9f\n\t"
+               /*
+                * Move data from 16 bytes to 31 bytes.
+                */
+               "movq 0*8(%1), %4\n\t"
+               "movq 1*8(%1), %5\n\t"
+               "movq -2*8(%1, %0), %6\n\t"
+               "movq -1*8(%1, %0), %7\n\t"
+               "movq %4, 0*8(%2)\n\t"
+               "movq %5, 1*8(%2)\n\t"
+               "movq %6, -2*8(%2, %0)\n\t"
+               "movq %7, -1*8(%2, %0)\n\t"
+               "jmp 13f\n\t"
+               ".p2align 4\n\t"
+               "9:\n\t"
+               "cmpq $8, %0\n\t"
+               "jb 10f\n\t"
+               /*
+                * Move data from 8 bytes to 15 bytes.
+                */
+               "movq 0*8(%1), %4\n\t"
+               "movq -1*8(%1, %0), %5\n\t"
+               "movq %4, 0*8(%2)\n\t"
+               "movq %5, -1*8(%2, %0)\n\t"
+               "jmp 13f\n\t"
+               "10:\n\t"
+               "cmpq $4, %0\n\t"
+               "jb 11f\n\t"
+               /*
+                * Move data from 4 bytes to 7 bytes.
+                */
+               "movl (%1), %4d\n\t"
+               "movl -4(%1, %0), %5d\n\t"
+               "movl %4d, (%2)\n\t"
+               "movl %5d, -4(%2, %0)\n\t"
+               "jmp 13f\n\t"
+               "11:\n\t"
+               "cmp $2, %0\n\t"
+               "jb 12f\n\t"
+               /*
+                * Move data from 2 bytes to 3 bytes.
+                */
+               "movw (%1), %4w\n\t"
+               "movw -2(%1, %0), %5w\n\t"
+               "movw %4w, (%2)\n\t"
+               "movw %5w, -2(%2, %0)\n\t"
+               "jmp 13f\n\t"
+               "12:\n\t"
+               "cmp $1, %0\n\t"
+               "jb 13f\n\t"
+               /*
+                * Move data for 1 byte.
+                */
+               "movb (%1), %4b\n\t"
+               "movb %4b, (%2)\n\t"
+               "13:\n\t"
+               : "=&d" (d0), "=&S" (d1), "=&D" (d2), "=&a" (ret) ,
+                 "=r"(d3), "=r"(d4), "=r"(d5), "=r"(d6), "=&c" (d7)
+               :"0" (count),
+                "1" (src),
+                "2" (dest)
+               :"memory");
+
+               return ret;
+
  }
  EXPORT_SYMBOL(memmove);
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c

index a24c6cf..79b0b37 100644 (file)
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -229,7 +229,16 @@ void vmalloc_sync_all(void)
  
                 spin_lock_irqsave(&pgd_lock, flags);
                 list_for_each_entry(page, &pgd_list, lru) {
-                       if (!vmalloc_sync_one(page_address(page), address))
+                       spinlock_t *pgt_lock;
+                       pmd_t *ret;
+
+                       pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
+
+                       spin_lock(pgt_lock);
+                       ret = vmalloc_sync_one(page_address(page), address);
+                       spin_unlock(pgt_lock);
+
+                       if (!ret)
                                 break;
                 }
                 spin_unlock_irqrestore(&pgd_lock, flags);
@@ -328,29 +337,7 @@ out:
  
  void vmalloc_sync_all(void)
  {
-       unsigned long address;
-
-       for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
-            address += PGDIR_SIZE) {
-
-               const pgd_t *pgd_ref = pgd_offset_k(address);
-               unsigned long flags;
-               struct page *page;
-
-               if (pgd_none(*pgd_ref))
-                       continue;
-
-               spin_lock_irqsave(&pgd_lock, flags);
-               list_for_each_entry(page, &pgd_list, lru) {
-                       pgd_t *pgd;
-                       pgd = (pgd_t *)page_address(page) + pgd_index(address);
-                       if (pgd_none(*pgd))
-                               set_pgd(pgd, *pgd_ref);
-                       else
-                               BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
-               }
-               spin_unlock_irqrestore(&pgd_lock, flags);
-       }
+       sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END);
  }
  
  /*
@@ -898,8 +885,14 @@ spurious_fault(unsigned long error_code, unsigned long address)
         if (pmd_large(*pmd))
                 return spurious_fault_check(error_code, (pte_t *) pmd);
  
+       /*
+        * Note: don't use pte_present() here, since it returns true
+        * if the _PAGE_PROTNONE bit is set.  However, this aliases the
+        * _PAGE_GLOBAL bit, which for kernel pages gives false positives
+        * when CONFIG_DEBUG_PAGEALLOC is used.
+        */
         pte = pte_offset_kernel(pmd, address);
-       if (!pte_present(*pte))
+       if (!(pte_flags(*pte) & _PAGE_PRESENT))
                 return 0;
  
         ret = spurious_fault_check(error_code, pte);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c

index bca7909..558f2d3 100644 (file)
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -67,7 +67,7 @@ static __init void *alloc_low_page(void)
                 panic("alloc_low_page: ran out of memory");
  
         adr = __va(pfn * PAGE_SIZE);
-       memset(adr, 0, PAGE_SIZE);
+       clear_page(adr);
         return adr;
  }
  
@@ -558,7 +558,7 @@ char swsusp_pg_dir[PAGE_SIZE]
  
  static inline void save_pg_dir(void)
  {
-       memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE);
+       copy_page(swsusp_pg_dir, swapper_pg_dir);
  }
  #else /* !CONFIG_ACPI_SLEEP */
  static inline void save_pg_dir(void)
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c

index 9a66746..c55f900 100644 (file)
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -97,6 +97,43 @@ static int __init nonx32_setup(char *str)
  }
  __setup("noexec32=", nonx32_setup);
  
+/*
+ * When memory was added/removed make sure all the processes MM have
+ * suitable PGD entries in the local PGD level page.
+ *
+ * Every PGD slot that overlaps [start, end] is visited.  For each slot
+ * that is populated in the kernel reference page table, the entry is
+ * copied into every PGD page on pgd_list where it is still empty; a
+ * populated entry that points somewhere else triggers BUG_ON().
+ *
+ * @start: first address of the range; it need not be PGDIR_SIZE aligned
+ * @end:   last address of the range (inclusive)
+ */
+void sync_global_pgds(unsigned long start, unsigned long end)
+{
+       /*
+        * Walk on PGD boundaries.  Stepping by PGDIR_SIZE from an
+        * unaligned start could step past 'end' before reaching the
+        * slot that contains it.
+        */
+       unsigned long first = start & PGDIR_MASK;
+       unsigned long address;
+
+       /* 'address >= first' stops the walk if the increment wraps. */
+       for (address = first; address <= end && address >= first;
+            address += PGDIR_SIZE) {
+               const pgd_t *pgd_ref = pgd_offset_k(address);
+               unsigned long flags;
+               struct page *page;
+
+               /* Nothing to propagate for an empty reference entry. */
+               if (pgd_none(*pgd_ref))
+                       continue;
+
+               spin_lock_irqsave(&pgd_lock, flags);
+               list_for_each_entry(page, &pgd_list, lru) {
+                       pgd_t *pgd;
+                       spinlock_t *pgt_lock;
+
+                       pgd = (pgd_t *)page_address(page) + pgd_index(address);
+                       /* Take the page table lock of the mm owning this PGD. */
+                       pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
+                       spin_lock(pgt_lock);
+
+                       if (pgd_none(*pgd))
+                               set_pgd(pgd, *pgd_ref);
+                       else
+                               BUG_ON(pgd_page_vaddr(*pgd)
+                                      != pgd_page_vaddr(*pgd_ref));
+
+                       spin_unlock(pgt_lock);
+               }
+               spin_unlock_irqrestore(&pgd_lock, flags);
+       }
+}
+
  /*
   * NOTE: This function is marked __ref because it calls __init function
   * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
@@ -293,7 +330,7 @@ static __ref void *alloc_low_page(unsigned long *phys)
                 panic("alloc_low_page: ran out of memory");
  
         adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
-       memset(adr, 0, PAGE_SIZE);
+       clear_page(adr);
         *phys  = pfn * PAGE_SIZE;
         return adr;
  }
@@ -534,11 +571,13 @@ kernel_physical_mapping_init(unsigned long start,
                              unsigned long end,
                              unsigned long page_size_mask)
  {
-
+       bool pgd_changed = false;
         unsigned long next, last_map_addr = end;
+       unsigned long addr;
  
         start = (unsigned long)__va(start);
         end = (unsigned long)__va(end);
+       addr = start;
  
         for (; start < end; start = next) {
                 pgd_t *pgd = pgd_offset_k(start);
@@ -563,7 +602,12 @@ kernel_physical_mapping_init(unsigned long start,
                 spin_lock(&init_mm.page_table_lock);
                 pgd_populate(&init_mm, pgd, __va(pud_phys));
                 spin_unlock(&init_mm.page_table_lock);
+               pgd_changed = true;
         }
+
+       if (pgd_changed)
+               sync_global_pgds(addr, end);
+
         __flush_tlb_all();
  
         return last_map_addr;
@@ -1003,6 +1047,7 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
                 }
  
         }
+       sync_global_pgds((unsigned long)start_page, end);
         return 0;
  }
  
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c

index 970ed57..52d54bf 100644 (file)
--- a/arch/x86/mm/k8topology_64.c
+++ b/arch/x86/mm/k8topology_64.c
@@ -22,7 +22,7 @@
  #include <asm/numa.h>
  #include <asm/mpspec.h>
  #include <asm/apic.h>
-#include <asm/k8.h>
+#include <asm/amd_nb.h>
  
  static struct bootnode __initdata nodes[8];
  static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE;
@@ -54,8 +54,8 @@ static __init int find_northbridge(void)
  static __init void early_get_boot_cpu_id(void)
  {
         /*
-        * need to get boot_cpu_id so can use that to create apicid_to_node
-        * in k8_scan_nodes()
+        * need to get the APIC ID of the BSP so can use that to
+        * create apicid_to_node in k8_scan_nodes()
          */
  #ifdef CONFIG_X86_MPPARSE
         /*
@@ -212,7 +212,7 @@ int __init k8_scan_nodes(void)
         bits = boot_cpu_data.x86_coreid_bits;
         cores = (1<<bits);
         apicid_base = 0;
-       /* need to get boot_cpu_id early for system with apicid lifting */
+       /* get the APIC ID of the BSP early for systems with apicid lifting */
         early_get_boot_cpu_id();
         if (boot_cpu_physical_apicid > 0) {
                 pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid);
diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c

index 63c19e2..324aa3f 100644 (file)
--- a/arch/x86/mm/kmemcheck/opcode.c
+++ b/arch/x86/mm/kmemcheck/opcode.c
@@ -9,7 +9,7 @@ static bool opcode_is_prefix(uint8_t b)
                 b == 0xf0 || b == 0xf2 || b == 0xf3
                 /* Group 2 */
                 || b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26
-               || b == 0x64 || b == 0x65 || b == 0x2e || b == 0x3e
+               || b == 0x64 || b == 0x65
                 /* Group 3 */
                 || b == 0x66
                 /* Group 4 */
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c

index a7bcc23..4962f1a 100644 (file)
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -18,7 +18,7 @@
  #include <asm/dma.h>
  #include <asm/numa.h>
  #include <asm/acpi.h>
-#include <asm/k8.h>
+#include <asm/amd_nb.h>
  
  struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
  EXPORT_SYMBOL(node_data);
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c

index 5c4ee42..8be8c7d 100644 (file)
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -87,7 +87,19 @@ static inline void pgd_list_del(pgd_t *pgd)
  #define UNSHARED_PTRS_PER_PGD                          \
         (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
  
-static void pgd_ctor(pgd_t *pgd)
+
+/*
+ * Remember @mm in the ->index field of the struct page backing @pgd,
+ * so it can be looked up again from that page.
+ */
+static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
+{
+       struct page *pgd_page = virt_to_page(pgd);
+
+       /* ->index has to be wide enough to hold the pointer. */
+       BUILD_BUG_ON(sizeof(pgd_page->index) < sizeof(mm));
+       pgd_page->index = (pgoff_t)mm;
+}
+
+/*
+ * Return the value of @page->index interpreted as an mm_struct pointer,
+ * i.e. what pgd_set_mm() stored for a PGD page.
+ */
+struct mm_struct *pgd_page_get_mm(struct page *page)
+{
+       struct mm_struct *owner = (struct mm_struct *)page->index;
+
+       return owner;
+}
+
+static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
  {
         /* If the pgd points to a shared pagetable level (either the
            ptes in non-PAE, or shared PMD in PAE), then just copy the
@@ -98,15 +110,13 @@ static void pgd_ctor(pgd_t *pgd)
                 clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
                                 swapper_pg_dir + KERNEL_PGD_BOUNDARY,
                                 KERNEL_PGD_PTRS);
-               paravirt_alloc_pmd_clone(__pa(pgd) >> PAGE_SHIFT,
-                                        __pa(swapper_pg_dir) >> PAGE_SHIFT,
-                                        KERNEL_PGD_BOUNDARY,
-                                        KERNEL_PGD_PTRS);
         }
  
         /* list required to sync kernel mapping updates */
-       if (!SHARED_KERNEL_PMD)
+       if (!SHARED_KERNEL_PMD) {
+               pgd_set_mm(pgd, mm);
                 pgd_list_add(pgd);
+       }
  }
  
  static void pgd_dtor(pgd_t *pgd)
@@ -272,7 +282,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
          */
         spin_lock_irqsave(&pgd_lock, flags);
  
-       pgd_ctor(pgd);
+       pgd_ctor(mm, pgd);
         pgd_prepopulate_pmd(mm, pgd, pmds);
  
         spin_unlock_irqrestore(&pgd_lock, flags);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c

index c03f14a..4935848 100644 (file)
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -5,6 +5,7 @@
  #include <linux/smp.h>
  #include <linux/interrupt.h>
  #include <linux/module.h>
+#include <linux/cpu.h>
  
  #include <asm/tlbflush.h>
  #include <asm/mmu_context.h>
@@ -52,6 +53,8 @@ union smp_flush_state {
     want false sharing in the per cpu data segment. */
  static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS];
  
+static DEFINE_PER_CPU_READ_MOSTLY(int, tlb_vector_offset);
+
  /*
   * We cannot call mmdrop() because we are in interrupt context,
   * instead update mm->cpu_vm_mask.
@@ -173,7 +176,7 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
         union smp_flush_state *f;
  
         /* Caller has disabled preemption */
-       sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
+       sender = this_cpu_read(tlb_vector_offset);
         f = &flush_state[sender];
  
         /*
@@ -218,6 +221,47 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
         flush_tlb_others_ipi(cpumask, mm, va);
  }
  
+/*
+ * Fill in the per-CPU tlb_vector_offset, which flush_tlb_others_ipi()
+ * uses as index into flush_state[].
+ *
+ * The NUM_INVALIDATE_TLB_VECTORS slots are split into equally sized
+ * groups, one group per online node; the CPUs of a node are spread
+ * round-robin over the slots of their node's group.  With more online
+ * nodes than slots every node gets a single slot and nodes share.
+ */
+static void __cpuinit calculate_tlb_offset(void)
+{
+       int cpu, node, nr_node_vecs, idx = 0;
+       /*
+        * we are changing tlb_vector_offset for each CPU in runtime, but this
+        * will not cause inconsistency, as the write is atomic under X86. we
+        * might see more lock contentions in a short time, but after all CPU's
+        * tlb_vector_offset are changed, everything should go normal
+        *
+        * Note: if NUM_INVALIDATE_TLB_VECTORS % nr_online_nodes !=0, we might
+        * waste some vectors.
+        **/
+       if (nr_online_nodes > NUM_INVALIDATE_TLB_VECTORS)
+               nr_node_vecs = 1;
+       else
+               nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS/nr_online_nodes;
+
+       for_each_online_node(node) {
+               /*
+                * Use the position of the node among the online nodes, not
+                * the node id itself: online node ids need not be contiguous,
+                * and a large id multiplied by nr_node_vecs would yield an
+                * offset beyond NUM_INVALIDATE_TLB_VECTORS.
+                */
+               int node_offset = (idx % NUM_INVALIDATE_TLB_VECTORS) *
+                       nr_node_vecs;
+               int cpu_offset = 0;
+               for_each_cpu(cpu, cpumask_of_node(node)) {
+                       per_cpu(tlb_vector_offset, cpu) = node_offset +
+                               cpu_offset;
+                       cpu_offset++;
+                       cpu_offset = cpu_offset % nr_node_vecs;
+               }
+               idx++;
+       }
+}
+
+/*
+ * CPU hotplug notifier: recompute the per-CPU TLB vector offsets when
+ * the low four bits of @action equal CPU_ONLINE or CPU_DEAD.  Always
+ * returns NOTIFY_OK; @n and @hcpu are not used.
+ */
+static int tlb_cpuhp_notify(struct notifier_block *n,
+               unsigned long action, void *hcpu)
+{
+       /* NOTE(review): the mask presumably drops modifier bits - confirm */
+       unsigned long event = action & 0xf;
+
+       if (event == CPU_ONLINE || event == CPU_DEAD)
+               calculate_tlb_offset();
+
+       return NOTIFY_OK;
+}
+
  static int __cpuinit init_smp_flush(void)
  {
         int i;
@@ -225,6 +269,8 @@ static int __cpuinit init_smp_flush(void)
         for (i = 0; i < ARRAY_SIZE(flush_state); i++)
                 raw_spin_lock_init(&flush_state[i].tlbstate_lock);
  
+       calculate_tlb_offset();
+       hotcpu_notifier(tlb_cpuhp_notify, 0);
         return 0;
  }
  core_initcall(init_smp_flush);
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c

index b67a6b5..42fb46f 100644 (file)
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -64,15 +64,22 @@ static u64 ibs_op_ctl;
   * IBS cpuid feature detection
   */
  
-#define IBS_CPUID_FEATURES      0x8000001b
+#define IBS_CPUID_FEATURES             0x8000001b
  
  /*
   * Same bit mask as for IBS cpuid feature flags (Fn8000_001B_EAX), but
   * bit 0 is used to indicate the existence of IBS.
   */
-#define IBS_CAPS_AVAIL                 (1LL<<0)
-#define IBS_CAPS_RDWROPCNT             (1LL<<3)
-#define IBS_CAPS_OPCNT                 (1LL<<4)
+#define IBS_CAPS_AVAIL                 (1U<<0)
+#define IBS_CAPS_RDWROPCNT             (1U<<3)
+#define IBS_CAPS_OPCNT                 (1U<<4)
+
+/*
+ * IBS APIC setup
+ */
+#define IBSCTL                         0x1cc
+#define IBSCTL_LVT_OFFSET_VALID                (1ULL<<8)
+#define IBSCTL_LVT_OFFSET_MASK         0x0F
  
  /*
   * IBS randomization macros
@@ -266,6 +273,74 @@ static void op_amd_stop_ibs(void)
                 wrmsrl(MSR_AMD64_IBSOPCTL, 0);
  }
  
+/*
+ * Check if we may assign a vector to EILVT entry @offset.
+ * Returns non-zero if setup_APIC_eilvt() returned 0 for the entry.
+ */
+static inline int eilvt_is_available(int offset)
+{
+       return setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 1) == 0;
+}
+
+/*
+ * Check the IBS LVT offset found in MSR_AMD64_IBSCTL on this cpu.
+ *
+ * Returns non-zero if the offset is flagged valid and the EILVT entry
+ * it selects is available.  Otherwise logs a firmware-bug message with
+ * the offset and the raw MSR value and returns 0.
+ */
+static inline int ibs_eilvt_valid(void)
+{
+       u64 val;
+       int offset;
+
+       rdmsrl(MSR_AMD64_IBSCTL, val);
+       /* Extract the offset up front, both error messages print it. */
+       offset = val & IBSCTL_LVT_OFFSET_MASK;
+
+       if (!(val & IBSCTL_LVT_OFFSET_VALID)) {
+               pr_err(FW_BUG "cpu %d, invalid IBS "
+                      "interrupt offset %d (MSR%08X=0x%016llx)\n",
+                      smp_processor_id(), offset,
+                      MSR_AMD64_IBSCTL, val);
+               return 0;
+       }
+
+       if (eilvt_is_available(offset))
+               return !0;
+
+       pr_err(FW_BUG "cpu %d, IBS interrupt offset %d "
+              "not available (MSR%08X=0x%016llx)\n",
+              smp_processor_id(), offset,
+              MSR_AMD64_IBSCTL, val);
+
+       return 0;
+}
+
+/*
+ * Read the IBS LVT offset from MSR_AMD64_IBSCTL.
+ * Returns the offset, or -EINVAL if the valid bit is not set.
+ */
+static inline int get_ibs_offset(void)
+{
+       u64 ibsctl;
+
+       rdmsrl(MSR_AMD64_IBSCTL, ibsctl);
+
+       if (ibsctl & IBSCTL_LVT_OFFSET_VALID)
+               return ibsctl & IBSCTL_LVT_OFFSET_MASK;
+
+       return -EINVAL;
+}
+
+/*
+ * Program the EILVT entry selected by the IBS LVT offset for NMI
+ * delivery.  A warning is logged if no valid offset can be read or
+ * if setup_APIC_eilvt() reports a failure.
+ */
+static void setup_APIC_ibs(void)
+{
+       int offset = get_ibs_offset();
+
+       if (offset >= 0 &&
+           !setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 0))
+               return;
+
+       pr_warn("oprofile: IBS APIC setup failed on cpu #%d\n",
+               smp_processor_id());
+}
+
+/*
+ * Reprogram the EILVT entry selected by the IBS LVT offset with
+ * APIC_EILVT_MSG_FIX and the mask argument set.  Does nothing if no
+ * valid offset can be read.
+ */
+static void clear_APIC_ibs(void)
+{
+       int offset = get_ibs_offset();
+
+       if (offset < 0)
+               return;
+
+       setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1);
+}
+
  #ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
  
  static void op_mux_switch_ctrl(struct op_x86_model_spec const *model,
@@ -376,13 +451,13 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
         }
  
         if (ibs_caps)
-               setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_NMI, 0);
+               setup_APIC_ibs();
  }
  
  static void op_amd_cpu_shutdown(void)
  {
         if (ibs_caps)
-               setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1);
+               clear_APIC_ibs();
  }
  
  static int op_amd_check_ctrs(struct pt_regs * const regs,
@@ -445,16 +520,11 @@ static void op_amd_stop(struct op_msrs const * const msrs)
         op_amd_stop_ibs();
  }
  
-static int __init_ibs_nmi(void)
+static int setup_ibs_ctl(int ibs_eilvt_off)
  {
-#define IBSCTL_LVTOFFSETVAL            (1 << 8)
-#define IBSCTL                         0x1cc
         struct pci_dev *cpu_cfg;
         int nodes;
         u32 value = 0;
-       u8 ibs_eilvt_off;
-
-       ibs_eilvt_off = setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1);
  
         nodes = 0;
         cpu_cfg = NULL;
@@ -466,21 +536,60 @@ static int __init_ibs_nmi(void)
                         break;
                 ++nodes;
                 pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off
-                                      | IBSCTL_LVTOFFSETVAL);
+                                      | IBSCTL_LVT_OFFSET_VALID);
                 pci_read_config_dword(cpu_cfg, IBSCTL, &value);
-               if (value != (ibs_eilvt_off | IBSCTL_LVTOFFSETVAL)) {
+               if (value != (ibs_eilvt_off | IBSCTL_LVT_OFFSET_VALID)) {
                         pci_dev_put(cpu_cfg);
                         printk(KERN_DEBUG "Failed to setup IBS LVT offset, "
-                               "IBSCTL = 0x%08x", value);
-                       return 1;
+                              "IBSCTL = 0x%08x\n", value);
+                       return -EINVAL;
                 }
         } while (1);
  
         if (!nodes) {
-               printk(KERN_DEBUG "No CPU node configured for IBS");
-               return 1;
+               printk(KERN_DEBUG "No CPU node configured for IBS\n");
+               return -ENODEV;
+       }
+
+       return 0;
+}
+
+/*
+ * Pick the first available EILVT entry out of entries 1..3 and write
+ * it to the IBS control register via setup_ibs_ctl().
+ *
+ * Returns the result of setup_ibs_ctl() for the chosen entry, or
+ * -EBUSY if none of the entries is available.
+ */
+static int force_ibs_eilvt_setup(void)
+{
+       int entry;
+
+       /* find the next free available EILVT entry */
+       for (entry = 1; entry < 4; entry++) {
+               if (eilvt_is_available(entry))
+                       return setup_ibs_ctl(entry);
+       }
+
+       printk(KERN_DEBUG "No EILVT entry available\n");
+
+       return -EBUSY;
+}
+
+/*
+ * Make sure a usable IBS LVT offset is configured.
+ *
+ * Returns 0 if the current offset is already valid, or if it became
+ * valid after force_ibs_eilvt_setup(); in the latter case a message
+ * about the workaround is logged.  Returns the error code of
+ * force_ibs_eilvt_setup() if that fails, or -EFAULT if the offset is
+ * still not valid afterwards.
+ */
+static int __init_ibs_nmi(void)
+{
+       int ret;
+
+       /* Nothing to do if the offset is already fine. */
+       if (ibs_eilvt_valid())
+               return 0;
+
+       ret = force_ibs_eilvt_setup();
+       if (ret)
+               return ret;
+
+       /* Re-check after the forced setup. */
+       if (!ibs_eilvt_valid())
+               return -EFAULT;
+
+       pr_err(FW_BUG "workaround enabled for IBS LVT offset\n");
+
  
         return 0;
  }
  
diff --git a/arch/x86/pci/olpc.c b/arch/x86/pci/olpc.c

index b348154..13700ec 100644 (file)
--- a/arch/x86/pci/olpc.c
+++ b/arch/x86/pci/olpc.c
@@ -304,7 +304,7 @@ static struct pci_raw_ops pci_olpc_conf = {
  
  int __init pci_olpc_init(void)
  {
-       printk(KERN_INFO "PCI: Using configuration type OLPC\n");
+       printk(KERN_INFO "PCI: Using configuration type OLPC XO-1\n");
         raw_pci_ops = &pci_olpc_conf;
         is_lx = is_geode_lx();
         return 0;
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c

index 42086ac..b2363fc 100644 (file)
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1969,7 +1969,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
         .alloc_pte = xen_alloc_pte_init,
         .release_pte = xen_release_pte_init,
         .alloc_pmd = xen_alloc_pmd_init,
-       .alloc_pmd_clone = paravirt_nop,
         .release_pmd = xen_release_pmd_init,
  
  #ifdef CONFIG_X86_64
diff --git a/arch/xtensa/kernel/irq.c b/arch/xtensa/kernel/irq.c

index c64a5d3..8750888 100644 (file)
--- a/arch/xtensa/kernel/irq.c
+++ b/arch/xtensa/kernel/irq.c
@@ -92,7 +92,7 @@ int show_interrupts(struct seq_file *p, void *v)
                 for_each_online_cpu(j)
                         seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
  #endif
-               seq_printf(p, " %14s", irq_desc[i].chip->typename);
+               seq_printf(p, " %14s", irq_desc[i].chip->name);
                 seq_printf(p, "  %s", action->name);
  
                 for (action=action->next; action; action = action->next)
diff --git a/drivers/acpi/acpi_pad.c b/drivers/acpi/acpi_pad.c

index 6b115f6..6afceb3 100644 (file)
--- a/drivers/acpi/acpi_pad.c
+++ b/drivers/acpi/acpi_pad.c
@@ -30,18 +30,13 @@
  #include <linux/slab.h>
  #include <acpi/acpi_bus.h>
  #include <acpi/acpi_drivers.h>
+#include <asm/mwait.h>
  
  #define ACPI_PROCESSOR_AGGREGATOR_CLASS        "acpi_pad"
  #define ACPI_PROCESSOR_AGGREGATOR_DEVICE_NAME "Processor Aggregator"
  #define ACPI_PROCESSOR_AGGREGATOR_NOTIFY 0x80
  static DEFINE_MUTEX(isolated_cpus_lock);
  
-#define MWAIT_SUBSTATE_MASK    (0xf)
-#define MWAIT_CSTATE_MASK      (0xf)
-#define MWAIT_SUBSTATE_SIZE    (4)
-#define CPUID_MWAIT_LEAF (5)
-#define CPUID5_ECX_EXTENSIONS_SUPPORTED (0x1)
-#define CPUID5_ECX_INTERRUPT_BREAK     (0x2)
  static unsigned long power_saving_mwait_eax;
  
  static unsigned char tsc_detected_unstable;
diff --git a/drivers/base/topology.c b/drivers/base/topology.c

index 9fc630c..f6f37a0 100644 (file)
--- a/drivers/base/topology.c
+++ b/drivers/base/topology.c
@@ -45,7 +45,8 @@ static ssize_t show_##name(struct sys_device *dev,            \
         return sprintf(buf, "%d\n", topology_##name(cpu));      \
  }
  
-#if defined(topology_thread_cpumask) || defined(topology_core_cpumask)
+#if defined(topology_thread_cpumask) || defined(topology_core_cpumask) || \
+    defined(topology_book_cpumask)
  static ssize_t show_cpumap(int type, const struct cpumask *mask, char *buf)
  {
         ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf;
@@ -114,6 +115,14 @@ define_siblings_show_func(core_cpumask);
  define_one_ro_named(core_siblings, show_core_cpumask);
  define_one_ro_named(core_siblings_list, show_core_cpumask_list);
  
+#ifdef CONFIG_SCHED_BOOK
+define_id_show_func(book_id);
+define_one_ro(book_id);
+define_siblings_show_func(book_cpumask);
+define_one_ro_named(book_siblings, show_book_cpumask);
+define_one_ro_named(book_siblings_list, show_book_cpumask_list);
+#endif
+
  static struct attribute *default_attrs[] = {
         &attr_physical_package_id.attr,
         &attr_core_id.attr,
@@ -121,6 +130,11 @@ static struct attribute *default_attrs[] = {
         &attr_thread_siblings_list.attr,
         &attr_core_siblings.attr,
         &attr_core_siblings_list.attr,
+#ifdef CONFIG_SCHED_BOOK
+       &attr_book_id.attr,
+       &attr_book_siblings.attr,
+       &attr_book_siblings_list.attr,
+#endif
         NULL
  };
  
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig

index de27768..4b9359a 100644 (file)
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -488,4 +488,21 @@ config BLK_DEV_HD
  
           If unsure, say N.
  
+config BLK_DEV_RBD
+       tristate "Rados block device (RBD)"
+       depends on INET && EXPERIMENTAL && BLOCK
+       select CEPH_LIB
+       select LIBCRC32C
+       select CRYPTO_AES
+       select CRYPTO
+       default n
+       help
+         Say Y here if you want to include the Rados block device, which
+         stripes a block device over objects stored in the Ceph distributed
+         object store.
+
+         More information at http://ceph.newdream.net/.
+
+         If unsure, say N.
+
  endif # BLK_DEV
diff --git a/drivers/block/Makefile b/drivers/block/Makefile

index aff5ac9..d7f463d 100644 (file)
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -37,5 +37,6 @@ obj-$(CONFIG_BLK_DEV_HD)      += hd.o
  
  obj-$(CONFIG_XEN_BLKDEV_FRONTEND)      += xen-blkfront.o
  obj-$(CONFIG_BLK_DEV_DRBD)     += drbd/
+obj-$(CONFIG_BLK_DEV_RBD)     += rbd.o
  
  swim_mod-objs  := swim.o swim_asm.o
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c

new file mode 100644 (file)

index 0000000..6ec9d53
--- /dev/null
+++ b/drivers/block/rbd.c
@@ -0,0 +1,1841 @@
+/*
+   rbd.c -- Export ceph rados objects as a Linux block device
+
+
+   based on drivers/block/osdblk.c:
+
+   Copyright 2009 Red Hat, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+
+   Instructions for use
+   --------------------
+
+   1) Map a Linux block device to an existing rbd image.
+
+      Usage: <mon ip addr> <options> <pool name> <rbd image name> [snap name]
+
+      $ echo "192.168.0.1 name=admin rbd foo" > /sys/class/rbd/add
+
+      The snapshot name can be "-" or omitted to map the image read/write.
+
+   2) List all active blkdev<->object mappings.
+
+      In this example, we have performed step #1 twice, creating two blkdevs,
+      mapped to two separate rados objects in the rados rbd pool
+
+      $ cat /sys/class/rbd/list
+      #id     major   client_name     pool    name    snap    KB
+      0       254     client4143      rbd     foo     -      1024000
+
+      The columns, in order, are:
+      - blkdev unique id
+      - blkdev assigned major
+      - rados client id
+      - rados pool name
+      - rados block device name
+      - mapped snapshot ("-" if none)
+      - device size in KB
+
+
+   3) Create a snapshot.
+
+      Usage: <blkdev id> <snapname>
+
+      $ echo "0 mysnap" > /sys/class/rbd/snap_create
+
+
+   4) Listing a snapshot.
+
+      $ cat /sys/class/rbd/snaps_list
+      #id     snap    KB
+      0       -       1024000 (*)
+      0       foo     1024000
+
+      The columns, in order, are:
+      - blkdev unique id
+      - snapshot name, '-' means none (active read/write version)
+      - size of device at time of snapshot
+      - the (*) indicates this is the active version
+
+   5) Rollback to snapshot.
+
+      Usage: <blkdev id> <snapname>
+
+      $ echo "0 mysnap" > /sys/class/rbd/snap_rollback
+
+
+   6) Mapping an image using snapshot.
+
+      A snapshot mapping is read-only. This is being done by passing
+      snap=<snapname> to the options when adding a device.
+
+      $ echo "192.168.0.1 name=admin,snap=mysnap rbd foo" > /sys/class/rbd/add
+
+
+   7) Remove an active blkdev<->rbd image mapping.
+
+      In this example, we remove the mapping with blkdev unique id 1.
+
+      $ echo 1 > /sys/class/rbd/remove
+
+
+   NOTE:  The actual creation and deletion of rados objects is outside the scope
+   of this driver.
+
+ */
+
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/osd_client.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/decode.h>
+
+#include <linux/kernel.h>
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+
+#include "rbd_types.h"
+
+#define DRV_NAME "rbd"
+#define DRV_NAME_LONG "rbd (rados block device)"
+
+#define RBD_MINORS_PER_MAJOR   256             /* max minors per blkdev */
+
+#define RBD_MAX_MD_NAME_LEN    (96 + sizeof(RBD_SUFFIX))
+#define RBD_MAX_POOL_NAME_LEN  64
+#define RBD_MAX_SNAP_NAME_LEN  32
+#define RBD_MAX_OPT_LEN                1024
+
+#define RBD_SNAP_HEAD_NAME     "-"
+
+#define DEV_NAME_LEN           32
+
+/*
+ * block device image metadata (in-memory version)
+ */
+struct rbd_image_header {
+       u64 image_size;         /* ondisk image_size, converted to cpu order */
+       char block_name[32];    /* copied from the on-disk header; prefix of
+                                  the per-segment object names */
+       __u8 obj_order;         /* segments are (1 << obj_order) bytes */
+       __u8 crypt_type;        /* copied from ondisk->options */
+       __u8 comp_type;         /* copied from ondisk->options */
+       struct rw_semaphore snap_rwsem; /* taken for write when switching
+                                          snapshots, for read when building
+                                          osd requests */
+       struct ceph_snap_context *snapc; /* snapshot ids, kmalloc'ed */
+       size_t snap_names_len;  /* byte length of snap_names */
+       u64 snap_seq;           /* ondisk snap_seq */
+       u32 total_snaps;        /* ondisk snap_count */
+
+       char *snap_names;       /* consecutive NUL-terminated names, or NULL
+                                  when there are no snapshots */
+       u64 *snap_sizes;        /* image size per snapshot, or NULL */
+};
+
+/*
+ * an instance of the client.  multiple devices may share a client.
+ */
+struct rbd_client {
+       struct ceph_client      *client;        /* from ceph_create_client() */
+       struct kref             kref;           /* one ref per user; dropped
+                                                  via rbd_put_client() */
+       struct list_head        node;           /* entry in rbd_client_list */
+};
+
+/*
+ * a single io request
+ */
+struct rbd_request {
+       struct request          *rq;            /* blk layer request */
+       struct bio              *bio;           /* cloned bio */
+       struct page             **pages;        /* list of used pages */
+       u64                     len;            /* requested length; compared
+                                                  against the reply's extent
+                                                  length to spot short reads */
+};
+
+/*
+ * a single device
+ */
+struct rbd_device {
+       int                     id;             /* blkdev unique id */
+
+       int                     major;          /* blkdev assigned major */
+       struct gendisk          *disk;          /* blkdev's gendisk and rq */
+       struct request_queue    *q;
+
+       struct ceph_client      *client;        /* == rbd_client->client */
+       struct rbd_client       *rbd_client;    /* refcounted, maybe shared */
+
+       char                    name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
+
+       spinlock_t              lock;           /* queue lock */
+
+       struct rbd_image_header header;         /* in-memory image metadata */
+       char                    obj[RBD_MAX_OBJ_NAME_LEN]; /* rbd image name */
+       int                     obj_len;
+       char                    obj_md_name[RBD_MAX_MD_NAME_LEN]; /* hdr nm. */
+       char                    pool_name[RBD_MAX_POOL_NAME_LEN];
+       int                     poolid;         /* used as fl_pg_pool in the
+                                                  osd request layout */
+
+       char                    snap_name[RBD_MAX_SNAP_NAME_LEN];
+       u32 cur_snap;   /* index+1 of current snapshot within snap context
+                          0 - for the head */
+       int read_only;  /* set to 1 when a snapshot is mapped, 0 for head */
+
+       struct list_head        node;   /* list linkage; presumably in
+                                          rbd_dev_list - confirm */
+};
+
+static spinlock_t node_lock;      /* protects client get/put */
+
+static struct class *class_rbd;          /* /sys/class/rbd */
+static DEFINE_MUTEX(ctl_mutex);          /* Serialize open/close/setup/teardown */
+static LIST_HEAD(rbd_dev_list);    /* devices */
+static LIST_HEAD(rbd_client_list);      /* clients */
+
+
+/*
+ * Block device open handler: propagate the mapping's read-only state to
+ * the block device and reject write opens of a read-only mapping with
+ * -EROFS.
+ */
+static int rbd_open(struct block_device *bdev, fmode_t mode)
+{
+       struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
+
+       set_device_ro(bdev, rbd_dev->read_only);
+
+       if (!rbd_dev->read_only)
+               return 0;
+
+       return (mode & FMODE_WRITE) ? -EROFS : 0;
+}
+
+/* block device operations: only the open handler is provided */
+static const struct block_device_operations rbd_bd_ops = {
+       .owner                  = THIS_MODULE,
+       .open                   = rbd_open,
+};
+
+/*
+ * Initialize an rbd client instance.
+ * We own *opt.
+ *
+ * Allocates the rbd_client, creates the ceph client from @opt, opens a
+ * session and links the new client into rbd_client_list.
+ *
+ * Returns the new client (holding one reference) or an ERR_PTR.  On
+ * failure @opt is destroyed here unless the ceph client had already
+ * taken it over.
+ */
+static struct rbd_client *rbd_client_create(struct ceph_options *opt)
+{
+       struct rbd_client *rbdc;
+       int ret = -ENOMEM;
+
+       dout("rbd_client_create\n");
+       rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
+       if (!rbdc)
+               goto out_opt;
+
+       kref_init(&rbdc->kref);
+       INIT_LIST_HEAD(&rbdc->node);
+
+       rbdc->client = ceph_create_client(opt, rbdc);
+       if (IS_ERR(rbdc->client)) {
+               /* report the real failure instead of a blanket -ENOMEM */
+               ret = PTR_ERR(rbdc->client);
+               goto out_rbdc;
+       }
+       opt = NULL; /* Now rbdc->client is responsible for opt */
+
+       ret = ceph_open_session(rbdc->client);
+       if (ret < 0)
+               goto out_err;
+
+       spin_lock(&node_lock);
+       list_add_tail(&rbdc->node, &rbd_client_list);
+       spin_unlock(&node_lock);
+
+       dout("rbd_client_create created %p\n", rbdc);
+       return rbdc;
+
+out_err:
+       ceph_destroy_client(rbdc->client);
+out_rbdc:
+       kfree(rbdc);
+out_opt:
+       if (opt)
+               ceph_destroy_options(opt);
+       return ERR_PTR(ret);
+}
+
+/*
+ * Look up an already registered client whose configuration matches
+ * @opt.  Clients are never shared when CEPH_OPT_NOSHARE is set.
+ * The caller in this file walks the list with node_lock held.
+ *
+ * Returns the matching client or NULL.
+ */
+static struct rbd_client *__rbd_client_find(struct ceph_options *opt)
+{
+       struct rbd_client *rbdc;
+
+       if (opt->flags & CEPH_OPT_NOSHARE)
+               return NULL;
+
+       list_for_each_entry(rbdc, &rbd_client_list, node) {
+               if (!ceph_compare_options(opt, rbdc->client))
+                       return rbdc;
+       }
+
+       return NULL;
+}
+
+/*
+ * Attach a ceph client to @rbd_dev.  The options are parsed from
+ * @mon_addr / @options; a matching registered client is reused (taking
+ * an extra reference), otherwise a new one is created.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
+                         char *options)
+{
+       struct ceph_options *opt;
+       struct rbd_client *rbdc;
+       int ret;
+
+       ret = ceph_parse_options(&opt, options, mon_addr,
+                                mon_addr + strlen(mon_addr), NULL, NULL);
+       if (ret < 0)
+               return ret;
+
+       spin_lock(&node_lock);
+       rbdc = __rbd_client_find(opt);
+       if (!rbdc) {
+               spin_unlock(&node_lock);
+
+               /* nothing to share: build a new client, which owns opt */
+               rbdc = rbd_client_create(opt);
+               if (IS_ERR(rbdc))
+                       return PTR_ERR(rbdc);
+
+               rbd_dev->rbd_client = rbdc;
+               rbd_dev->client = rbdc->client;
+               return 0;
+       }
+
+       /* using an existing client; the parsed options are not needed */
+       ceph_destroy_options(opt);
+       kref_get(&rbdc->kref);
+       rbd_dev->rbd_client = rbdc;
+       rbd_dev->client = rbdc->client;
+       spin_unlock(&node_lock);
+
+       return 0;
+}
+
+/*
+ * kref release callback: unlink the client from rbd_client_list, then
+ * destroy the ceph client and free the wrapper.
+ */
+static void rbd_client_release(struct kref *kref)
+{
+       struct rbd_client *rbdc;
+
+       rbdc = container_of(kref, struct rbd_client, kref);
+       dout("rbd_release_client %p\n", rbdc);
+
+       /* take it off the list before tearing it down */
+       spin_lock(&node_lock);
+       list_del(&rbdc->node);
+       spin_unlock(&node_lock);
+
+       ceph_destroy_client(rbdc->client);
+       kfree(rbdc);
+}
+
+/*
+ * Drop this device's reference on its client (the client is released
+ * once the last reference is gone) and clear the device's pointers.
+ */
+static void rbd_put_client(struct rbd_device *rbd_dev)
+{
+       struct rbd_client *rbdc = rbd_dev->rbd_client;
+
+       kref_put(&rbdc->kref, rbd_client_release);
+
+       rbd_dev->rbd_client = NULL;
+       rbd_dev->client = NULL;
+}
+
+
+/*
+ * Create a new header structure, translate header format from the on-disk
+ * header.
+ *
+ * @header:          in-memory header to fill in
+ * @ondisk:          on-disk header, little-endian fields
+ * @allocated_snaps: snapshot count the @ondisk buffer was sized for;
+ *                   snapshot ids, sizes and names are only copied when
+ *                   it equals the count found in @ondisk
+ * @gfp_flags:       allocation flags used for every allocation made here
+ *
+ * Returns 0 on success or -ENOMEM; nothing stays allocated on failure.
+ */
+static int rbd_header_from_disk(struct rbd_image_header *header,
+                                struct rbd_image_header_ondisk *ondisk,
+                                int allocated_snaps,
+                                gfp_t gfp_flags)
+{
+       int i;
+       u32 snap_count = le32_to_cpu(ondisk->snap_count);
+       int ret = -ENOMEM;
+
+       init_rwsem(&header->snap_rwsem);
+
+       header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
+       header->snapc = kmalloc(sizeof(struct ceph_snap_context) +
+                               snap_count *
+                                sizeof(struct rbd_image_snap_ondisk),
+                               gfp_flags);
+       if (!header->snapc)
+               return -ENOMEM;
+       if (snap_count) {
+               /* honour the caller's allocation context here as well */
+               header->snap_names = kmalloc(header->snap_names_len,
+                                            gfp_flags);
+               if (!header->snap_names)
+                       goto err_snapc;
+               header->snap_sizes = kmalloc(snap_count * sizeof(u64),
+                                            gfp_flags);
+               if (!header->snap_sizes)
+                       goto err_names;
+       } else {
+               header->snap_names = NULL;
+               header->snap_sizes = NULL;
+       }
+       memcpy(header->block_name, ondisk->block_name,
+              sizeof(ondisk->block_name));
+
+       header->image_size = le64_to_cpu(ondisk->image_size);
+       header->obj_order = ondisk->options.order;
+       header->crypt_type = ondisk->options.crypt_type;
+       header->comp_type = ondisk->options.comp_type;
+
+       atomic_set(&header->snapc->nref, 1);
+       header->snap_seq = le64_to_cpu(ondisk->snap_seq);
+       header->snapc->num_snaps = snap_count;
+       header->total_snaps = snap_count;
+
+       if (snap_count &&
+           allocated_snaps == snap_count) {
+               for (i = 0; i < snap_count; i++) {
+                       header->snapc->snaps[i] =
+                               le64_to_cpu(ondisk->snaps[i].id);
+                       header->snap_sizes[i] =
+                               le64_to_cpu(ondisk->snaps[i].image_size);
+               }
+
+               /* copy snapshot names; they start right after the last
+                  snaps[] entry (i == snap_count here) */
+               memcpy(header->snap_names, &ondisk->snaps[i],
+                       header->snap_names_len);
+       }
+
+       return 0;
+
+err_names:
+       kfree(header->snap_names);
+err_snapc:
+       kfree(header->snapc);
+       return ret;
+}
+
+/*
+ * Convert a cur_snap style number into an index into the snapshot
+ * arrays.  rbd_header_set_snap() stores total_snaps - index, so this
+ * undoes that encoding.
+ */
+static int snap_index(struct rbd_image_header *header, int snap_num)
+{
+       return header->total_snaps - snap_num;
+}
+
+/*
+ * Return the id of the snapshot the device is currently mapped to, or
+ * 0 when the head (no snapshot) is mapped.
+ */
+static u64 cur_snap_id(struct rbd_device *rbd_dev)
+{
+       struct rbd_image_header *header = &rbd_dev->header;
+       u32 snap = rbd_dev->cur_snap;
+
+       if (snap == 0)
+               return 0;
+
+       return header->snapc->snaps[snap_index(header, snap)];
+}
+
+/*
+ * Find a snapshot by name in the packed, NUL-separated name list.
+ *
+ * On a match the snapshot id is stored in *seq and its size in *size
+ * (each only if non-NULL) and the snapshot's index is returned.
+ * Returns -ENOENT when no snapshot has that name.
+ */
+static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
+                       u64 *seq, u64 *size)
+{
+       const char *name = header->snap_names;
+       int idx;
+
+       for (idx = 0; idx < header->total_snaps; idx++) {
+               if (!strcmp(snap_name, name)) {
+                       if (seq)
+                               *seq = header->snapc->snaps[idx];
+                       if (size)
+                               *size = header->snap_sizes[idx];
+                       return idx;
+               }
+               /* step over this name and its terminator */
+               name += strlen(name) + 1;
+       }
+
+       return -ENOENT;
+}
+
+/*
+ * Select which version of the image the device maps.
+ *
+ * A NULL or empty @snap_name, or RBD_SNAP_HEAD_NAME, selects the
+ * writable head; any other name selects that snapshot and makes the
+ * device read-only.  The size of the selected version is stored in
+ * *size when @size is non-NULL.
+ *
+ * Returns 0 on success or -ENOENT if the snapshot does not exist.
+ */
+static int rbd_header_set_snap(struct rbd_device *dev,
+                              const char *snap_name,
+                              u64 *size)
+{
+       struct rbd_image_header *header = &dev->header;
+       struct ceph_snap_context *snapc = header->snapc;
+       int map_head;
+       int ret;
+
+       map_head = !snap_name || !*snap_name ||
+                  strcmp(snap_name, RBD_SNAP_HEAD_NAME) == 0;
+
+       down_write(&header->snap_rwsem);
+
+       if (map_head) {
+               snapc->seq = header->total_snaps ? header->snap_seq : 0;
+               dev->cur_snap = 0;
+               dev->read_only = 0;
+               if (size)
+                       *size = header->image_size;
+               ret = 0;
+       } else {
+               ret = snap_by_name(header, snap_name, &snapc->seq, size);
+               if (ret >= 0) {
+                       /* cur_snap counts back from the end; 0 is head */
+                       dev->cur_snap = header->total_snaps - ret;
+                       dev->read_only = 1;
+                       ret = 0;
+               }
+       }
+
+       up_write(&header->snap_rwsem);
+       return ret;
+}
+
+/*
+ * Free the buffers allocated by rbd_header_from_disk().  The header
+ * structure itself is not freed.
+ */
+static void rbd_header_free(struct rbd_image_header *header)
+{
+       kfree(header->snapc);
+       kfree(header->snap_names);
+       kfree(header->snap_sizes);
+}
+
+/*
+ * get the actual striped segment name, offset and length
+ *
+ * @header:     image header; obj_order gives the segment size
+ * @block_name: prefix used to build the segment object name
+ * @ofs, @len:  byte range within the image
+ * @seg_name:   if non-NULL, receives "<block_name>.<segment number>"
+ *              (at most RBD_MAX_SEG_NAME_LEN bytes)
+ * @segofs:     if non-NULL, receives the offset within the segment
+ *
+ * Returns @len clipped so the range does not cross the segment's end.
+ */
+static u64 rbd_get_segment(struct rbd_image_header *header,
+                          const char *block_name,
+                          u64 ofs, u64 len,
+                          char *seg_name, u64 *segofs)
+{
+       u64 seg = ofs >> header->obj_order;
+       /* 64-bit shift: a plain int shift overflows for orders >= 31 */
+       u64 seg_size = 1ULL << header->obj_order;
+
+       if (seg_name)
+               snprintf(seg_name, RBD_MAX_SEG_NAME_LEN,
+                        "%s.%012llx", block_name, seg);
+
+       ofs = ofs & (seg_size - 1);
+       len = min_t(u64, len, seg_size - ofs);
+
+       if (segofs)
+               *segofs = ofs;
+
+       return len;
+}
+
+/*
+ * bio helpers
+ */
+
+/* Drop one reference on every bio of a bi_next-linked chain. */
+static void bio_chain_put(struct bio *chain)
+{
+       while (chain) {
+               struct bio *next = chain->bi_next;
+
+               bio_put(chain);
+               chain = next;
+       }
+}
+
+/*
+ * zeros a bio chain, starting at specific offset
+ *
+ * @start_ofs is a byte offset counted from the start of the chain;
+ * everything from there to the end of the chain is cleared.
+ */
+static void zero_bio_chain(struct bio *chain, int start_ofs)
+{
+       struct bio_vec *bv;
+       unsigned long flags;
+       void *buf;
+       int i;
+       int pos = 0;    /* bytes of the chain walked so far */
+
+       while (chain) {
+               bio_for_each_segment(bv, chain, i) {
+                       /* segment reaches past start_ofs: clear its tail */
+                       if (pos + bv->bv_len > start_ofs) {
+                               /* bytes at the head of this segment to keep */
+                               int remainder = max(start_ofs - pos, 0);
+                               buf = bvec_kmap_irq(bv, &flags);
+                               memset(buf + remainder, 0,
+                                      bv->bv_len - remainder);
+                               bvec_kunmap_irq(buf, &flags);
+                       }
+                       pos += bv->bv_len;
+               }
+
+               chain = chain->bi_next;
+       }
+}
+
+/*
+ * bio_chain_clone - clone a chain of bios up to a certain length.
+ * might return a bio_pair that will need to be released.
+ *
+ * @old:     in: first bio to clone; out: first bio that was not cloned
+ * @next:    out: where the next call should resume (the second half of
+ *           a split bio, or the bio following the last one cloned)
+ * @bp:      a pair left over from an earlier call is released here
+ * @len:     number of bytes to clone
+ * @gfpmask: allocation flags; __GFP_WAIT is dropped after the first clone
+ *
+ * Returns the cloned chain, or NULL if an allocation or the split failed.
+ */
+static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
+                                  struct bio_pair **bp,
+                                  int len, gfp_t gfpmask)
+{
+       struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL;
+       int total = 0;
+
+       if (*bp) {
+               bio_pair_release(*bp);
+               *bp = NULL;
+       }
+
+       while (old_chain && (total < len)) {
+               tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
+               if (!tmp)
+                       goto err_out;
+
+               if (total + old_chain->bi_size > len) {
+                       /*
+                        * NOTE(review): this local shadows the @bp
+                        * parameter, so the new pair is never stored in
+                        * *bp - confirm how it is meant to be released.
+                        */
+                       struct bio_pair *bp;
+
+                       /*
+                        * this split can only happen with a single paged bio,
+                        * split_bio will BUG_ON if this is not the case
+                        */
+                       dout("bio_chain_clone split! total=%d remaining=%d "
+                            "bi_size=%d\n",
+                            (int)total, (int)len-total,
+                            (int)old_chain->bi_size);
+
+                       /* split the bio. We'll release it either in the next
+                          call, or it will have to be released outside */
+                       bp = bio_split(old_chain, (len - total) / 512ULL);
+                       if (!bp) {
+                               /* tmp is not on new_chain yet: drop it here */
+                               bio_put(tmp);
+                               goto err_out;
+                       }
+
+                       __bio_clone(tmp, &bp->bio1);
+
+                       *next = &bp->bio2;
+               } else {
+                       __bio_clone(tmp, old_chain);
+                       *next = old_chain->bi_next;
+               }
+
+               tmp->bi_bdev = NULL;
+               gfpmask &= ~__GFP_WAIT;
+               tmp->bi_next = NULL;
+
+               if (!new_chain) {
+                       new_chain = tail = tmp;
+               } else {
+                       tail->bi_next = tmp;
+                       tail = tmp;
+               }
+               old_chain = old_chain->bi_next;
+
+               total += tmp->bi_size;
+       }
+
+       BUG_ON(total < len);
+
+       if (tail)
+               tail->bi_next = NULL;
+
+       *old = old_chain;
+
+       return new_chain;
+
+err_out:
+       dout("bio_chain_clone with err\n");
+       bio_chain_put(new_chain);
+       return NULL;
+}
+
+/*
+ * helpers for osd request op vectors.
+ */
+/*
+ * Allocate a zeroed op vector with room for @num_ops entries plus one
+ * extra zeroed entry, and fill in the opcode and payload length of the
+ * first op.  The vector is returned through *ops (NULL on failure).
+ *
+ * Returns 0 on success or -ENOMEM.
+ */
+static int rbd_create_rw_ops(struct ceph_osd_req_op **ops,
+                           int num_ops,
+                           int opcode,
+                           u32 payload_len)
+{
+       struct ceph_osd_req_op *vec;
+
+       vec = kzalloc(sizeof(*vec) * (num_ops + 1), GFP_NOIO);
+       *ops = vec;
+       if (!vec)
+               return -ENOMEM;
+
+       /*
+        * op extent offset and length will be set later on
+        * in calc_raw_layout()
+        */
+       vec[0].op = opcode;
+       vec[0].payload_len = payload_len;
+
+       return 0;
+}
+
+/* Free an op vector allocated by rbd_create_rw_ops(). */
+static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
+{
+       kfree(ops);
+}
+
+/*
+ * Send ceph osd request
+ *
+ * Builds an osd request for object @obj covering @ofs/@len and starts
+ * it.  With a callback (@rbd_cb) the request completes asynchronously
+ * and the callback owns the per-request data; without one this waits
+ * for completion and returns the result of the wait.
+ *
+ * @rq:    block layer request to complete on early failure (may be NULL)
+ * @bio:   bio chain carrying the data (may be NULL when @pages is used)
+ * @pages: page vector carrying the data (may be NULL when @bio is used)
+ * @ops:   op vector describing the operation
+ * @num_pages and @num_reply are not used here.
+ *
+ * Returns a negative error code on failure.
+ */
+static int rbd_do_request(struct request *rq,
+                         struct rbd_device *dev,
+                         struct ceph_snap_context *snapc,
+                         u64 snapid,
+                         const char *obj, u64 ofs, u64 len,
+                         struct bio *bio,
+                         struct page **pages,
+                         int num_pages,
+                         int flags,
+                         struct ceph_osd_req_op *ops,
+                         int num_reply,
+                         void (*rbd_cb)(struct ceph_osd_request *req,
+                                        struct ceph_msg *msg))
+{
+       struct ceph_osd_request *req;
+       struct ceph_file_layout *layout;
+       int ret;
+       u64 bno;
+       struct timespec mtime = CURRENT_TIME;
+       struct rbd_request *req_data;
+       struct ceph_osd_request_head *reqhead;
+       struct rbd_image_header *header = &dev->header;
+
+       ret = -ENOMEM;
+       req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
+       if (!req_data)
+               goto done;
+
+       dout("rbd_do_request len=%lld ofs=%lld\n", len, ofs);
+
+       /* held while the request is built from snapshot state */
+       down_read(&header->snap_rwsem);
+
+       req = ceph_osdc_alloc_request(&dev->client->osdc, flags,
+                                     snapc,
+                                     ops,
+                                     false,
+                                     GFP_NOIO, pages, bio);
+       if (IS_ERR(req)) {
+               up_read(&header->snap_rwsem);
+               ret = PTR_ERR(req);
+               goto done_pages;
+       }
+
+       req->r_callback = rbd_cb;
+
+       req_data->rq = rq;
+       req_data->bio = bio;
+       req_data->pages = pages;
+       req_data->len = len;
+
+       req->r_priv = req_data;
+
+       reqhead = req->r_request->front.iov_base;
+       reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
+
+       strncpy(req->r_oid, obj, sizeof(req->r_oid));
+       /* strncpy() adds no terminator when obj fills the whole buffer */
+       req->r_oid[sizeof(req->r_oid) - 1] = '\0';
+       req->r_oid_len = strlen(req->r_oid);
+
+       layout = &req->r_file_layout;
+       memset(layout, 0, sizeof(*layout));
+       layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
+       layout->fl_stripe_count = cpu_to_le32(1);
+       layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
+       layout->fl_pg_preferred = cpu_to_le32(-1);
+       layout->fl_pg_pool = cpu_to_le32(dev->poolid);
+       ceph_calc_raw_layout(&dev->client->osdc, layout, snapid,
+                            ofs, &len, &bno, req, ops);
+
+       ceph_osdc_build_request(req, ofs, &len,
+                               ops,
+                               snapc,
+                               &mtime,
+                               req->r_oid, req->r_oid_len);
+       up_read(&header->snap_rwsem);
+
+       ret = ceph_osdc_start_request(&dev->client->osdc, req, false);
+       if (ret < 0)
+               goto done_err;
+
+       if (!rbd_cb) {
+               ret = ceph_osdc_wait_request(&dev->client->osdc, req);
+               ceph_osdc_put_request(req);
+               /* no callback will run, so req_data must be freed here */
+               kfree(req_data);
+       }
+       return ret;
+
+done_err:
+       bio_chain_put(req_data->bio);
+       ceph_osdc_put_request(req);
+done_pages:
+       kfree(req_data);
+done:
+       if (rq)
+               blk_end_request(rq, ret, len);
+       return ret;
+}
+
+/*
+ * Ceph osd op callback
+ *
+ * Completes the block layer request attached to @req from the osd
+ * reply in @msg, then releases the cloned bio chain, the osd request
+ * and the per-request data.
+ */
+static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
+{
+       struct rbd_request *req_data = req->r_priv;
+       struct ceph_osd_reply_head *replyhead;
+       struct ceph_osd_op *op;
+       __s32 rc;
+       u64 bytes;
+       int read_op;
+
+       /* parse reply */
+       replyhead = msg->front.iov_base;
+       WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
+       /* the first op is read from right behind the reply head */
+       op = (void *)(replyhead + 1);
+       rc = le32_to_cpu(replyhead->result);
+       bytes = le64_to_cpu(op->extent.length);
+       read_op = (le32_to_cpu(op->op) == CEPH_OSD_OP_READ);
+
+       dout("rbd_req_cb bytes=%lld readop=%d rc=%d\n", bytes, read_op, rc);
+
+       if (rc == -ENOENT && read_op) {
+               /* -ENOENT on a read: hand back zeros and report success */
+               zero_bio_chain(req_data->bio, 0);
+               rc = 0;
+       } else if (rc == 0 && read_op && bytes < req_data->len) {
+               /* short read: zero the rest, report the full length */
+               zero_bio_chain(req_data->bio, bytes);
+               bytes = req_data->len;
+       }
+
+       blk_end_request(req_data->rq, rc, bytes);
+
+       if (req_data->bio)
+               bio_chain_put(req_data->bio);
+
+       ceph_osdc_put_request(req);
+       kfree(req_data);
+}
+
+/*
+ * Do a synchronous ceph osd operation
+ *
+ * Data travels through a temporary page vector: for writes @buf is
+ * copied into it before the request, for reads the result is copied
+ * back into @buf afterwards.  When @orig_ops is NULL a single-op vector
+ * for @opcode is built (and freed) here; otherwise the caller's vector
+ * is used as is, and @buf is not copied in for writes.
+ *
+ * @num_reply is not forwarded; a fixed value is passed down instead.
+ *
+ * Returns the result of rbd_do_request(), or of the copy back into
+ * @buf for reads; negative on error.
+ */
+static int rbd_req_sync_op(struct rbd_device *dev,
+                          struct ceph_snap_context *snapc,
+                          u64 snapid,
+                          int opcode,
+                          int flags,
+                          struct ceph_osd_req_op *orig_ops,
+                          int num_reply,
+                          const char *obj,
+                          u64 ofs, u64 len,
+                          char *buf)
+{
+       int ret;
+       struct page **pages;
+       int num_pages;
+       struct ceph_osd_req_op *ops = orig_ops;
+       u32 payload_len;
+
+       num_pages = calc_pages_for(ofs , len);
+       pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
+       if (IS_ERR(pages))
+               return PTR_ERR(pages);
+
+       if (!orig_ops) {
+               /* only writes carry a payload */
+               payload_len = (flags & CEPH_OSD_FLAG_WRITE ? len : 0);
+               ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
+               if (ret < 0)
+                       goto done;
+
+               if ((flags & CEPH_OSD_FLAG_WRITE) && buf) {
+                       ret = ceph_copy_to_page_vector(pages, buf, ofs, len);
+                       if (ret < 0)
+                               goto done_ops;
+               }
+       }
+
+       /* NULL rq and NULL callback: rbd_do_request() waits for the reply */
+       ret = rbd_do_request(NULL, dev, snapc, snapid,
+                         obj, ofs, len, NULL,
+                         pages, num_pages,
+                         flags,
+                         ops,
+                         2,
+                         NULL);
+       if (ret < 0)
+               goto done_ops;
+
+       /* ret is used as the number of bytes to copy back */
+       if ((flags & CEPH_OSD_FLAG_READ) && buf)
+               ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);
+
+done_ops:
+       /* a vector supplied by the caller stays owned by the caller */
+       if (!orig_ops)
+               rbd_destroy_ops(ops);
+done:
+       ceph_release_page_vector(pages, num_pages);
+       return ret;
+}
+
+/*
+ * Do an asynchronous ceph osd operation
+ *
+ * Maps the image range @ofs/@len onto its segment object, builds a
+ * single-op vector for @opcode and submits it through rbd_do_request()
+ * with rbd_req_cb() as completion callback.  The range must not cross
+ * a segment boundary (enforced with BUG_ON).
+ *
+ * Returns the result of rbd_do_request(), or -ENOMEM.
+ */
+static int rbd_do_op(struct request *rq,
+                    struct rbd_device *rbd_dev ,
+                    struct ceph_snap_context *snapc,
+                    u64 snapid,
+                    int opcode, int flags, int num_reply,
+                    u64 ofs, u64 len,
+                    struct bio *bio)
+{
+       char *seg_name;
+       u64 seg_ofs;
+       u64 seg_len;
+       int ret;
+       struct ceph_osd_req_op *ops;
+       u32 payload_len;
+
+       seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
+       if (!seg_name)
+               return -ENOMEM;
+
+       seg_len = rbd_get_segment(&rbd_dev->header,
+                                 rbd_dev->header.block_name,
+                                 ofs, len,
+                                 seg_name, &seg_ofs);
+
+       /* only writes carry a payload */
+       payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);
+
+       ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
+       if (ret < 0)
+               goto done;
+
+       /* we've taken care of segment sizes earlier when we
+          cloned the bios. We should never have a segment
+          truncated at this point */
+       BUG_ON(seg_len < len);
+
+       ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
+                            seg_name, seg_ofs, seg_len,
+                            bio,
+                            NULL, 0,
+                            flags,
+                            ops,
+                            num_reply,
+                            rbd_req_cb);
+
+       /*
+        * The op vector is only consumed while rbd_do_request() builds
+        * the request, so it can be freed once that call has returned.
+        * NOTE(review): confirm the osd client keeps no pointer to it.
+        */
+       rbd_destroy_ops(ops);
+done:
+       kfree(seg_name);
+       return ret;
+}
+
+/*
+ * Request async osd write
+ *
+ * Submits a CEPH_OSD_OP_WRITE for the image range @ofs/@len with the
+ * WRITE and ONDISK flags, using CEPH_NOSNAP as the snapshot id.
+ */
+static int rbd_req_write(struct request *rq,
+                        struct rbd_device *rbd_dev,
+                        struct ceph_snap_context *snapc,
+                        u64 ofs, u64 len,
+                        struct bio *bio)
+{
+       return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
+                        CEPH_OSD_OP_WRITE,
+                        CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
+                        2,
+                        ofs, len, bio);
+}
+
+/*
+ * Queue an asynchronous OSD read for the bio covering (ofs, len).
+ * A snapid of 0 selects the head (CEPH_NOSNAP).
+ */
+static int rbd_req_read(struct request *rq,
+                        struct rbd_device *rbd_dev,
+                        u64 snapid,
+                        u64 ofs, u64 len,
+                        struct bio *bio)
+{
+       u64 read_snap = snapid ? snapid : CEPH_NOSNAP;
+       const int num_reply = 2;
+
+       return rbd_do_op(rq, rbd_dev, NULL, read_snap, CEPH_OSD_OP_READ,
+                        CEPH_OSD_FLAG_READ, num_reply, ofs, len, bio);
+}
+
+/*
+ * Synchronously read (ofs, len) of object obj into buf.
+ * A snapid of 0 selects the head (CEPH_NOSNAP).  The snapc argument is
+ * accepted but not forwarded: the request is issued with a NULL context.
+ */
+static int rbd_req_sync_read(struct rbd_device *dev,
+                         struct ceph_snap_context *snapc,
+                         u64 snapid,
+                         const char *obj,
+                         u64 ofs, u64 len,
+                         char *buf)
+{
+       u64 read_snap = snapid ? snapid : CEPH_NOSNAP;
+
+       return rbd_req_sync_op(dev, NULL, read_snap,
+                              CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
+                              NULL, 1, obj, ofs, len, buf);
+}
+
+/*
+ * Request sync osd rollback of one object to snapshot snapid
+ */
+static int rbd_req_sync_rollback_obj(struct rbd_device *dev,
+                                    u64 snapid,
+                                    const char *obj)
+{
+       struct ceph_osd_req_op *rollback_ops;
+       int rc;
+
+       rc = rbd_create_rw_ops(&rollback_ops, 1, CEPH_OSD_OP_ROLLBACK, 0);
+       if (rc < 0)
+               return rc;
+
+       rollback_ops[0].snap.snapid = snapid;
+
+       rc = rbd_req_sync_op(dev, NULL, CEPH_NOSNAP, 0,
+                            CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
+                            rollback_ops, 1, obj, 0, 0, NULL);
+
+       /* the ops array is ours to release whatever the request returned */
+       rbd_destroy_ops(rollback_ops);
+       return rc;
+}
+
+/*
+ * Request sync osd class method call: invoke cls.method on object obj,
+ * passing len bytes of data as the method input
+ */
+static int rbd_req_sync_exec(struct rbd_device *dev,
+                            const char *obj,
+                            const char *cls,
+                            const char *method,
+                            const char *data,
+                            int len)
+{
+       struct ceph_osd_req_op *call_ops;
+       int class_len = strlen(cls);
+       int meth_len = strlen(method);
+       int rc;
+
+       /* payload carries the class name, the method name and the input */
+       rc = rbd_create_rw_ops(&call_ops, 1, CEPH_OSD_OP_CALL,
+                              class_len + meth_len + len);
+       if (rc < 0)
+               return rc;
+
+       call_ops[0].cls.class_name = cls;
+       call_ops[0].cls.class_len = (__u8)class_len;
+       call_ops[0].cls.method_name = method;
+       call_ops[0].cls.method_len = (__u8)meth_len;
+       call_ops[0].cls.argc = 0;
+       call_ops[0].cls.indata = data;
+       call_ops[0].cls.indata_len = len;
+
+       rc = rbd_req_sync_op(dev, NULL, CEPH_NOSNAP, 0,
+                            CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
+                            call_ops, 1, obj, 0, 0, NULL);
+
+       rbd_destroy_ops(call_ops);
+
+       dout("cls_exec returned %d\n", rc);
+       return rc;
+}
+
+/*
+ * block device queue callback
+ *
+ * Drains the request queue.  Each REQ_TYPE_FS request is cut into
+ * pieces sized by rbd_get_segment(); a bio clone is made for every piece
+ * and submitted as an asynchronous OSD read or write.  The queue lock is
+ * dropped while one request is split and submitted, and is taken again
+ * before the next request is fetched.
+ */
+static void rbd_rq_fn(struct request_queue *q)
+{
+       struct rbd_device *rbd_dev = q->queuedata;
+       struct request *rq;
+       struct bio_pair *bp = NULL;
+
+       rq = blk_fetch_request(q);
+
+       while (1) {
+               struct bio *bio;
+               struct bio *rq_bio, *next_bio = NULL;
+               bool do_write;
+               int size, op_size = 0;
+               u64 ofs;
+
+               /* peek at request from block layer */
+               if (!rq)
+                       break;
+
+               dout("fetched request\n");
+
+               /* filter out block requests we don't understand */
+               if ((rq->cmd_type != REQ_TYPE_FS)) {
+                       __blk_end_request_all(rq, 0);
+                       goto next;
+               }
+
+               /* deduce our operation (read, write) */
+               do_write = (rq_data_dir(rq) == WRITE);
+
+               /* request extent in bytes; blk_rq_pos() is in 512-byte sectors */
+               size = blk_rq_bytes(rq);
+               ofs = blk_rq_pos(rq) * 512ULL;
+               rq_bio = rq->bio;
+               if (do_write && rbd_dev->read_only) {
+                       __blk_end_request_all(rq, -EROFS);
+                       goto next;
+               }
+
+               /* splitting and submission below run without the queue lock */
+               spin_unlock_irq(q->queue_lock);
+
+               dout("%s 0x%x bytes at 0x%llx\n",
+                    do_write ? "write" : "read",
+                    size, blk_rq_pos(rq) * 512ULL);
+
+               do {
+                       /* a bio clone to be passed down to OSD req */
+                       dout("rq->bio->bi_vcnt=%d\n", rq->bio->bi_vcnt);
+                       /* NULL name/offset outputs: only the piece length is needed */
+                       op_size = rbd_get_segment(&rbd_dev->header,
+                                                 rbd_dev->header.block_name,
+                                                 ofs, size,
+                                                 NULL, NULL);
+                       bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
+                                             op_size, GFP_ATOMIC);
+                       if (!bio) {
+                               /*
+                                * NOTE(review): this path ends the whole
+                                * request but does not release bp, and any
+                                * pieces already submitted for this rq are
+                                * still in flight -- confirm this is safe.
+                                */
+                               spin_lock_irq(q->queue_lock);
+                               __blk_end_request_all(rq, -ENOMEM);
+                               goto next;
+                       }
+
+                       /* init OSD command: write or read */
+                       /* NOTE(review): the submit return value is ignored */
+                       if (do_write)
+                               rbd_req_write(rq, rbd_dev,
+                                             rbd_dev->header.snapc,
+                                             ofs,
+                                             op_size, bio);
+                       else
+                               rbd_req_read(rq, rbd_dev,
+                                            cur_snap_id(rbd_dev),
+                                            ofs,
+                                            op_size, bio);
+
+                       /* advance to the next piece of this request */
+                       size -= op_size;
+                       ofs += op_size;
+
+                       rq_bio = next_bio;
+               } while (size > 0);
+
+               if (bp)
+                       bio_pair_release(bp);
+
+               spin_lock_irq(q->queue_lock);
+next:
+               rq = blk_fetch_request(q);
+       }
+}
+
+/*
+ * a queue callback. Makes sure that we don't create a bio that spans across
+ * multiple osd objects. The one exception is a single-page bio,
+ * which we handle later in bio_chain_clone
+ *
+ * Returns the number of bytes of bvec that may still be added to the bio
+ * described by bmd.
+ */
+static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
+                         struct bio_vec *bvec)
+{
+       struct rbd_device *rbd_dev = q->queuedata;
+       /* object size in 512-byte sectors: 2^obj_order bytes / 512 */
+       unsigned int chunk_sectors = 1 << (rbd_dev->header.obj_order - 9);
+       sector_t sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
+       unsigned int bio_sectors = bmd->bi_size >> 9;
+       int max;
+
+       /*
+        * bytes left in the current object after the bio's existing
+        * contents: object size - (offset within object + bio size)
+        */
+       max =  (chunk_sectors - ((sector & (chunk_sectors - 1))
+                                + bio_sectors)) << 9;
+       if (max < 0)
+               max = 0; /* bio_add cannot handle a negative return */
+       /* an empty bio always accepts its first bvec, even across a boundary */
+       if (max <= bvec->bv_len && bio_sectors == 0)
+               return bvec->bv_len;
+       return max;
+}
+
+/*
+ * Release the in-memory header and tear down the gendisk and its queue.
+ * Does nothing when no disk was ever attached to rbd_dev.
+ */
+static void rbd_free_disk(struct rbd_device *rbd_dev)
+{
+       struct gendisk *gd = rbd_dev->disk;
+
+       if (gd) {
+               rbd_header_free(&rbd_dev->header);
+
+               /* only a disk that was announced needs del_gendisk() */
+               if (gd->flags & GENHD_FL_UP)
+                       del_gendisk(gd);
+               if (gd->queue)
+                       blk_cleanup_queue(gd->queue);
+               put_disk(gd);
+       }
+}
+
+/*
+ * reload the on-disk header
+ *
+ * The read size depends on the snapshot count and the length of the
+ * snapshot names, which are only known once the header has been parsed.
+ * So read with the current guess, parse, and retry with the sizes just
+ * learned until the snapshot count matches the one the read was sized for.
+ */
+static int rbd_read_header(struct rbd_device *rbd_dev,
+                          struct rbd_image_header *header)
+{
+       struct rbd_image_header_ondisk *dh;
+       u64 snap_names_len = 0;
+       int snap_count = 0;
+       ssize_t rc;
+
+       for (;;) {
+               int len = sizeof(*dh) +
+                         snap_count * sizeof(struct rbd_image_snap_ondisk) +
+                         snap_names_len;
+
+               dh = kmalloc(len, GFP_KERNEL);
+               if (!dh)
+                       return -ENOMEM;
+
+               rc = rbd_req_sync_read(rbd_dev, NULL, CEPH_NOSNAP,
+                                      rbd_dev->obj_md_name,
+                                      0, len, (char *)dh);
+               if (rc < 0)
+                       break;
+
+               rc = rbd_header_from_disk(header, dh, snap_count, GFP_KERNEL);
+               if (rc < 0)
+                       break;
+
+               /* sized correctly: the parsed header is complete */
+               if (snap_count == header->total_snaps)
+                       break;
+
+               /* snapshot count differs: drop this attempt and re-read */
+               snap_count = header->total_snaps;
+               snap_names_len = header->snap_names_len;
+               rbd_header_free(header);
+               kfree(dh);
+       }
+
+       kfree(dh);
+       return rc;
+}
+
+/*
+ * create a snapshot
+ *
+ * Allocates a new snapshot id from the monitor, then asks the "rbd"
+ * object class ("snap_add" method) to record snap_name with that id in
+ * the image's metadata object.  On success the in-memory snap context
+ * seq is set to the new id.
+ *
+ * Returns 0 on success, -EINVAL when the device is mapped to a snapshot
+ * rather than the head, -ENOMEM on allocation failure, -ERANGE if the
+ * encoded arguments do not fit the buffer, or the negative result of the
+ * monitor / OSD call.
+ */
+static int rbd_header_add_snap(struct rbd_device *dev,
+                              const char *snap_name,
+                              gfp_t gfp_flags)
+{
+       int name_len = strlen(snap_name);
+       u64 new_snapid;
+       int ret;
+       void *data, *data_start, *data_end;
+
+       /* we should create a snapshot only if we're pointing at the head */
+       if (dev->cur_snap)
+               return -EINVAL;
+
+       ret = ceph_monc_create_snapid(&dev->client->monc, dev->poolid,
+                                     &new_snapid);
+       if (ret < 0)
+               return ret;
+       /* log only after the check: new_snapid is not set on failure */
+       dout("created snapid=%lld\n", new_snapid);
+
+       /* room for the encoded name plus the 64-bit snapshot id */
+       data = kmalloc(name_len + 16, gfp_flags);
+       if (!data)
+               return -ENOMEM;
+
+       data_start = data;
+       data_end = data + name_len + 16;
+
+       /* the _safe encoders advance data and jump to bad on overflow */
+       ceph_encode_string_safe(&data, data_end, snap_name, name_len, bad);
+       ceph_encode_64_safe(&data, data_end, new_snapid, bad);
+
+       ret = rbd_req_sync_exec(dev, dev->obj_md_name, "rbd", "snap_add",
+                               data_start, data - data_start);
+
+       kfree(data_start);
+
+       if (ret < 0)
+               return ret;
+
+       dev->header.snapc->seq =  new_snapid;
+
+       return 0;
+bad:
+       /* encoding overflowed: the buffer is still ours to free */
+       kfree(data_start);
+       return -ERANGE;
+}
+
+/*
+ * re-read the on-disk header and swap its snapshot information (snap
+ * context, names, sizes, count) into the device's in-memory header,
+ * keeping the snap context seq the device had before the refresh
+ */
+static int rbd_update_snaps(struct rbd_device *rbd_dev)
+{
+       struct rbd_image_header *cur = &rbd_dev->header;
+       struct rbd_image_header fresh;
+       u64 saved_seq;
+       int rc;
+
+       rc = rbd_read_header(rbd_dev, &fresh);
+       if (rc < 0)
+               return rc;
+
+       down_write(&cur->snap_rwsem);
+
+       saved_seq = cur->snapc->seq;
+
+       /* drop the old snapshot data before adopting the new pointers */
+       kfree(cur->snapc);
+       kfree(cur->snap_names);
+       kfree(cur->snap_sizes);
+
+       cur->total_snaps = fresh.total_snaps;
+       cur->snapc = fresh.snapc;
+       cur->snap_names = fresh.snap_names;
+       cur->snap_sizes = fresh.snap_sizes;
+       cur->snapc->seq = saved_seq;
+
+       up_write(&cur->snap_rwsem);
+
+       return 0;
+}
+
+/*
+ * Read the image header, select the mapped snapshot, then allocate,
+ * configure and announce the gendisk and request queue for rbd_dev.
+ *
+ * Returns 0 on success, -ENOMEM if the disk or queue cannot be
+ * allocated, or the error from reading the header / selecting the
+ * snapshot.
+ */
+static int rbd_init_disk(struct rbd_device *rbd_dev)
+{
+       struct gendisk *disk;
+       struct request_queue *q;
+       int rc;
+       u64 total_size = 0;
+
+       /* contact OSD, request size info about the object being mapped */
+       rc = rbd_read_header(rbd_dev, &rbd_dev->header);
+       if (rc)
+               return rc;
+
+       /*
+        * NOTE(review): the failure paths from here on return without
+        * calling rbd_header_free() on the header just read -- confirm
+        * whether the caller is expected to release it.
+        */
+       rc = rbd_header_set_snap(rbd_dev, rbd_dev->snap_name, &total_size);
+       if (rc)
+               return rc;
+
+       /* create gendisk info */
+       rc = -ENOMEM;
+       disk = alloc_disk(RBD_MINORS_PER_MAJOR);
+       if (!disk)
+               goto out;
+
+       sprintf(disk->disk_name, DRV_NAME "%d", rbd_dev->id);
+       disk->major = rbd_dev->major;
+       disk->first_minor = 0;
+       disk->fops = &rbd_bd_ops;
+       disk->private_data = rbd_dev;
+
+       /* init rq */
+       rc = -ENOMEM;
+       q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
+       if (!q)
+               goto out_disk;
+       /* keep bios from growing across object boundaries */
+       blk_queue_merge_bvec(q, rbd_merge_bvec);
+       disk->queue = q;
+
+       q->queuedata = rbd_dev;
+
+       rbd_dev->disk = disk;
+       rbd_dev->q = q;
+
+       /* finally, announce the disk to the world */
+       /* capacity is in 512-byte sectors, total_size in bytes */
+       set_capacity(disk, total_size / 512ULL);
+       add_disk(disk);
+
+       pr_info("%s: added with size 0x%llx\n",
+               disk->disk_name, (unsigned long long)total_size);
+       return 0;
+
+out_disk:
+       put_disk(disk);
+out:
+       return rc;
+}
+
+/********************************************************************
+ * /sys/class/rbd/
+ *                   add       map rados objects to blkdev
+ *                   remove    unmap rados objects
+ *                   list      show mappings
+ *******************************************************************/
+
+/*
+ * class_release hook for the rbd class: frees the struct class that
+ * rbd_sysfs_init() allocated with kzalloc().
+ */
+static void class_rbd_release(struct class *cls)
+{
+       kfree(cls);
+}
+
+/*
+ * sysfs show handler for /sys/class/rbd/list: a header line followed by
+ * one line per mapped device.  Output is truncated to fit the PAGE_SIZE
+ * buffer; the return value is the number of bytes actually written.
+ */
+static ssize_t class_rbd_list(struct class *c,
+                             struct class_attribute *attr,
+                             char *data)
+{
+       int n = 0;
+       struct list_head *tmp;
+       int max = PAGE_SIZE;
+
+       mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+       /*
+        * scnprintf() returns the number of characters actually stored
+        * (snprintf() returns the would-be length), so n never runs past
+        * the buffer and max - n stays positive.
+        */
+       n += scnprintf(data, max,
+                      "#id\tmajor\tclient_name\tpool\tname\tsnap\tKB\n");
+
+       list_for_each(tmp, &rbd_dev_list) {
+               struct rbd_device *rbd_dev;
+
+               /* buffer full: scnprintf() keeps one byte for the NUL */
+               if (n >= max - 1)
+                       break;
+
+               rbd_dev = list_entry(tmp, struct rbd_device, node);
+               n += scnprintf(data+n, max-n,
+                              "%d\t%d\tclient%lld\t%s\t%s\t%s\t%lld\n",
+                              rbd_dev->id,
+                              rbd_dev->major,
+                              ceph_client_id(rbd_dev->client),
+                              rbd_dev->pool_name,
+                              rbd_dev->obj, rbd_dev->snap_name,
+                              rbd_dev->header.image_size >> 10);
+       }
+
+       mutex_unlock(&ctl_mutex);
+       return n;
+}
+
+/*
+ * sysfs store handler for /sys/class/rbd/add.
+ *
+ * Input: "<mon_addrs> <options> <pool> <image> [<snap>]".  Creates a new
+ * rbd_device, links it into rbd_dev_list under a fresh id, obtains a
+ * ceph client, resolves the pool, registers a block major and brings up
+ * the disk.  A module reference is taken for the lifetime of the mapping
+ * and dropped again on every failure path.
+ *
+ * Returns count on success or a negative errno.
+ */
+static ssize_t class_rbd_add(struct class *c,
+                            struct class_attribute *attr,
+                            const char *buf, size_t count)
+{
+       struct ceph_osd_client *osdc;
+       struct rbd_device *rbd_dev;
+       ssize_t rc = -ENOMEM;
+       int irc, new_id = 0;
+       struct list_head *tmp;
+       char *mon_dev_name;
+       char *options;
+
+       if (!try_module_get(THIS_MODULE))
+               return -ENODEV;
+
+       /*
+        * "%<N>s" stores up to N characters plus a terminating NUL, so
+        * the scan targets need N + 1 bytes.
+        */
+       mon_dev_name = kmalloc(RBD_MAX_OPT_LEN + 1, GFP_KERNEL);
+       if (!mon_dev_name)
+               goto err_out_mod;
+
+       options = kmalloc(RBD_MAX_OPT_LEN + 1, GFP_KERNEL);
+       if (!options)
+               goto err_mon_dev;
+
+       /* new rbd_device object */
+       rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
+       if (!rbd_dev)
+               goto err_out_opt;
+
+       /* static rbd_device initialization */
+       spin_lock_init(&rbd_dev->lock);
+       INIT_LIST_HEAD(&rbd_dev->node);
+
+       /* generate unique id: find highest unique id, add one */
+       mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+       list_for_each(tmp, &rbd_dev_list) {
+               struct rbd_device *other;
+
+               other = list_entry(tmp, struct rbd_device, node);
+               if (other->id >= new_id)
+                       new_id = other->id + 1;
+       }
+
+       rbd_dev->id = new_id;
+
+       /* add to global list */
+       list_add_tail(&rbd_dev->node, &rbd_dev_list);
+
+       /*
+        * parse add command; the snapshot name is optional, hence < 4.
+        * NOTE(review): pool_name, obj and snap_name must each hold their
+        * field width + 1 bytes; their declarations are not visible here
+        * -- confirm.
+        */
+       if (sscanf(buf, "%" __stringify(RBD_MAX_OPT_LEN) "s "
+                  "%" __stringify(RBD_MAX_OPT_LEN) "s "
+                  "%" __stringify(RBD_MAX_POOL_NAME_LEN) "s "
+                  "%" __stringify(RBD_MAX_OBJ_NAME_LEN) "s"
+                  "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s",
+                  mon_dev_name, options, rbd_dev->pool_name,
+                  rbd_dev->obj, rbd_dev->snap_name) < 4) {
+               rc = -EINVAL;
+               goto err_out_slot;
+       }
+
+       /* no snapshot given: mark it with '-' (rbd_dev was zeroed) */
+       if (rbd_dev->snap_name[0] == 0)
+               rbd_dev->snap_name[0] = '-';
+
+       rbd_dev->obj_len = strlen(rbd_dev->obj);
+       snprintf(rbd_dev->obj_md_name, sizeof(rbd_dev->obj_md_name), "%s%s",
+                rbd_dev->obj, RBD_SUFFIX);
+
+       /* initialize rest of new object */
+       snprintf(rbd_dev->name, DEV_NAME_LEN, DRV_NAME "%d", rbd_dev->id);
+       rc = rbd_get_client(rbd_dev, mon_dev_name, options);
+       if (rc < 0)
+               goto err_out_slot;
+
+       mutex_unlock(&ctl_mutex);
+
+       /* pick the pool */
+       osdc = &rbd_dev->client->osdc;
+       rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
+       if (rc < 0)
+               goto err_out_client;
+       rbd_dev->poolid = rc;
+
+       /* register our block device */
+       irc = register_blkdev(0, rbd_dev->name);
+       if (irc < 0) {
+               rc = irc;
+               goto err_out_client;
+       }
+       rbd_dev->major = irc;
+
+       /* set up and announce blkdev mapping */
+       rc = rbd_init_disk(rbd_dev);
+       if (rc)
+               goto err_out_blkdev;
+
+       /*
+        * NOTE(review): mon_dev_name and options are not freed on this
+        * success path.  Unless rbd_get_client() keeps them, they leak on
+        * every successful add -- confirm ownership before freeing here.
+        */
+       return count;
+
+err_out_blkdev:
+       unregister_blkdev(rbd_dev->major, rbd_dev->name);
+err_out_client:
+       rbd_put_client(rbd_dev);
+       /* err_out_slot expects ctl_mutex to be held */
+       mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+err_out_slot:
+       list_del_init(&rbd_dev->node);
+       mutex_unlock(&ctl_mutex);
+
+       kfree(rbd_dev);
+err_out_opt:
+       kfree(options);
+err_mon_dev:
+       kfree(mon_dev_name);
+err_out_mod:
+       dout("Error adding device %s\n", buf);
+       module_put(THIS_MODULE);
+       return rc;
+}
+
+/*
+ * Look up a mapped device by id on rbd_dev_list; NULL when no device
+ * has that id.  Callers in this file hold ctl_mutex around the call.
+ */
+static struct rbd_device *__rbd_get_dev(unsigned long id)
+{
+       struct rbd_device *candidate;
+
+       list_for_each_entry(candidate, &rbd_dev_list, node) {
+               if (candidate->id == id)
+                       return candidate;
+       }
+
+       return NULL;
+}
+
+/*
+ * sysfs store handler for /sys/class/rbd/remove.
+ *
+ * Input: the decimal id of a mapped device.  The device is unlinked
+ * from rbd_dev_list under ctl_mutex and then torn down.  Returns count
+ * on success, -EINVAL for an id that does not fit an int, -ENOENT when
+ * no such device is mapped, or the number parsing error.
+ */
+static ssize_t class_rbd_remove(struct class *c,
+                               struct class_attribute *attr,
+                               const char *buf,
+                               size_t count)
+{
+       struct rbd_device *victim;
+       unsigned long parsed;
+       int target_id;
+       int rc;
+
+       rc = strict_strtoul(buf, 10, &parsed);
+       if (rc)
+               return rc;
+
+       /* convert to int; abort if we lost anything in the conversion */
+       target_id = (int) parsed;
+       if (target_id != parsed)
+               return -EINVAL;
+
+       /* remove object from list immediately */
+       mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+       victim = __rbd_get_dev(target_id);
+       if (!victim) {
+               mutex_unlock(&ctl_mutex);
+               return -ENOENT;
+       }
+       list_del_init(&victim->node);
+
+       mutex_unlock(&ctl_mutex);
+
+       rbd_put_client(victim);
+
+       /* clean up and free blkdev */
+       rbd_free_disk(victim);
+       unregister_blkdev(victim->major, victim->name);
+       kfree(victim);
+
+       /* release module ref */
+       module_put(THIS_MODULE);
+
+       return count;
+}
+
+/*
+ * sysfs show handler for /sys/class/rbd/snaps_list.
+ *
+ * For every mapped device, prints one line for the head and one line per
+ * snapshot; the entry the device is currently mapped to is marked "(*)".
+ * Output is truncated to fit the PAGE_SIZE buffer; the return value is
+ * the number of bytes actually written.
+ */
+static ssize_t class_rbd_snaps_list(struct class *c,
+                             struct class_attribute *attr,
+                             char *data)
+{
+       struct rbd_device *rbd_dev = NULL;
+       struct list_head *tmp;
+       struct rbd_image_header *header;
+       int i, n = 0, max = PAGE_SIZE;
+       int ret;
+
+       mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+       /*
+        * scnprintf() returns the number of characters actually stored
+        * (snprintf() returns the would-be length), so n never runs past
+        * the buffer and max - n stays positive.
+        */
+       n += scnprintf(data, max, "#id\tsnap\tKB\n");
+
+       list_for_each(tmp, &rbd_dev_list) {
+               char *names, *p;
+               struct ceph_snap_context *snapc;
+
+               /*
+                * buffer full: stop before taking snap_rwsem so that no
+                * exit from this loop leaves the semaphore held
+                */
+               if (n >= max - 1)
+                       break;
+
+               rbd_dev = list_entry(tmp, struct rbd_device, node);
+               header = &rbd_dev->header;
+
+               down_read(&header->snap_rwsem);
+
+               names = header->snap_names;
+               snapc = header->snapc;
+
+               n += scnprintf(data + n, max - n, "%d\t%s\t%lld%s\n",
+                              rbd_dev->id, RBD_SNAP_HEAD_NAME,
+                              header->image_size >> 10,
+                              (!rbd_dev->cur_snap ? " (*)" : ""));
+
+               /* snap_names is a run of NUL-terminated strings */
+               p = names;
+               for (i = 0; i < header->total_snaps && n < max - 1;
+                    i++, p += strlen(p) + 1) {
+                       n += scnprintf(data + n, max - n, "%d\t%s\t%lld%s\n",
+                             rbd_dev->id, p, header->snap_sizes[i] >> 10,
+                             (rbd_dev->cur_snap &&
+                              (snap_index(header, i) == rbd_dev->cur_snap) ?
+                              " (*)" : ""));
+               }
+
+               up_read(&header->snap_rwsem);
+       }
+
+
+       ret = n;
+       mutex_unlock(&ctl_mutex);
+       return ret;
+}
+
+/*
+ * sysfs store handler for /sys/class/rbd/snaps_refresh.
+ *
+ * Input: the decimal id of a mapped device, whose snapshot information
+ * is re-read from the on-disk header.  Returns count on success,
+ * -EINVAL for an id that does not fit an int, -ENOENT when no such
+ * device is mapped, or the error from parsing / refreshing.
+ */
+static ssize_t class_rbd_snaps_refresh(struct class *c,
+                               struct class_attribute *attr,
+                               const char *buf,
+                               size_t count)
+{
+       struct rbd_device *rbd_dev;
+       unsigned long parsed;
+       int target_id;
+       int ret;
+
+       ret = strict_strtoul(buf, 10, &parsed);
+       if (ret)
+               return ret;
+
+       /* convert to int; abort if we lost anything in the conversion */
+       target_id = (int) parsed;
+       if (target_id != parsed)
+               return -EINVAL;
+
+       mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+       rbd_dev = __rbd_get_dev(target_id);
+       if (rbd_dev) {
+               ret = rbd_update_snaps(rbd_dev);
+               if (ret >= 0)
+                       ret = count;
+       } else {
+               ret = -ENOENT;
+       }
+
+       mutex_unlock(&ctl_mutex);
+       return ret;
+}
+
+/*
+ * sysfs store handler for /sys/class/rbd/snap_create.
+ *
+ * Input: "<id> <snap_name>".  Creates the snapshot on the device with
+ * that id and then refreshes the device's snapshot information.
+ * Returns count on success, -ENOMEM, -EINVAL for malformed input,
+ * -ENOENT when no such device is mapped, or the error from creating the
+ * snapshot / refreshing.
+ */
+static ssize_t class_rbd_snap_create(struct class *c,
+                               struct class_attribute *attr,
+                               const char *buf,
+                               size_t count)
+{
+       struct rbd_device *rbd_dev;
+       char *snap_name;
+       int target_id;
+       int ret;
+
+       /* + 1: the scan below stores the name plus a terminating NUL */
+       snap_name = kmalloc(RBD_MAX_SNAP_NAME_LEN + 1, GFP_KERNEL);
+       if (!snap_name)
+               return -ENOMEM;
+
+       /* parse snap create command */
+       if (sscanf(buf, "%d "
+                  "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s",
+                  &target_id,
+                  snap_name) != 2) {
+               ret = -EINVAL;
+               goto out_free;
+       }
+
+       mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+       rbd_dev = __rbd_get_dev(target_id);
+       if (!rbd_dev) {
+               ret = -ENOENT;
+               goto out_unlock;
+       }
+
+       ret = rbd_header_add_snap(rbd_dev, snap_name, GFP_KERNEL);
+       if (ret >= 0)
+               ret = rbd_update_snaps(rbd_dev);
+       if (ret >= 0)
+               ret = count;
+
+out_unlock:
+       mutex_unlock(&ctl_mutex);
+out_free:
+       kfree(snap_name);
+       return ret;
+}
+
+/*
+ * sysfs store handler for /sys/class/rbd/snap_rollback.
+ *
+ * Input: "<id> <snap_name>".  Walks the image segment by segment and
+ * asks the OSDs to roll each segment object back to the named snapshot,
+ * then refreshes the device's snapshot information.  A failure on an
+ * individual object is logged and the walk continues.
+ *
+ * Returns count on success, -EINVAL for malformed input, -ENOMEM,
+ * -ENOENT when no such device is mapped, or the error from the snapshot
+ * lookup / refresh.
+ */
+static ssize_t class_rbd_rollback(struct class *c,
+                               struct class_attribute *attr,
+                               const char *buf,
+                               size_t count)
+{
+       struct rbd_device *rbd_dev = NULL;
+       int target_id, ret;
+       u64 snapid;
+       /* + 1: "%<N>s" stores up to N characters plus a terminating NUL */
+       char snap_name[RBD_MAX_SNAP_NAME_LEN + 1];
+       u64 cur_ofs;
+       char *seg_name;
+
+       /* parse snap rollback command */
+       if (sscanf(buf, "%d "
+                  "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s",
+                  &target_id,
+                  snap_name) != 2) {
+               return -EINVAL;
+       }
+
+       ret = -ENOMEM;
+       seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
+       if (!seg_name)
+               return ret;
+
+       mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+       rbd_dev = __rbd_get_dev(target_id);
+       if (!rbd_dev) {
+               ret = -ENOENT;
+               goto done_unlock;
+       }
+
+       ret = snap_by_name(&rbd_dev->header, snap_name, &snapid, NULL);
+       if (ret < 0)
+               goto done_unlock;
+
+       dout("snapid=%lld\n", snapid);
+
+       /*
+        * NOTE(review): the I/O paths build segment names from
+        * rbd_dev->header.block_name, while this loop passes rbd_dev->obj
+        * -- confirm the rollback targets the same objects.  The loop
+        * also relies on rbd_get_segment() returning a non-zero length.
+        */
+       cur_ofs = 0;
+       while (cur_ofs < rbd_dev->header.image_size) {
+               cur_ofs += rbd_get_segment(&rbd_dev->header,
+                                          rbd_dev->obj,
+                                          cur_ofs, (u64)-1,
+                                          seg_name, NULL);
+               dout("seg_name=%s\n", seg_name);
+
+               ret = rbd_req_sync_rollback_obj(rbd_dev, snapid, seg_name);
+               if (ret < 0)
+                       pr_warning("could not roll back obj %s err=%d\n",
+                                  seg_name, ret);
+       }
+
+       ret = rbd_update_snaps(rbd_dev);
+       if (ret < 0)
+               goto done_unlock;
+
+       ret = count;
+
+done_unlock:
+       mutex_unlock(&ctl_mutex);
+       kfree(seg_name);
+
+       return ret;
+}
+
+/*
+ * Control files of the rbd class: write-only (0200) command files and
+ * read-only (0444) listings, each bound to its store/show handler.
+ */
+static struct class_attribute class_rbd_attrs[] = {
+       __ATTR(add,             0200, NULL, class_rbd_add),
+       __ATTR(remove,          0200, NULL, class_rbd_remove),
+       __ATTR(list,            0444, class_rbd_list, NULL),
+       __ATTR(snaps_refresh,   0200, NULL, class_rbd_snaps_refresh),
+       __ATTR(snap_create,     0200, NULL, class_rbd_snap_create),
+       __ATTR(snaps_list,      0444, class_rbd_snaps_list, NULL),
+       __ATTR(snap_rollback,   0200, NULL, class_rbd_rollback),
+       __ATTR_NULL
+};
+
+/*
+ * create control files in sysfs
+ * /sys/class/rbd/...
+ *
+ * Allocates and registers the rbd class.  Returns 0 on success, -ENOMEM
+ * if the class cannot be allocated, or the class_register() error.
+ */
+static int rbd_sysfs_init(void)
+{
+       int rc;
+
+       class_rbd = kzalloc(sizeof(*class_rbd), GFP_KERNEL);
+       if (!class_rbd)
+               return -ENOMEM;
+
+       class_rbd->name = DRV_NAME;
+       class_rbd->owner = THIS_MODULE;
+       class_rbd->class_release = class_rbd_release;
+       class_rbd->class_attrs = class_rbd_attrs;
+
+       rc = class_register(class_rbd);
+       if (!rc)
+               return 0;
+
+       /* registration failed: drop the class we allocated above */
+       kfree(class_rbd);
+       class_rbd = NULL;
+       pr_err(DRV_NAME ": failed to create class rbd\n");
+       return rc;
+}
+
+/*
+ * Destroy the rbd class if it was registered and clear the global
+ * pointer.
+ */
+static void rbd_sysfs_cleanup(void)
+{
+       if (class_rbd)
+               class_destroy(class_rbd);
+       class_rbd = NULL;
+}
+
+/*
+ * Module entry point: register the sysfs control class and initialise
+ * node_lock.  Returns 0 on success or the rbd_sysfs_init() error.
+ */
+int __init rbd_init(void)
+{
+       int rc;
+
+       rc = rbd_sysfs_init();
+       if (rc)
+               return rc;
+       /*
+        * NOTE(review): node_lock is initialised only after the control
+        * files have been registered -- confirm that no sysfs handler can
+        * reach it first.
+        */
+       spin_lock_init(&node_lock);
+       pr_info("loaded " DRV_NAME_LONG "\n");
+       return 0;
+}
+
+/* Module exit point: remove the sysfs control class. */
+void __exit rbd_exit(void)
+{
+       rbd_sysfs_cleanup();
+}
+
+module_init(rbd_init);
+module_exit(rbd_exit);
+
+MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
+MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
+MODULE_DESCRIPTION("rados block device");
+
+/* following authorship retained from original osdblk.c */
+MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
+
+MODULE_LICENSE("GPL");
diff --git a/drivers/block/rbd_types.h b/drivers/block/rbd_types.h

new file mode 100644 (file)

index 0000000..fc6c678
--- /dev/null
+++ b/drivers/block/rbd_types.h
@@ -0,0 +1,73 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2010 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_RBD_TYPES_H
+#define CEPH_RBD_TYPES_H
+
+#include <linux/types.h>
+
+/*
+ * rbd image 'foo' consists of objects
+ *   foo.rbd      - image metadata
+ *   foo.00000000
+ *   foo.00000001
+ *   ...          - data
+ */
+
+#define RBD_SUFFIX             ".rbd"
+#define RBD_DIRECTORY           "rbd_directory"
+#define RBD_INFO                "rbd_info"
+
+#define RBD_DEFAULT_OBJ_ORDER  22   /* 4MB */
+#define RBD_MIN_OBJ_ORDER       16
+#define RBD_MAX_OBJ_ORDER       30
+
+#define RBD_MAX_OBJ_NAME_LEN   96
+#define RBD_MAX_SEG_NAME_LEN   128
+
+#define RBD_COMP_NONE          0
+#define RBD_CRYPT_NONE         0
+
+#define RBD_HEADER_TEXT                "<<< Rados Block Device Image >>>\n"
+#define RBD_HEADER_SIGNATURE   "RBD"
+#define RBD_HEADER_VERSION     "001.005"
+
+struct rbd_info {
+       __le64 max_id;  /* little-endian; presumably the highest id handed out -- TODO confirm */
+} __attribute__ ((packed));
+
+struct rbd_image_snap_ondisk { /* element type of rbd_image_header_ondisk.snaps[] */
+       __le64 id;
+       __le64 image_size;
+} __attribute__((packed));
+
+struct rbd_image_header_ondisk { /* packed on-disk layout; multi-byte fields are little-endian */
+       char text[40];
+       char block_name[24];
+       char signature[4];
+       char version[8];
+       struct {
+               __u8 order;
+               __u8 crypt_type;
+               __u8 comp_type;
+               __u8 unused;
+       } __attribute__((packed)) options;
+       __le64 image_size;
+       __le64 snap_seq;
+       __le32 snap_count;
+       __le32 reserved;
+       __le64 snap_names_len;
+       struct rbd_image_snap_ondisk snaps[0]; /* variable-length tail: the header reader sizes its read as this struct + snap records + snap_names_len bytes */
+} __attribute__((packed));
+
+
+#endif
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c

index 1101e25..8320490 100644 (file)
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -2,7 +2,6 @@
  #include <linux/spinlock.h>
  #include <linux/slab.h>
  #include <linux/blkdev.h>
-#include <linux/smp_lock.h>
  #include <linux/hdreg.h>
  #include <linux/virtio.h>
  #include <linux/virtio_blk.h>
@@ -222,8 +221,8 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str)
         return err;
  }
  
-static int virtblk_locked_ioctl(struct block_device *bdev, fmode_t mode,
-                        unsigned cmd, unsigned long data)
+static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
+                            unsigned int cmd, unsigned long data)
  {
         struct gendisk *disk = bdev->bd_disk;
         struct virtio_blk *vblk = disk->private_data;
@@ -238,18 +237,6 @@ static int virtblk_locked_ioctl(struct block_device *bdev, fmode_t mode,
                               (void __user *)data);
  }
  
-static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
-                            unsigned int cmd, unsigned long param)
-{
-       int ret;
-
-       lock_kernel();
-       ret = virtblk_locked_ioctl(bdev, mode, cmd, param);
-       unlock_kernel();
-
-       return ret;
-}
-
  /* We provide getgeo only to please some old bootloader/partitioning tools */
  static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo)
  {
diff --git a/drivers/char/agp/Kconfig b/drivers/char/agp/Kconfig

index 4b66c69..5ddf67e 100644 (file)
--- a/drivers/char/agp/Kconfig
+++ b/drivers/char/agp/Kconfig
@@ -57,7 +57,7 @@ config AGP_AMD
  
  config AGP_AMD64
         tristate "AMD Opteron/Athlon64 on-CPU GART support"
-       depends on AGP && X86 && K8_NB
+       depends on AGP && X86 && AMD_NB
         help
           This option gives you AGP support for the GLX component of
           X using the on-CPU northbridge of the AMD Athlon64/Opteron CPUs.
diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c

index 70312da..42396df 100644 (file)
--- a/drivers/char/agp/amd64-agp.c
+++ b/drivers/char/agp/amd64-agp.c
@@ -15,7 +15,7 @@
  #include <linux/mmzone.h>
  #include <asm/page.h>          /* PAGE_SIZE */
  #include <asm/e820.h>
-#include <asm/k8.h>
+#include <asm/amd_nb.h>
  #include <asm/gart.h>
  #include "agp.h"
  
@@ -124,7 +124,7 @@ static int amd64_fetch_size(void)
         u32 temp;
         struct aper_size_info_32 *values;
  
-       dev = k8_northbridges[0];
+       dev = k8_northbridges.nb_misc[0];
         if (dev==NULL)
                 return 0;
  
@@ -181,10 +181,14 @@ static int amd_8151_configure(void)
         unsigned long gatt_bus = virt_to_phys(agp_bridge->gatt_table_real);
         int i;
  
+       if (!k8_northbridges.gart_supported)
+               return 0;
+
         /* Configure AGP regs in each x86-64 host bridge. */
-        for (i = 0; i < num_k8_northbridges; i++) {
+       for (i = 0; i < k8_northbridges.num; i++) {
                 agp_bridge->gart_bus_addr =
-                               amd64_configure(k8_northbridges[i], gatt_bus);
+                               amd64_configure(k8_northbridges.nb_misc[i],
+                                               gatt_bus);
         }
         k8_flush_garts();
         return 0;
@@ -195,11 +199,15 @@ static void amd64_cleanup(void)
  {
         u32 tmp;
         int i;
-        for (i = 0; i < num_k8_northbridges; i++) {
-               struct pci_dev *dev = k8_northbridges[i];
+
+       if (!k8_northbridges.gart_supported)
+               return;
+
+       for (i = 0; i < k8_northbridges.num; i++) {
+               struct pci_dev *dev = k8_northbridges.nb_misc[i];
                 /* disable gart translation */
                 pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &tmp);
-               tmp &= ~AMD64_GARTEN;
+               tmp &= ~GARTEN;
                 pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, tmp);
         }
  }
@@ -313,22 +321,25 @@ static __devinit int fix_northbridge(struct pci_dev *nb, struct pci_dev *agp,
         if (order < 0 || !agp_aperture_valid(aper, (32*1024*1024)<<order))
                 return -1;
  
-       pci_write_config_dword(nb, AMD64_GARTAPERTURECTL, order << 1);
+       gart_set_size_and_enable(nb, order);
         pci_write_config_dword(nb, AMD64_GARTAPERTUREBASE, aper >> 25);
  
         return 0;
  }
  
-static __devinit int cache_nbs (struct pci_dev *pdev, u32 cap_ptr)
+static __devinit int cache_nbs(struct pci_dev *pdev, u32 cap_ptr)
  {
         int i;
  
         if (cache_k8_northbridges() < 0)
                 return -ENODEV;
  
+       if (!k8_northbridges.gart_supported)
+               return -ENODEV;
+
         i = 0;
-       for (i = 0; i < num_k8_northbridges; i++) {
-               struct pci_dev *dev = k8_northbridges[i];
+       for (i = 0; i < k8_northbridges.num; i++) {
+               struct pci_dev *dev = k8_northbridges.nb_misc[i];
                 if (fix_northbridge(dev, pdev, cap_ptr) < 0) {
                         dev_err(&dev->dev, "no usable aperture found\n");
  #ifdef __x86_64__
@@ -405,7 +416,8 @@ static int __devinit uli_agp_init(struct pci_dev *pdev)
         }
  
         /* shadow x86-64 registers into ULi registers */
-       pci_read_config_dword (k8_northbridges[0], AMD64_GARTAPERTUREBASE, &httfea);
+       pci_read_config_dword (k8_northbridges.nb_misc[0], AMD64_GARTAPERTUREBASE,
+                              &httfea);
  
         /* if x86-64 aperture base is beyond 4G, exit here */
         if ((httfea & 0x7fff) >> (32 - 25)) {
@@ -472,7 +484,8 @@ static int nforce3_agp_init(struct pci_dev *pdev)
         pci_write_config_dword(dev1, NVIDIA_X86_64_1_APSIZE, tmp);
  
         /* shadow x86-64 registers into NVIDIA registers */
-       pci_read_config_dword (k8_northbridges[0], AMD64_GARTAPERTUREBASE, &apbase);
+       pci_read_config_dword (k8_northbridges.nb_misc[0], AMD64_GARTAPERTUREBASE,
+                              &apbase);
  
         /* if x86-64 aperture base is beyond 4G, exit here */
         if ( (apbase & 0x7fff) >> (32 - 25) ) {
diff --git a/drivers/char/agp/generic.c b/drivers/char/agp/generic.c

index d2abf51..64255ce 100644 (file)
--- a/drivers/char/agp/generic.c
+++ b/drivers/char/agp/generic.c
@@ -984,7 +984,9 @@ int agp_generic_create_gatt_table(struct agp_bridge_data *bridge)
  
         bridge->driver->cache_flush();
  #ifdef CONFIG_X86
-       set_memory_uc((unsigned long)table, 1 << page_order);
+       if (set_memory_uc((unsigned long)table, 1 << page_order))
+               printk(KERN_WARNING "Could not set GATT table memory to UC!");
+
         bridge->gatt_table = (void *)table;
  #else
         bridge->gatt_table = ioremap_nocache(virt_to_phys(table),
diff --git a/drivers/char/tpm/tpm.c b/drivers/char/tpm/tpm.c

index 05ad4a1..7c41335 100644 (file)
--- a/drivers/char/tpm/tpm.c
+++ b/drivers/char/tpm/tpm.c
@@ -47,6 +47,16 @@ enum tpm_duration {
  #define TPM_MAX_PROTECTED_ORDINAL 12
  #define TPM_PROTECTED_ORDINAL_MASK 0xFF
  
+/*
+ * Bug workaround - some TPM's don't flush the most
+ * recently changed pcr on suspend, so force the flush
+ * with an extend to the selected _unused_ non-volatile pcr.
+ */
+static int tpm_suspend_pcr;
+module_param_named(suspend_pcr, tpm_suspend_pcr, uint, 0644);
+MODULE_PARM_DESC(suspend_pcr,
+                "PCR to use for dummy writes to faciltate flush on suspend.");
+
  static LIST_HEAD(tpm_chip_list);
  static DEFINE_SPINLOCK(driver_lock);
  static DECLARE_BITMAP(dev_mask, TPM_NUM_DEVICES);
@@ -1077,18 +1087,6 @@ static struct tpm_input_header savestate_header = {
         .ordinal = TPM_ORD_SAVESTATE
  };
  
-/* Bug workaround - some TPM's don't flush the most
- * recently changed pcr on suspend, so force the flush
- * with an extend to the selected _unused_ non-volatile pcr.
- */
-static int tpm_suspend_pcr;
-static int __init tpm_suspend_setup(char *str)
-{
-       get_option(&str, &tpm_suspend_pcr);
-       return 1;
-}
-__setup("tpm_suspend_pcr=", tpm_suspend_setup);
-
  /*
   * We are about to suspend. Save the TPM state
   * so that it can be restored.
diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c

index 0f69c5e..6c1b676 100644 (file)
--- a/drivers/char/virtio_console.c
+++ b/drivers/char/virtio_console.c
@@ -48,6 +48,9 @@ struct ports_driver_data {
         /* Used for exporting per-port information to debugfs */
         struct dentry *debugfs_dir;
  
+       /* List of all the devices we're handling */
+       struct list_head portdevs;
+
         /* Number of devices this driver is handling */
         unsigned int index;
  
@@ -108,6 +111,9 @@ struct port_buffer {
   * ports for that device (vdev->priv).
   */
  struct ports_device {
+       /* Next portdev in the list, head is in the pdrvdata struct */
+       struct list_head list;
+
         /*
          * Workqueue handlers where we process deferred work after
          * notification
@@ -178,15 +184,21 @@ struct port {
         struct console cons;
  
         /* Each port associates with a separate char device */
-       struct cdev cdev;
+       struct cdev *cdev;
         struct device *dev;
  
+       /* Reference-counting to handle port hot-unplugs and file operations */
+       struct kref kref;
+
         /* A waitqueue for poll() or blocking read operations */
         wait_queue_head_t waitqueue;
  
         /* The 'name' of the port that we expose via sysfs properties */
         char *name;
  
+       /* We can notify apps of host connect / disconnect events via SIGIO */
+       struct fasync_struct *async_queue;
+
         /* The 'id' to identify the port with the Host */
         u32 id;
  
@@ -221,6 +233,41 @@ out:
         return port;
  }
  
+static struct port *find_port_by_devt_in_portdev(struct ports_device *portdev,
+                                                dev_t dev)
+{
+       struct port *port;
+       unsigned long flags;
+
+       spin_lock_irqsave(&portdev->ports_lock, flags);
+       list_for_each_entry(port, &portdev->ports, list)
+               if (port->cdev->dev == dev)
+                       goto out;
+       port = NULL;
+out:
+       spin_unlock_irqrestore(&portdev->ports_lock, flags);
+
+       return port;
+}
+
+static struct port *find_port_by_devt(dev_t dev)
+{
+       struct ports_device *portdev;
+       struct port *port;
+       unsigned long flags;
+
+       spin_lock_irqsave(&pdrvdata_lock, flags);
+       list_for_each_entry(portdev, &pdrvdata.portdevs, list) {
+               port = find_port_by_devt_in_portdev(portdev, dev);
+               if (port)
+                       goto out;
+       }
+       port = NULL;
+out:
+       spin_unlock_irqrestore(&pdrvdata_lock, flags);
+       return port;
+}
+
  static struct port *find_port_by_id(struct ports_device *portdev, u32 id)
  {
         struct port *port;
@@ -410,7 +457,10 @@ static ssize_t __send_control_msg(struct ports_device *portdev, u32 port_id,
  static ssize_t send_control_msg(struct port *port, unsigned int event,
                                 unsigned int value)
  {
-       return __send_control_msg(port->portdev, port->id, event, value);
+       /* Did the port get unplugged before userspace closed it? */
+       if (port->portdev)
+               return __send_control_msg(port->portdev, port->id, event, value);
+       return 0;
  }
  
  /* Callers must take the port->outvq_lock */
@@ -525,6 +575,10 @@ static ssize_t fill_readbuf(struct port *port, char *out_buf, size_t out_count,
  /* The condition that must be true for polling to end */
  static bool will_read_block(struct port *port)
  {
+       if (!port->guest_connected) {
+               /* Port got hot-unplugged. Let's exit. */
+               return false;
+       }
         return !port_has_data(port) && port->host_connected;
  }
  
@@ -575,6 +629,9 @@ static ssize_t port_fops_read(struct file *filp, char __user *ubuf,
                 if (ret < 0)
                         return ret;
         }
+       /* Port got hot-unplugged. */
+       if (!port->guest_connected)
+               return -ENODEV;
         /*
          * We could've received a disconnection message while we were
          * waiting for more data.
@@ -616,6 +673,9 @@ static ssize_t port_fops_write(struct file *filp, const char __user *ubuf,
                 if (ret < 0)
                         return ret;
         }
+       /* Port got hot-unplugged. */
+       if (!port->guest_connected)
+               return -ENODEV;
  
         count = min((size_t)(32 * 1024), count);
  
@@ -656,6 +716,10 @@ static unsigned int port_fops_poll(struct file *filp, poll_table *wait)
         port = filp->private_data;
         poll_wait(filp, &port->waitqueue, wait);
  
+       if (!port->guest_connected) {
+               /* Port got unplugged */
+               return POLLHUP;
+       }
         ret = 0;
         if (!will_read_block(port))
                 ret |= POLLIN | POLLRDNORM;
@@ -667,6 +731,8 @@ static unsigned int port_fops_poll(struct file *filp, poll_table *wait)
         return ret;
  }
  
+static void remove_port(struct kref *kref);
+
  static int port_fops_release(struct inode *inode, struct file *filp)
  {
         struct port *port;
@@ -687,6 +753,16 @@ static int port_fops_release(struct inode *inode, struct file *filp)
         reclaim_consumed_buffers(port);
         spin_unlock_irq(&port->outvq_lock);
  
+       /*
+        * Locks aren't necessary here as a port can't be opened after
+        * unplug, and if a port isn't unplugged, a kref would already
+        * exist for the port.  Plus, taking ports_lock here would
+        * create a dependency on other locks taken by functions
+        * inside remove_port if we're the last holder of the port,
+        * creating many problems.
+        */
+       kref_put(&port->kref, remove_port);
+
         return 0;
  }
  
@@ -694,22 +770,31 @@ static int port_fops_open(struct inode *inode, struct file *filp)
  {
         struct cdev *cdev = inode->i_cdev;
         struct port *port;
+       int ret;
  
-       port = container_of(cdev, struct port, cdev);
+       port = find_port_by_devt(cdev->dev);
         filp->private_data = port;
  
+       /* Protect against a port getting hot-unplugged at the same time */
+       spin_lock_irq(&port->portdev->ports_lock);
+       kref_get(&port->kref);
+       spin_unlock_irq(&port->portdev->ports_lock);
+
         /*
          * Don't allow opening of console port devices -- that's done
          * via /dev/hvc
          */
-       if (is_console_port(port))
-               return -ENXIO;
+       if (is_console_port(port)) {
+               ret = -ENXIO;
+               goto out;
+       }
  
         /* Allow only one process to open a particular port at a time */
         spin_lock_irq(&port->inbuf_lock);
         if (port->guest_connected) {
                 spin_unlock_irq(&port->inbuf_lock);
-               return -EMFILE;
+               ret = -EMFILE;
+               goto out;
         }
  
         port->guest_connected = true;
@@ -724,10 +809,23 @@ static int port_fops_open(struct inode *inode, struct file *filp)
         reclaim_consumed_buffers(port);
         spin_unlock_irq(&port->outvq_lock);
  
+       nonseekable_open(inode, filp);
+
         /* Notify host of port being opened */
         send_control_msg(filp->private_data, VIRTIO_CONSOLE_PORT_OPEN, 1);
  
         return 0;
+out:
+       kref_put(&port->kref, remove_port);
+       return ret;
+}
+
+static int port_fops_fasync(int fd, struct file *filp, int mode)
+{
+       struct port *port;
+
+       port = filp->private_data;
+       return fasync_helper(fd, filp, mode, &port->async_queue);
  }
  
  /*
@@ -743,6 +841,8 @@ static const struct file_operations port_fops = {
         .write = port_fops_write,
         .poll  = port_fops_poll,
         .release = port_fops_release,
+       .fasync = port_fops_fasync,
+       .llseek = no_llseek,
  };
  
  /*
@@ -1001,6 +1101,12 @@ static unsigned int fill_queue(struct virtqueue *vq, spinlock_t *lock)
         return nr_added_bufs;
  }
  
+static void send_sigio_to_port(struct port *port)
+{
+       if (port->async_queue && port->guest_connected)
+               kill_fasync(&port->async_queue, SIGIO, POLL_OUT);
+}
+
  static int add_port(struct ports_device *portdev, u32 id)
  {
         char debugfs_name[16];
@@ -1015,6 +1121,7 @@ static int add_port(struct ports_device *portdev, u32 id)
                 err = -ENOMEM;
                 goto fail;
         }
+       kref_init(&port->kref);
  
         port->portdev = portdev;
         port->id = id;
@@ -1022,6 +1129,7 @@ static int add_port(struct ports_device *portdev, u32 id)
         port->name = NULL;
         port->inbuf = NULL;
         port->cons.hvc = NULL;
+       port->async_queue = NULL;
  
         port->cons.ws.ws_row = port->cons.ws.ws_col = 0;
  
@@ -1032,14 +1140,20 @@ static int add_port(struct ports_device *portdev, u32 id)
         port->in_vq = portdev->in_vqs[port->id];
         port->out_vq = portdev->out_vqs[port->id];
  
-       cdev_init(&port->cdev, &port_fops);
+       port->cdev = cdev_alloc();
+       if (!port->cdev) {
+               dev_err(&port->portdev->vdev->dev, "Error allocating cdev\n");
+               err = -ENOMEM;
+               goto free_port;
+       }
+       port->cdev->ops = &port_fops;
  
         devt = MKDEV(portdev->chr_major, id);
-       err = cdev_add(&port->cdev, devt, 1);
+       err = cdev_add(port->cdev, devt, 1);
         if (err < 0) {
                 dev_err(&port->portdev->vdev->dev,
                         "Error %d adding cdev for port %u\n", err, id);
-               goto free_port;
+               goto free_cdev;
         }
         port->dev = device_create(pdrvdata.class, &port->portdev->vdev->dev,
                                   devt, port, "vport%up%u",
@@ -1104,7 +1218,7 @@ free_inbufs:
  free_device:
         device_destroy(pdrvdata.class, port->dev->devt);
  free_cdev:
-       cdev_del(&port->cdev);
+       cdev_del(port->cdev);
  free_port:
         kfree(port);
  fail:
@@ -1113,21 +1227,45 @@ fail:
         return err;
  }
  
-/* Remove all port-specific data. */
-static int remove_port(struct port *port)
+/* No users remain, remove all port-specific data. */
+static void remove_port(struct kref *kref)
+{
+       struct port *port;
+
+       port = container_of(kref, struct port, kref);
+
+       sysfs_remove_group(&port->dev->kobj, &port_attribute_group);
+       device_destroy(pdrvdata.class, port->dev->devt);
+       cdev_del(port->cdev);
+
+       kfree(port->name);
+
+       debugfs_remove(port->debugfs_file);
+
+       kfree(port);
+}
+
+/*
+ * Port got unplugged.  Remove port from portdev's list and drop the
+ * kref reference.  If no userspace has this port opened, it will
+ * result in immediate removal of the port.
+ */
+static void unplug_port(struct port *port)
  {
         struct port_buffer *buf;
  
+       spin_lock_irq(&port->portdev->ports_lock);
+       list_del(&port->list);
+       spin_unlock_irq(&port->portdev->ports_lock);
+
         if (port->guest_connected) {
                 port->guest_connected = false;
                 port->host_connected = false;
                 wake_up_interruptible(&port->waitqueue);
-               send_control_msg(port, VIRTIO_CONSOLE_PORT_OPEN, 0);
-       }
  
-       spin_lock_irq(&port->portdev->ports_lock);
-       list_del(&port->list);
-       spin_unlock_irq(&port->portdev->ports_lock);
+               /* Let the app know the port is going down. */
+               send_sigio_to_port(port);
+       }
  
         if (is_console_port(port)) {
                 spin_lock_irq(&pdrvdata_lock);
@@ -1146,9 +1284,6 @@ static int remove_port(struct port *port)
                 hvc_remove(port->cons.hvc);
  #endif
         }
-       sysfs_remove_group(&port->dev->kobj, &port_attribute_group);
-       device_destroy(pdrvdata.class, port->dev->devt);
-       cdev_del(&port->cdev);
  
         /* Remove unused data this port might have received. */
         discard_port_data(port);
@@ -1159,12 +1294,19 @@ static int remove_port(struct port *port)
         while ((buf = virtqueue_detach_unused_buf(port->in_vq)))
                 free_buf(buf);
  
-       kfree(port->name);
-
-       debugfs_remove(port->debugfs_file);
+       /*
+        * We should just assume the device itself has gone off --
+        * else a close on an open port later will try to send out a
+        * control message.
+        */
+       port->portdev = NULL;
  
-       kfree(port);
-       return 0;
+       /*
+        * Locks around here are not necessary - a port can't be
+        * opened after we removed the port struct from ports_list
+        * above.
+        */
+       kref_put(&port->kref, remove_port);
  }
  
  /* Any private messages that the Host and Guest want to share */
@@ -1203,7 +1345,7 @@ static void handle_control_message(struct ports_device *portdev,
                 add_port(portdev, cpkt->id);
                 break;
         case VIRTIO_CONSOLE_PORT_REMOVE:
-               remove_port(port);
+               unplug_port(port);
                 break;
         case VIRTIO_CONSOLE_CONSOLE_PORT:
                 if (!cpkt->value)
@@ -1245,6 +1387,12 @@ static void handle_control_message(struct ports_device *portdev,
                 spin_lock_irq(&port->outvq_lock);
                 reclaim_consumed_buffers(port);
                 spin_unlock_irq(&port->outvq_lock);
+
+               /*
+                * If the guest is connected, it'll be interested in
+                * knowing the host connection state changed.
+                */
+               send_sigio_to_port(port);
                 break;
         case VIRTIO_CONSOLE_PORT_NAME:
                 /*
@@ -1341,6 +1489,9 @@ static void in_intr(struct virtqueue *vq)
  
         wake_up_interruptible(&port->waitqueue);
  
+       /* Send a SIGIO indicating new data in case the process asked for it */
+       send_sigio_to_port(port);
+
         if (is_console_port(port) && hvc_poll(port->cons.hvc))
                 hvc_kick();
  }
@@ -1577,6 +1728,10 @@ static int __devinit virtcons_probe(struct virtio_device *vdev)
                 add_port(portdev, 0);
         }
  
+       spin_lock_irq(&pdrvdata_lock);
+       list_add_tail(&portdev->list, &pdrvdata.portdevs);
+       spin_unlock_irq(&pdrvdata_lock);
+
         __send_control_msg(portdev, VIRTIO_CONSOLE_BAD_ID,
                            VIRTIO_CONSOLE_DEVICE_READY, 1);
         return 0;
@@ -1600,23 +1755,41 @@ static void virtcons_remove(struct virtio_device *vdev)
  {
         struct ports_device *portdev;
         struct port *port, *port2;
-       struct port_buffer *buf;
-       unsigned int len;
  
         portdev = vdev->priv;
  
+       spin_lock_irq(&pdrvdata_lock);
+       list_del(&portdev->list);
+       spin_unlock_irq(&pdrvdata_lock);
+
+       /* Disable interrupts for vqs */
+       vdev->config->reset(vdev);
+       /* Finish up work that's lined up */
         cancel_work_sync(&portdev->control_work);
  
         list_for_each_entry_safe(port, port2, &portdev->ports, list)
-               remove_port(port);
+               unplug_port(port);
  
         unregister_chrdev(portdev->chr_major, "virtio-portsdev");
  
-       while ((buf = virtqueue_get_buf(portdev->c_ivq, &len)))
-               free_buf(buf);
+       /*
+        * When yanking out a device, we immediately lose the
+        * (device-side) queues.  So there's no point in keeping the
+        * guest side around till we drop our final reference.  This
+        * also means that any ports which are in an open state will
+        * have to just stop using the port, as the vqs are going
+        * away.
+        */
+       if (use_multiport(portdev)) {
+               struct port_buffer *buf;
+               unsigned int len;
  
-       while ((buf = virtqueue_detach_unused_buf(portdev->c_ivq)))
-               free_buf(buf);
+               while ((buf = virtqueue_get_buf(portdev->c_ivq, &len)))
+                       free_buf(buf);
+
+               while ((buf = virtqueue_detach_unused_buf(portdev->c_ivq)))
+                       free_buf(buf);
+       }
  
         vdev->config->del_vqs(vdev);
         kfree(portdev->in_vqs);
@@ -1663,6 +1836,7 @@ static int __init init(void)
                            PTR_ERR(pdrvdata.debugfs_dir));
         }
         INIT_LIST_HEAD(&pdrvdata.consoles);
+       INIT_LIST_HEAD(&pdrvdata.portdevs);
  
         return register_virtio_driver(&virtio_console);
  }
diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig

index 70bb350..9dbb28b 100644 (file)
--- a/drivers/edac/Kconfig
+++ b/drivers/edac/Kconfig
@@ -39,7 +39,7 @@ config EDAC_DEBUG
           there're four debug levels (x=0,1,2,3 from low to high).
           Usually you should select 'N'.
  
- config EDAC_DECODE_MCE
+config EDAC_DECODE_MCE
         tristate "Decode MCEs in human-readable form (only on AMD for now)"
         depends on CPU_SUP_AMD && X86_MCE
         default y
@@ -51,6 +51,16 @@ config EDAC_DEBUG
           which occur really early upon boot, before the module infrastructure
           has been initialized.
  
+config EDAC_MCE_INJ
+       tristate "Simple MCE injection interface over /sysfs"
+       depends on EDAC_DECODE_MCE
+       default n
+       help
+         This is a simple interface to inject MCEs over /sysfs and test
+         the MCE decoding code in EDAC.
+
+         This is currently AMD-only.
+
  config EDAC_MM_EDAC
         tristate "Main Memory EDAC (Error Detection And Correction) reporting"
         help
@@ -66,13 +76,13 @@ config EDAC_MCE
  
  config EDAC_AMD64
         tristate "AMD64 (Opteron, Athlon64) K8, F10h, F11h"
-       depends on EDAC_MM_EDAC && K8_NB && X86_64 && PCI && EDAC_DECODE_MCE
+       depends on EDAC_MM_EDAC && AMD_NB && X86_64 && PCI && EDAC_DECODE_MCE
         help
           Support for error detection and correction on the AMD 64
           Families of Memory Controllers (K8, F10h and F11h)
  
  config EDAC_AMD64_ERROR_INJECTION
-       bool "Sysfs Error Injection facilities"
+       bool "Sysfs HW Error injection facilities"
         depends on EDAC_AMD64
         help
           Recent Opterons (Family 10h and later) provide for Memory Error
diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile

index ca6b1bb..32c7bc9 100644 (file)
--- a/drivers/edac/Makefile
+++ b/drivers/edac/Makefile
@@ -17,6 +17,9 @@ ifdef CONFIG_PCI
  edac_core-objs += edac_pci.o edac_pci_sysfs.o
  endif
  
+obj-$(CONFIG_EDAC_MCE_INJ)             += mce_amd_inj.o
+
+edac_mce_amd-objs                      := mce_amd.o
  obj-$(CONFIG_EDAC_DECODE_MCE)          += edac_mce_amd.o
  
  obj-$(CONFIG_EDAC_AMD76X)              += amd76x_edac.o
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c

index e7d5d6b..8521401 100644 (file)
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -1,5 +1,5 @@
  #include "amd64_edac.h"
-#include <asm/k8.h>
+#include <asm/amd_nb.h>
  
  static struct edac_pci_ctl_info *amd64_ctl_pci;
  
@@ -2073,11 +2073,18 @@ static inline void __amd64_decode_bus_error(struct mem_ctl_info *mci,
                 amd64_handle_ue(mci, info);
  }
  
-void amd64_decode_bus_error(int node_id, struct err_regs *regs)
+void amd64_decode_bus_error(int node_id, struct mce *m, u32 nbcfg)
  {
         struct mem_ctl_info *mci = mci_lookup[node_id];
+       struct err_regs regs;
  
-       __amd64_decode_bus_error(mci, regs);
+       regs.nbsl  = (u32) m->status;
+       regs.nbsh  = (u32)(m->status >> 32);
+       regs.nbeal = (u32) m->addr;
+       regs.nbeah = (u32)(m->addr >> 32);
+       regs.nbcfg = nbcfg;
+
+       __amd64_decode_bus_error(mci, &regs);
  
         /*
          * Check the UE bit of the NB status high register, if set generate some
@@ -2086,7 +2093,7 @@ void amd64_decode_bus_error(int node_id, struct err_regs *regs)
          *
          * FIXME: this should go somewhere else, if at all.
          */
-       if (regs->nbsh & K8_NBSH_UC_ERR && !report_gart_errors)
+       if (regs.nbsh & K8_NBSH_UC_ERR && !report_gart_errors)
                 edac_mc_handle_ue_no_info(mci, "UE bit is set");
  
  }
@@ -2927,7 +2934,7 @@ static int __init amd64_edac_init(void)
          * to finish initialization of the MC instances.
          */
         err = -ENODEV;
-       for (nb = 0; nb < num_k8_northbridges; nb++) {
+       for (nb = 0; nb < k8_northbridges.num; nb++) {
                 if (!pvt_lookup[nb])
                         continue;
  
diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h

index 613b938..044aee4 100644 (file)
--- a/drivers/edac/amd64_edac.h
+++ b/drivers/edac/amd64_edac.h
@@ -72,7 +72,7 @@
  #include <linux/edac.h>
  #include <asm/msr.h>
  #include "edac_core.h"
-#include "edac_mce_amd.h"
+#include "mce_amd.h"
  
  #define amd64_printk(level, fmt, arg...) \
         edac_printk(level, "amd64", fmt, ##arg)
@@ -482,11 +482,10 @@ extern const char *rrrr_msgs[16];
  extern const char *to_msgs[2];
  extern const char *pp_msgs[4];
  extern const char *ii_msgs[4];
-extern const char *ext_msgs[32];
  extern const char *htlink_msgs[8];
  
  #ifdef CONFIG_EDAC_DEBUG
-#define NUM_DBG_ATTRS 9
+#define NUM_DBG_ATTRS 5
  #else
  #define NUM_DBG_ATTRS 0
  #endif
diff --git a/drivers/edac/amd64_edac_dbg.c b/drivers/edac/amd64_edac_dbg.c

index 59cf2cf..e356228 100644 (file)
--- a/drivers/edac/amd64_edac_dbg.c
+++ b/drivers/edac/amd64_edac_dbg.c
@@ -1,167 +1,16 @@
  #include "amd64_edac.h"
  
-/*
- * accept a hex value and store it into the virtual error register file, field:
- * nbeal and nbeah. Assume virtual error values have already been set for: NBSL,
- * NBSH and NBCFG. Then proceed to map the error values to a MC, CSROW and
- * CHANNEL
- */
-static ssize_t amd64_nbea_store(struct mem_ctl_info *mci, const char *data,
-                               size_t count)
-{
-       struct amd64_pvt *pvt = mci->pvt_info;
-       unsigned long long value;
-       int ret = 0;
-
-       ret = strict_strtoull(data, 16, &value);
-       if (ret != -EINVAL) {
-               debugf0("received NBEA= 0x%llx\n", value);
-
-               /* place the value into the virtual error packet */
-               pvt->ctl_error_info.nbeal = (u32) value;
-               value >>= 32;
-               pvt->ctl_error_info.nbeah = (u32) value;
-
-               /* Process the Mapping request */
-               /* TODO: Add race prevention */
-               amd_decode_nb_mce(pvt->mc_node_id, &pvt->ctl_error_info, 1);
-
-               return count;
-       }
-       return ret;
-}
-
-/* display back what the last NBEA (MCA NB Address (MC4_ADDR)) was written */
-static ssize_t amd64_nbea_show(struct mem_ctl_info *mci, char *data)
-{
-       struct amd64_pvt *pvt = mci->pvt_info;
-       u64 value;
-
-       value = pvt->ctl_error_info.nbeah;
-       value <<= 32;
-       value |= pvt->ctl_error_info.nbeal;
-
-       return sprintf(data, "%llx\n", value);
-}
-
-/* store the NBSL (MCA NB Status Low (MC4_STATUS)) value user desires */
-static ssize_t amd64_nbsl_store(struct mem_ctl_info *mci, const char *data,
-                               size_t count)
-{
-       struct amd64_pvt *pvt = mci->pvt_info;
-       unsigned long value;
-       int ret = 0;
-
-       ret = strict_strtoul(data, 16, &value);
-       if (ret != -EINVAL) {
-               debugf0("received NBSL= 0x%lx\n", value);
-
-               pvt->ctl_error_info.nbsl = (u32) value;
-
-               return count;
-       }
-       return ret;
-}
-
-/* display back what the last NBSL value written */
-static ssize_t amd64_nbsl_show(struct mem_ctl_info *mci, char *data)
-{
-       struct amd64_pvt *pvt = mci->pvt_info;
-       u32 value;
-
-       value = pvt->ctl_error_info.nbsl;
-
-       return sprintf(data, "%x\n", value);
-}
-
-/* store the NBSH (MCA NB Status High) value user desires */
-static ssize_t amd64_nbsh_store(struct mem_ctl_info *mci, const char *data,
-                               size_t count)
-{
-       struct amd64_pvt *pvt = mci->pvt_info;
-       unsigned long value;
-       int ret = 0;
-
-       ret = strict_strtoul(data, 16, &value);
-       if (ret != -EINVAL) {
-               debugf0("received NBSH= 0x%lx\n", value);
-
-               pvt->ctl_error_info.nbsh = (u32) value;
-
-               return count;
-       }
-       return ret;
-}
-
-/* display back what the last NBSH value written */
-static ssize_t amd64_nbsh_show(struct mem_ctl_info *mci, char *data)
-{
-       struct amd64_pvt *pvt = mci->pvt_info;
-       u32 value;
-
-       value = pvt->ctl_error_info.nbsh;
-
-       return sprintf(data, "%x\n", value);
+#define EDAC_DCT_ATTR_SHOW(reg)                                                \
+static ssize_t amd64_##reg##_show(struct mem_ctl_info *mci, char *data)        \
+{                                                                      \
+       struct amd64_pvt *pvt = mci->pvt_info;                          \
+               return sprintf(data, "0x%016llx\n", (u64)pvt->reg);     \
  }
  
-/* accept and store the NBCFG (MCA NB Configuration) value user desires */
-static ssize_t amd64_nbcfg_store(struct mem_ctl_info *mci,
-                                       const char *data, size_t count)
-{
-       struct amd64_pvt *pvt = mci->pvt_info;
-       unsigned long value;
-       int ret = 0;
-
-       ret = strict_strtoul(data, 16, &value);
-       if (ret != -EINVAL) {
-               debugf0("received NBCFG= 0x%lx\n", value);
-
-               pvt->ctl_error_info.nbcfg = (u32) value;
-
-               return count;
-       }
-       return ret;
-}
-
-/* various show routines for the controls of a MCI */
-static ssize_t amd64_nbcfg_show(struct mem_ctl_info *mci, char *data)
-{
-       struct amd64_pvt *pvt = mci->pvt_info;
-
-       return sprintf(data, "%x\n", pvt->ctl_error_info.nbcfg);
-}
-
-
-static ssize_t amd64_dhar_show(struct mem_ctl_info *mci, char *data)
-{
-       struct amd64_pvt *pvt = mci->pvt_info;
-
-       return sprintf(data, "%x\n", pvt->dhar);
-}
-
-
-static ssize_t amd64_dbam_show(struct mem_ctl_info *mci, char *data)
-{
-       struct amd64_pvt *pvt = mci->pvt_info;
-
-       return sprintf(data, "%x\n", pvt->dbam0);
-}
-
-
-static ssize_t amd64_topmem_show(struct mem_ctl_info *mci, char *data)
-{
-       struct amd64_pvt *pvt = mci->pvt_info;
-
-       return sprintf(data, "%llx\n", pvt->top_mem);
-}
-
-
-static ssize_t amd64_topmem2_show(struct mem_ctl_info *mci, char *data)
-{
-       struct amd64_pvt *pvt = mci->pvt_info;
-
-       return sprintf(data, "%llx\n", pvt->top_mem2);
-}
+EDAC_DCT_ATTR_SHOW(dhar);
+EDAC_DCT_ATTR_SHOW(dbam0);
+EDAC_DCT_ATTR_SHOW(top_mem);
+EDAC_DCT_ATTR_SHOW(top_mem2);
  
  static ssize_t amd64_hole_show(struct mem_ctl_info *mci, char *data)
  {
@@ -180,38 +29,6 @@ static ssize_t amd64_hole_show(struct mem_ctl_info *mci, char *data)
   */
  struct mcidev_sysfs_attribute amd64_dbg_attrs[] = {
  
-       {
-               .attr = {
-                       .name = "nbea_ctl",
-                       .mode = (S_IRUGO | S_IWUSR)
-               },
-               .show = amd64_nbea_show,
-               .store = amd64_nbea_store,
-       },
-       {
-               .attr = {
-                       .name = "nbsl_ctl",
-                       .mode = (S_IRUGO | S_IWUSR)
-               },
-               .show = amd64_nbsl_show,
-               .store = amd64_nbsl_store,
-       },
-       {
-               .attr = {
-                       .name = "nbsh_ctl",
-                       .mode = (S_IRUGO | S_IWUSR)
-               },
-               .show = amd64_nbsh_show,
-               .store = amd64_nbsh_store,
-       },
-       {
-               .attr = {
-                       .name = "nbcfg_ctl",
-                       .mode = (S_IRUGO | S_IWUSR)
-               },
-               .show = amd64_nbcfg_show,
-               .store = amd64_nbcfg_store,
-       },
         {
                 .attr = {
                         .name = "dhar",
@@ -225,7 +42,7 @@ struct mcidev_sysfs_attribute amd64_dbg_attrs[] = {
                         .name = "dbam",
                         .mode = (S_IRUGO)
                 },
-               .show = amd64_dbam_show,
+               .show = amd64_dbam0_show,
                 .store = NULL,
         },
         {
@@ -233,7 +50,7 @@ struct mcidev_sysfs_attribute amd64_dbg_attrs[] = {
                         .name = "topmem",
                         .mode = (S_IRUGO)
                 },
-               .show = amd64_topmem_show,
+               .show = amd64_top_mem_show,
                 .store = NULL,
         },
         {
@@ -241,7 +58,7 @@ struct mcidev_sysfs_attribute amd64_dbg_attrs[] = {
                         .name = "topmem2",
                         .mode = (S_IRUGO)
                 },
-               .show = amd64_topmem2_show,
+               .show = amd64_top_mem2_show,
                 .store = NULL,
         },
         {
diff --git a/drivers/edac/edac_device_sysfs.c b/drivers/edac/edac_device_sysfs.c

index 0709681..2941dca 100644 (file)
--- a/drivers/edac/edac_device_sysfs.c
+++ b/drivers/edac/edac_device_sysfs.c
@@ -13,6 +13,7 @@
  #include <linux/ctype.h>
  #include <linux/module.h>
  #include <linux/slab.h>
+#include <linux/edac.h>
  
  #include "edac_core.h"
  #include "edac_module.h"
@@ -235,7 +236,7 @@ int edac_device_register_sysfs_main_kobj(struct edac_device_ctl_info *edac_dev)
         debugf1("%s()\n", __func__);
  
         /* get the /sys/devices/system/edac reference */
-       edac_class = edac_get_edac_class();
+       edac_class = edac_get_sysfs_class();
         if (edac_class == NULL) {
                 debugf1("%s() no edac_class error\n", __func__);
                 err = -ENODEV;
@@ -255,7 +256,7 @@ int edac_device_register_sysfs_main_kobj(struct edac_device_ctl_info *edac_dev)
  
         if (!try_module_get(edac_dev->owner)) {
                 err = -ENODEV;
-               goto err_out;
+               goto err_mod_get;
         }
  
         /* register */
@@ -282,6 +283,9 @@ int edac_device_register_sysfs_main_kobj(struct edac_device_ctl_info *edac_dev)
  err_kobj_reg:
         module_put(edac_dev->owner);
  
+err_mod_get:
+       edac_put_sysfs_class();
+
  err_out:
         return err;
  }
@@ -290,12 +294,11 @@ err_out:
   * edac_device_unregister_sysfs_main_kobj:
   *     the '..../edac/<name>' kobject
   */
-void edac_device_unregister_sysfs_main_kobj(
-                                       struct edac_device_ctl_info *edac_dev)
+void edac_device_unregister_sysfs_main_kobj(struct edac_device_ctl_info *dev)
  {
         debugf0("%s()\n", __func__);
         debugf4("%s() name of kobject is: %s\n",
-               __func__, kobject_name(&edac_dev->kobj));
+               __func__, kobject_name(&dev->kobj));
  
         /*
          * Unregister the edac device's kobject and
@@ -304,7 +307,8 @@ void edac_device_unregister_sysfs_main_kobj(
          *   a) module_put() this module
          *   b) 'kfree' the memory
          */
-       kobject_put(&edac_dev->kobj);
+       kobject_put(&dev->kobj);
+       edac_put_sysfs_class();
  }
  
  /* edac_dev -> instance information */
diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c

index 8aad94d..a413586 100644 (file)
--- a/drivers/edac/edac_mc_sysfs.c
+++ b/drivers/edac/edac_mc_sysfs.c
@@ -11,6 +11,7 @@
  
  #include <linux/ctype.h>
  #include <linux/slab.h>
+#include <linux/edac.h>
  #include <linux/bug.h>
  
  #include "edac_core.h"
@@ -1011,13 +1012,13 @@ void edac_remove_sysfs_mci_device(struct mem_ctl_info *mci)
   */
  int edac_sysfs_setup_mc_kset(void)
  {
-       int err = 0;
+       int err = -EINVAL;
         struct sysdev_class *edac_class;
  
         debugf1("%s()\n", __func__);
  
         /* get the /sys/devices/system/edac class reference */
-       edac_class = edac_get_edac_class();
+       edac_class = edac_get_sysfs_class();
         if (edac_class == NULL) {
                 debugf1("%s() no edac_class error=%d\n", __func__, err);
                 goto fail_out;
@@ -1028,15 +1029,16 @@ int edac_sysfs_setup_mc_kset(void)
         if (!mc_kset) {
                 err = -ENOMEM;
                 debugf1("%s() Failed to register '.../edac/mc'\n", __func__);
-               goto fail_out;
+               goto fail_kset;
         }
  
         debugf1("%s() Registered '.../edac/mc' kobject\n", __func__);
  
         return 0;
  
+fail_kset:
+       edac_put_sysfs_class();
  
-       /* error unwind stack */
  fail_out:
         return err;
  }
@@ -1049,5 +1051,6 @@ fail_out:
  void edac_sysfs_teardown_mc_kset(void)
  {
         kset_unregister(mc_kset);
+       edac_put_sysfs_class();
  }
  
diff --git a/drivers/edac/edac_mce_amd.c b/drivers/edac/edac_mce_amd.c

deleted file mode 100644 (file)

index 9014df6..0000000
--- a/drivers/edac/edac_mce_amd.c
+++ /dev/null
@@ -1,452 +0,0 @@
-#include <linux/module.h>
-#include "edac_mce_amd.h"
-
-static bool report_gart_errors;
-static void (*nb_bus_decoder)(int node_id, struct err_regs *regs);
-
-void amd_report_gart_errors(bool v)
-{
-       report_gart_errors = v;
-}
-EXPORT_SYMBOL_GPL(amd_report_gart_errors);
-
-void amd_register_ecc_decoder(void (*f)(int, struct err_regs *))
-{
-       nb_bus_decoder = f;
-}
-EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
-
-void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *))
-{
-       if (nb_bus_decoder) {
-               WARN_ON(nb_bus_decoder != f);
-
-               nb_bus_decoder = NULL;
-       }
-}
-EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
-
-/*
- * string representation for the different MCA reported error types, see F3x48
- * or MSR0000_0411.
- */
-const char *tt_msgs[] = {        /* transaction type */
-       "instruction",
-       "data",
-       "generic",
-       "reserved"
-};
-EXPORT_SYMBOL_GPL(tt_msgs);
-
-const char *ll_msgs[] = {      /* cache level */
-       "L0",
-       "L1",
-       "L2",
-       "L3/generic"
-};
-EXPORT_SYMBOL_GPL(ll_msgs);
-
-const char *rrrr_msgs[] = {
-       "generic",
-       "generic read",
-       "generic write",
-       "data read",
-       "data write",
-       "inst fetch",
-       "prefetch",
-       "evict",
-       "snoop",
-       "reserved RRRR= 9",
-       "reserved RRRR= 10",
-       "reserved RRRR= 11",
-       "reserved RRRR= 12",
-       "reserved RRRR= 13",
-       "reserved RRRR= 14",
-       "reserved RRRR= 15"
-};
-EXPORT_SYMBOL_GPL(rrrr_msgs);
-
-const char *pp_msgs[] = {      /* participating processor */
-       "local node originated (SRC)",
-       "local node responded to request (RES)",
-       "local node observed as 3rd party (OBS)",
-       "generic"
-};
-EXPORT_SYMBOL_GPL(pp_msgs);
-
-const char *to_msgs[] = {
-       "no timeout",
-       "timed out"
-};
-EXPORT_SYMBOL_GPL(to_msgs);
-
-const char *ii_msgs[] = {      /* memory or i/o */
-       "mem access",
-       "reserved",
-       "i/o access",
-       "generic"
-};
-EXPORT_SYMBOL_GPL(ii_msgs);
-
-/*
- * Map the 4 or 5 (family-specific) bits of Extended Error code to the
- * string table.
- */
-const char *ext_msgs[] = {
-       "K8 ECC error",                                 /* 0_0000b */
-       "CRC error on link",                            /* 0_0001b */
-       "Sync error packets on link",                   /* 0_0010b */
-       "Master Abort during link operation",           /* 0_0011b */
-       "Target Abort during link operation",           /* 0_0100b */
-       "Invalid GART PTE entry during table walk",     /* 0_0101b */
-       "Unsupported atomic RMW command received",      /* 0_0110b */
-       "WDT error: NB transaction timeout",            /* 0_0111b */
-       "ECC/ChipKill ECC error",                       /* 0_1000b */
-       "SVM DEV Error",                                /* 0_1001b */
-       "Link Data error",                              /* 0_1010b */
-       "Link/L3/Probe Filter Protocol error",          /* 0_1011b */
-       "NB Internal Arrays Parity error",              /* 0_1100b */
-       "DRAM Address/Control Parity error",            /* 0_1101b */
-       "Link Transmission error",                      /* 0_1110b */
-       "GART/DEV Table Walk Data error"                /* 0_1111b */
-       "Res 0x100 error",                              /* 1_0000b */
-       "Res 0x101 error",                              /* 1_0001b */
-       "Res 0x102 error",                              /* 1_0010b */
-       "Res 0x103 error",                              /* 1_0011b */
-       "Res 0x104 error",                              /* 1_0100b */
-       "Res 0x105 error",                              /* 1_0101b */
-       "Res 0x106 error",                              /* 1_0110b */
-       "Res 0x107 error",                              /* 1_0111b */
-       "Res 0x108 error",                              /* 1_1000b */
-       "Res 0x109 error",                              /* 1_1001b */
-       "Res 0x10A error",                              /* 1_1010b */
-       "Res 0x10B error",                              /* 1_1011b */
-       "ECC error in L3 Cache Data",                   /* 1_1100b */
-       "L3 Cache Tag error",                           /* 1_1101b */
-       "L3 Cache LRU Parity error",                    /* 1_1110b */
-       "Probe Filter error"                            /* 1_1111b */
-};
-EXPORT_SYMBOL_GPL(ext_msgs);
-
-static void amd_decode_dc_mce(u64 mc0_status)
-{
-       u32 ec  = mc0_status & 0xffff;
-       u32 xec = (mc0_status >> 16) & 0xf;
-
-       pr_emerg("Data Cache Error");
-
-       if (xec == 1 && TLB_ERROR(ec))
-               pr_cont(": %s TLB multimatch.\n", LL_MSG(ec));
-       else if (xec == 0) {
-               if (mc0_status & (1ULL << 40))
-                       pr_cont(" during Data Scrub.\n");
-               else if (TLB_ERROR(ec))
-                       pr_cont(": %s TLB parity error.\n", LL_MSG(ec));
-               else if (MEM_ERROR(ec)) {
-                       u8 ll   = ec & 0x3;
-                       u8 tt   = (ec >> 2) & 0x3;
-                       u8 rrrr = (ec >> 4) & 0xf;
-
-                       /* see F10h BKDG (31116), Table 92. */
-                       if (ll == 0x1) {
-                               if (tt != 0x1)
-                                       goto wrong_dc_mce;
-
-                               pr_cont(": Data/Tag %s error.\n", RRRR_MSG(ec));
-
-                       } else if (ll == 0x2 && rrrr == 0x3)
-                               pr_cont(" during L1 linefill from L2.\n");
-                       else
-                               goto wrong_dc_mce;
-               } else if (BUS_ERROR(ec) && boot_cpu_data.x86 == 0xf)
-                       pr_cont(" during system linefill.\n");
-               else
-                       goto wrong_dc_mce;
-       } else
-               goto wrong_dc_mce;
-
-       return;
-
-wrong_dc_mce:
-       pr_warning("Corrupted DC MCE info?\n");
-}
-
-static void amd_decode_ic_mce(u64 mc1_status)
-{
-       u32 ec  = mc1_status & 0xffff;
-       u32 xec = (mc1_status >> 16) & 0xf;
-
-       pr_emerg("Instruction Cache Error");
-
-       if (xec == 1 && TLB_ERROR(ec))
-               pr_cont(": %s TLB multimatch.\n", LL_MSG(ec));
-       else if (xec == 0) {
-               if (TLB_ERROR(ec))
-                       pr_cont(": %s TLB Parity error.\n", LL_MSG(ec));
-               else if (BUS_ERROR(ec)) {
-                       if (boot_cpu_data.x86 == 0xf &&
-                           (mc1_status & (1ULL << 58)))
-                               pr_cont(" during system linefill.\n");
-                       else
-                               pr_cont(" during attempted NB data read.\n");
-               } else if (MEM_ERROR(ec)) {
-                       u8 ll   = ec & 0x3;
-                       u8 rrrr = (ec >> 4) & 0xf;
-
-                       if (ll == 0x2)
-                               pr_cont(" during a linefill from L2.\n");
-                       else if (ll == 0x1) {
-
-                               switch (rrrr) {
-                               case 0x5:
-                                       pr_cont(": Parity error during "
-                                              "data load.\n");
-                                       break;
-
-                               case 0x7:
-                                       pr_cont(": Copyback Parity/Victim"
-                                               " error.\n");
-                                       break;
-
-                               case 0x8:
-                                       pr_cont(": Tag Snoop error.\n");
-                                       break;
-
-                               default:
-                                       goto wrong_ic_mce;
-                                       break;
-                               }
-                       }
-               } else
-                       goto wrong_ic_mce;
-       } else
-               goto wrong_ic_mce;
-
-       return;
-
-wrong_ic_mce:
-       pr_warning("Corrupted IC MCE info?\n");
-}
-
-static void amd_decode_bu_mce(u64 mc2_status)
-{
-       u32 ec = mc2_status & 0xffff;
-       u32 xec = (mc2_status >> 16) & 0xf;
-
-       pr_emerg("Bus Unit Error");
-
-       if (xec == 0x1)
-               pr_cont(" in the write data buffers.\n");
-       else if (xec == 0x3)
-               pr_cont(" in the victim data buffers.\n");
-       else if (xec == 0x2 && MEM_ERROR(ec))
-               pr_cont(": %s error in the L2 cache tags.\n", RRRR_MSG(ec));
-       else if (xec == 0x0) {
-               if (TLB_ERROR(ec))
-                       pr_cont(": %s error in a Page Descriptor Cache or "
-                               "Guest TLB.\n", TT_MSG(ec));
-               else if (BUS_ERROR(ec))
-                       pr_cont(": %s/ECC error in data read from NB: %s.\n",
-                               RRRR_MSG(ec), PP_MSG(ec));
-               else if (MEM_ERROR(ec)) {
-                       u8 rrrr = (ec >> 4) & 0xf;
-
-                       if (rrrr >= 0x7)
-                               pr_cont(": %s error during data copyback.\n",
-                                       RRRR_MSG(ec));
-                       else if (rrrr <= 0x1)
-                               pr_cont(": %s parity/ECC error during data "
-                                       "access from L2.\n", RRRR_MSG(ec));
-                       else
-                               goto wrong_bu_mce;
-               } else
-                       goto wrong_bu_mce;
-       } else
-               goto wrong_bu_mce;
-
-       return;
-
-wrong_bu_mce:
-       pr_warning("Corrupted BU MCE info?\n");
-}
-
-static void amd_decode_ls_mce(u64 mc3_status)
-{
-       u32 ec  = mc3_status & 0xffff;
-       u32 xec = (mc3_status >> 16) & 0xf;
-
-       pr_emerg("Load Store Error");
-
-       if (xec == 0x0) {
-               u8 rrrr = (ec >> 4) & 0xf;
-
-               if (!BUS_ERROR(ec) || (rrrr != 0x3 && rrrr != 0x4))
-                       goto wrong_ls_mce;
-
-               pr_cont(" during %s.\n", RRRR_MSG(ec));
-       }
-       return;
-
-wrong_ls_mce:
-       pr_warning("Corrupted LS MCE info?\n");
-}
-
-void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors)
-{
-       u32 ec  = ERROR_CODE(regs->nbsl);
-
-       if (!handle_errors)
-               return;
-
-       /*
-        * GART TLB error reporting is disabled by default. Bail out early.
-        */
-       if (TLB_ERROR(ec) && !report_gart_errors)
-               return;
-
-       pr_emerg("Northbridge Error, node %d", node_id);
-
-       /*
-        * F10h, revD can disable ErrCpu[3:0] so check that first and also the
-        * value encoding has changed so interpret those differently
-        */
-       if ((boot_cpu_data.x86 == 0x10) &&
-           (boot_cpu_data.x86_model > 7)) {
-               if (regs->nbsh & K8_NBSH_ERR_CPU_VAL)
-                       pr_cont(", core: %u\n", (u8)(regs->nbsh & 0xf));
-       } else {
-               u8 assoc_cpus = regs->nbsh & 0xf;
-
-               if (assoc_cpus > 0)
-                       pr_cont(", core: %d", fls(assoc_cpus) - 1);
-
-               pr_cont("\n");
-       }
-
-       pr_emerg("%s.\n", EXT_ERR_MSG(regs->nbsl));
-
-       if (BUS_ERROR(ec) && nb_bus_decoder)
-               nb_bus_decoder(node_id, regs);
-}
-EXPORT_SYMBOL_GPL(amd_decode_nb_mce);
-
-static void amd_decode_fr_mce(u64 mc5_status)
-{
-       /* we have only one error signature so match all fields at once. */
-       if ((mc5_status & 0xffff) == 0x0f0f)
-               pr_emerg(" FR Error: CPU Watchdog timer expire.\n");
-       else
-               pr_warning("Corrupted FR MCE info?\n");
-}
-
-static inline void amd_decode_err_code(unsigned int ec)
-{
-       if (TLB_ERROR(ec)) {
-               pr_emerg("Transaction: %s, Cache Level %s\n",
-                        TT_MSG(ec), LL_MSG(ec));
-       } else if (MEM_ERROR(ec)) {
-               pr_emerg("Transaction: %s, Type: %s, Cache Level: %s",
-                        RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec));
-       } else if (BUS_ERROR(ec)) {
-               pr_emerg("Transaction type: %s(%s), %s, Cache Level: %s, "
-                        "Participating Processor: %s\n",
-                         RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec),
-                         PP_MSG(ec));
-       } else
-               pr_warning("Huh? Unknown MCE error 0x%x\n", ec);
-}
-
-static int amd_decode_mce(struct notifier_block *nb, unsigned long val,
-                          void *data)
-{
-       struct mce *m = (struct mce *)data;
-       struct err_regs regs;
-       int node, ecc;
-
-       pr_emerg("MC%d_STATUS: ", m->bank);
-
-       pr_cont("%sorrected error, other errors lost: %s, "
-                "CPU context corrupt: %s",
-                ((m->status & MCI_STATUS_UC) ? "Unc"  : "C"),
-                ((m->status & MCI_STATUS_OVER) ? "yes"  : "no"),
-                ((m->status & MCI_STATUS_PCC) ? "yes" : "no"));
-
-       /* do the two bits[14:13] together */
-       ecc = (m->status >> 45) & 0x3;
-       if (ecc)
-               pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U"));
-
-       pr_cont("\n");
-
-       switch (m->bank) {
-       case 0:
-               amd_decode_dc_mce(m->status);
-               break;
-
-       case 1:
-               amd_decode_ic_mce(m->status);
-               break;
-
-       case 2:
-               amd_decode_bu_mce(m->status);
-               break;
-
-       case 3:
-               amd_decode_ls_mce(m->status);
-               break;
-
-       case 4:
-               regs.nbsl  = (u32) m->status;
-               regs.nbsh  = (u32)(m->status >> 32);
-               regs.nbeal = (u32) m->addr;
-               regs.nbeah = (u32)(m->addr >> 32);
-               node       = amd_get_nb_id(m->extcpu);
-
-               amd_decode_nb_mce(node, &regs, 1);
-               break;
-
-       case 5:
-               amd_decode_fr_mce(m->status);
-               break;
-
-       default:
-               break;
-       }
-
-       amd_decode_err_code(m->status & 0xffff);
-
-       return NOTIFY_STOP;
-}
-
-static struct notifier_block amd_mce_dec_nb = {
-       .notifier_call  = amd_decode_mce,
-};
-
-static int __init mce_amd_init(void)
-{
-       /*
-        * We can decode MCEs for K8, F10h and F11h CPUs:
-        */
-       if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
-               return 0;
-
-       if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11)
-               return 0;
-
-       atomic_notifier_chain_register(&x86_mce_decoder_chain, &amd_mce_dec_nb);
-
-       return 0;
-}
-early_initcall(mce_amd_init);
-
-#ifdef MODULE
-static void __exit mce_amd_exit(void)
-{
-       atomic_notifier_chain_unregister(&x86_mce_decoder_chain, &amd_mce_dec_nb);
-}
-
-MODULE_DESCRIPTION("AMD MCE decoder");
-MODULE_ALIAS("edac-mce-amd");
-MODULE_LICENSE("GPL");
-module_exit(mce_amd_exit);
-#endif
diff --git a/drivers/edac/edac_mce_amd.h b/drivers/edac/edac_mce_amd.h

deleted file mode 100644 (file)

index df23ee0..0000000
--- a/drivers/edac/edac_mce_amd.h
+++ /dev/null
@@ -1,69 +0,0 @@
-#ifndef _EDAC_MCE_AMD_H
-#define _EDAC_MCE_AMD_H
-
-#include <asm/mce.h>
-
-#define ERROR_CODE(x)                  ((x) & 0xffff)
-#define EXT_ERROR_CODE(x)              (((x) >> 16) & 0x1f)
-#define EXT_ERR_MSG(x)                 ext_msgs[EXT_ERROR_CODE(x)]
-
-#define LOW_SYNDROME(x)                        (((x) >> 15) & 0xff)
-#define HIGH_SYNDROME(x)               (((x) >> 24) & 0xff)
-
-#define TLB_ERROR(x)                   (((x) & 0xFFF0) == 0x0010)
-#define MEM_ERROR(x)                   (((x) & 0xFF00) == 0x0100)
-#define BUS_ERROR(x)                   (((x) & 0xF800) == 0x0800)
-
-#define TT(x)                          (((x) >> 2) & 0x3)
-#define TT_MSG(x)                      tt_msgs[TT(x)]
-#define II(x)                          (((x) >> 2) & 0x3)
-#define II_MSG(x)                      ii_msgs[II(x)]
-#define LL(x)                          (((x) >> 0) & 0x3)
-#define LL_MSG(x)                      ll_msgs[LL(x)]
-#define RRRR(x)                                (((x) >> 4) & 0xf)
-#define RRRR_MSG(x)                    rrrr_msgs[RRRR(x)]
-#define TO(x)                          (((x) >> 8) & 0x1)
-#define TO_MSG(x)                      to_msgs[TO(x)]
-#define PP(x)                          (((x) >> 9) & 0x3)
-#define PP_MSG(x)                      pp_msgs[PP(x)]
-
-#define K8_NBSH                                0x4C
-
-#define K8_NBSH_VALID_BIT              BIT(31)
-#define K8_NBSH_OVERFLOW               BIT(30)
-#define K8_NBSH_UC_ERR                 BIT(29)
-#define K8_NBSH_ERR_EN                 BIT(28)
-#define K8_NBSH_MISCV                  BIT(27)
-#define K8_NBSH_VALID_ERROR_ADDR       BIT(26)
-#define K8_NBSH_PCC                    BIT(25)
-#define K8_NBSH_ERR_CPU_VAL            BIT(24)
-#define K8_NBSH_CECC                   BIT(14)
-#define K8_NBSH_UECC                   BIT(13)
-#define K8_NBSH_ERR_SCRUBER            BIT(8)
-
-extern const char *tt_msgs[];
-extern const char *ll_msgs[];
-extern const char *rrrr_msgs[];
-extern const char *pp_msgs[];
-extern const char *to_msgs[];
-extern const char *ii_msgs[];
-extern const char *ext_msgs[];
-
-/*
- * relevant NB regs
- */
-struct err_regs {
-       u32 nbcfg;
-       u32 nbsh;
-       u32 nbsl;
-       u32 nbeah;
-       u32 nbeal;
-};
-
-
-void amd_report_gart_errors(bool);
-void amd_register_ecc_decoder(void (*f)(int, struct err_regs *));
-void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *));
-void amd_decode_nb_mce(int, struct err_regs *, int);
-
-#endif /* _EDAC_MCE_AMD_H */
diff --git a/drivers/edac/edac_module.c b/drivers/edac/edac_module.c

index 7e1374a..be4b075 100644 (file)
--- a/drivers/edac/edac_module.c
+++ b/drivers/edac/edac_module.c
@@ -26,15 +26,6 @@ EXPORT_SYMBOL_GPL(edac_debug_level);
  /* scope is to module level only */
  struct workqueue_struct *edac_workqueue;
  
-/*
- * sysfs object: /sys/devices/system/edac
- *     need to export to other files in this modules
- */
-static struct sysdev_class edac_class = {
-       .name = "edac",
-};
-static int edac_class_valid;
-
  /*
   * edac_op_state_to_string()
   */
@@ -54,60 +45,6 @@ char *edac_op_state_to_string(int opstate)
         return "UNKNOWN";
  }
  
-/*
- * edac_get_edac_class()
- *
- *     return pointer to the edac class of 'edac'
- */
-struct sysdev_class *edac_get_edac_class(void)
-{
-       struct sysdev_class *classptr = NULL;
-
-       if (edac_class_valid)
-               classptr = &edac_class;
-
-       return classptr;
-}
-
-/*
- * edac_register_sysfs_edac_name()
- *
- *     register the 'edac' into /sys/devices/system
- *
- * return:
- *     0  success
- *     !0 error
- */
-static int edac_register_sysfs_edac_name(void)
-{
-       int err;
-
-       /* create the /sys/devices/system/edac directory */
-       err = sysdev_class_register(&edac_class);
-
-       if (err) {
-               debugf1("%s() error=%d\n", __func__, err);
-               return err;
-       }
-
-       edac_class_valid = 1;
-       return 0;
-}
-
-/*
- * sysdev_class_unregister()
- *
- *     unregister the 'edac' from /sys/devices/system
- */
-static void edac_unregister_sysfs_edac_name(void)
-{
-       /* only if currently registered, then unregister it */
-       if (edac_class_valid)
-               sysdev_class_unregister(&edac_class);
-
-       edac_class_valid = 0;
-}
-
  /*
   * edac_workqueue_setup
   *     initialize the edac work queue for polling operations
@@ -153,22 +90,12 @@ static int __init edac_init(void)
          */
         edac_pci_clear_parity_errors();
  
-       /*
-        * perform the registration of the /sys/devices/system/edac class object
-        */
-       if (edac_register_sysfs_edac_name()) {
-               edac_printk(KERN_ERR, EDAC_MC,
-                       "Error initializing 'edac' kobject\n");
-               err = -ENODEV;
-               goto error;
-       }
-
         /*
          * now set up the mc_kset under the edac class object
          */
         err = edac_sysfs_setup_mc_kset();
         if (err)
-               goto sysfs_setup_fail;
+               goto error;
  
         /* Setup/Initialize the workq for this core */
         err = edac_workqueue_setup();
@@ -183,9 +110,6 @@ static int __init edac_init(void)
  workq_fail:
         edac_sysfs_teardown_mc_kset();
  
-sysfs_setup_fail:
-       edac_unregister_sysfs_edac_name();
-
  error:
         return err;
  }
@@ -201,7 +125,6 @@ static void __exit edac_exit(void)
         /* tear down the various subsystems */
         edac_workqueue_teardown();
         edac_sysfs_teardown_mc_kset();
-       edac_unregister_sysfs_edac_name();
  }
  
  /*
diff --git a/drivers/edac/edac_module.h b/drivers/edac/edac_module.h

index 233d479..17aabb7 100644 (file)
--- a/drivers/edac/edac_module.h
+++ b/drivers/edac/edac_module.h
@@ -42,7 +42,6 @@ extern void edac_device_unregister_sysfs_main_kobj(
                                 struct edac_device_ctl_info *edac_dev);
  extern int edac_device_create_sysfs(struct edac_device_ctl_info *edac_dev);
  extern void edac_device_remove_sysfs(struct edac_device_ctl_info *edac_dev);
-extern struct sysdev_class *edac_get_edac_class(void);
  
  /* edac core workqueue: single CPU mode */
  extern struct workqueue_struct *edac_workqueue;
diff --git a/drivers/edac/edac_pci_sysfs.c b/drivers/edac/edac_pci_sysfs.c

index c39697d..023b01c 100644 (file)
--- a/drivers/edac/edac_pci_sysfs.c
+++ b/drivers/edac/edac_pci_sysfs.c
@@ -7,7 +7,7 @@
   *
   */
  #include <linux/module.h>
-#include <linux/sysdev.h>
+#include <linux/edac.h>
  #include <linux/slab.h>
  #include <linux/ctype.h>
  
@@ -354,7 +354,7 @@ static int edac_pci_main_kobj_setup(void)
         /* First time, so create the main kobject and its
          * controls and atributes
          */
-       edac_class = edac_get_edac_class();
+       edac_class = edac_get_sysfs_class();
         if (edac_class == NULL) {
                 debugf1("%s() no edac_class\n", __func__);
                 err = -ENODEV;
@@ -368,7 +368,7 @@ static int edac_pci_main_kobj_setup(void)
         if (!try_module_get(THIS_MODULE)) {
                 debugf1("%s() try_module_get() failed\n", __func__);
                 err = -ENODEV;
-               goto decrement_count_fail;
+               goto mod_get_fail;
         }
  
         edac_pci_top_main_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL);
@@ -403,6 +403,9 @@ kobject_init_and_add_fail:
  kzalloc_fail:
         module_put(THIS_MODULE);
  
+mod_get_fail:
+       edac_put_sysfs_class();
+
  decrement_count_fail:
         /* if are on this error exit, nothing to tear down */
         atomic_dec(&edac_pci_sysfs_refcount);
@@ -429,6 +432,7 @@ static void edac_pci_main_kobj_teardown(void)
                         __func__);
                 kobject_put(edac_pci_top_main_kobj);
         }
+       edac_put_sysfs_class();
  }
  
  /*
diff --git a/drivers/edac/edac_stub.c b/drivers/edac/edac_stub.c

index 20b428a..aab9707 100644 (file)
--- a/drivers/edac/edac_stub.c
+++ b/drivers/edac/edac_stub.c
@@ -3,10 +3,13 @@
   *
   * Author: Dave Jiang <djiang@mvista.com>
   *
- * 2007 (c) MontaVista Software, Inc. This file is licensed under
- * the terms of the GNU General Public License version 2. This program
- * is licensed "as is" without any warranty of any kind, whether express
- * or implied.
+ * 2007 (c) MontaVista Software, Inc.
+ * 2010 (c) Advanced Micro Devices Inc.
+ *         Borislav Petkov <borislav.petkov@amd.com>
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
   *
   */
  #include <linux/module.h>
@@ -23,6 +26,8 @@ EXPORT_SYMBOL_GPL(edac_handlers);
  int edac_err_assert = 0;
  EXPORT_SYMBOL_GPL(edac_err_assert);
  
+static atomic_t edac_class_valid = ATOMIC_INIT(0);
+
  /*
   * called to determine if there is an EDAC driver interested in
   * knowing an event (such as NMI) occurred
@@ -44,3 +49,41 @@ void edac_atomic_assert_error(void)
         edac_err_assert++;
  }
  EXPORT_SYMBOL_GPL(edac_atomic_assert_error);
+
+/*
+ * sysfs object: /sys/devices/system/edac
+ *     need to export to other files
+ */
+struct sysdev_class edac_class = {
+       .name = "edac",
+};
+EXPORT_SYMBOL_GPL(edac_class);
+
+/* return pointer to the 'edac' node in sysfs */
+struct sysdev_class *edac_get_sysfs_class(void)
+{
+       int err = 0;
+
+       if (atomic_read(&edac_class_valid))
+               goto out;
+
+       /* create the /sys/devices/system/edac directory */
+       err = sysdev_class_register(&edac_class);
+       if (err) {
+               printk(KERN_ERR "Error registering toplevel EDAC sysfs dir\n");
+               return NULL;
+       }
+
+out:
+       atomic_inc(&edac_class_valid);
+       return &edac_class;
+}
+EXPORT_SYMBOL_GPL(edac_get_sysfs_class);
+
+void edac_put_sysfs_class(void)
+{
+       /* last user unregisters it */
+       if (atomic_dec_and_test(&edac_class_valid))
+               sysdev_class_unregister(&edac_class);
+}
+EXPORT_SYMBOL_GPL(edac_put_sysfs_class);
diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c

new file mode 100644 (file)

index 0000000..c018109
--- /dev/null
+++ b/drivers/edac/mce_amd.c
@@ -0,0 +1,680 @@
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#include "mce_amd.h"
+
+static struct amd_decoder_ops *fam_ops;
+
+static u8 nb_err_cpumask = 0xf;
+
+static bool report_gart_errors;
+static void (*nb_bus_decoder)(int node_id, struct mce *m, u32 nbcfg);
+
+/*
+ * Set the report_gart_errors flag. amd_filter_mce() drops bank 4 MCEs with
+ * extended error code 0x5 while the flag is false.
+ */
+void amd_report_gart_errors(bool v)
+{
+       report_gart_errors = v;
+}
+EXPORT_SYMBOL_GPL(amd_report_gart_errors);
+
+/*
+ * Install @f as nb_bus_decoder, the callback amd_decode_nb_mce() invokes for
+ * NB extended error codes 0x0/0x8 on families 0xf and 0x10. Any previously
+ * installed callback is overwritten without a check.
+ */
+void amd_register_ecc_decoder(void (*f)(int, struct mce *, u32))
+{
+       nb_bus_decoder = f;
+}
+EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
+
+/*
+ * Remove the installed nb_bus_decoder, if any. A mismatch between @f and the
+ * installed callback only triggers a warning; the callback is cleared
+ * regardless.
+ */
+void amd_unregister_ecc_decoder(void (*f)(int, struct mce *, u32))
+{
+       if (nb_bus_decoder) {
+               WARN_ON(nb_bus_decoder != f);
+
+               nb_bus_decoder = NULL;
+       }
+}
+EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
+
+/*
+ * string representation for the different MCA reported error types, see F3x48
+ * or MSR0000_0411.
+ *
+ * Each table is indexed by the corresponding field of the 16-bit error code,
+ * extracted with the TT()/LL()/RRRR()/PP()/TO()/II() helpers in mce_amd.h.
+ */
+
+/* transaction type, error code bits [3:2] */
+const char *tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" };
+EXPORT_SYMBOL_GPL(tt_msgs);
+
+/* cache level, error code bits [1:0] */
+const char *ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" };
+EXPORT_SYMBOL_GPL(ll_msgs);
+
+/*
+ * memory transaction type, error code bits [7:4]; only 9 of the 16 possible
+ * values have an entry, RRRR_MSG() range-checks before indexing
+ */
+const char *rrrr_msgs[] = {
+       "GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
+};
+EXPORT_SYMBOL_GPL(rrrr_msgs);
+
+/* participating processor, error code bits [10:9] */
+const char *pp_msgs[] = { "SRC", "RES", "OBS", "GEN" };
+EXPORT_SYMBOL_GPL(pp_msgs);
+
+/* request timeout, error code bit 8 */
+const char *to_msgs[] = { "no timeout",        "timed out" };
+EXPORT_SYMBOL_GPL(to_msgs);
+
+/* memory or i/o, error code bits [3:2] (same bits as the transaction type) */
+const char *ii_msgs[] = { "MEM", "RESV", "IO", "GEN" };
+EXPORT_SYMBOL_GPL(ii_msgs);
+
+/*
+ * Descriptions for the F10h-only NB extended error codes. The table is
+ * indexed by f10h_nb_mce() with (xec - offset):
+ *   xec 0xa..0xc   (offset 10) -> entries 0..2
+ *   xec 0xe        (offset 11) -> entry  3
+ *   xec 0x1c..0x1f (offset 24) -> entries 4..7
+ */
+static const char *f10h_nb_mce_desc[] = {
+       "HT link data error",
+       "Protocol error (link, L3, probe filter, etc.)",
+       "Parity error in NB-internal arrays",
+       "Link Retry due to IO link transmission error",
+       "L3 ECC data cache error",
+       "ECC error in L3 cache tag",
+       "L3 LRU parity bits error",
+       "ECC Error in the Probe Filter directory"
+};
+
+/*
+ * F12h data cache decoding: only memory errors at cache level L1 or L2 are
+ * known signatures. Returns true if the error code was decoded (and the
+ * message line finished), false otherwise.
+ */
+static bool f12h_dc_mce(u16 ec)
+{
+       if (!MEM_ERROR(ec))
+               return false;
+
+       switch (ec & 0x3) {
+       case LL_L2:
+               pr_cont("during L1 linefill from L2.\n");
+               return true;
+
+       case LL_L1:
+               pr_cont("Data/Tag %s error.\n", RRRR_MSG(ec));
+               return true;
+
+       default:
+               return false;
+       }
+}
+
+/*
+ * F10h data cache decoding: a generic (R4_GEN) transaction at cache level L1
+ * is reported as a data scrub error, everything else is handed on to the
+ * F12h decoder.
+ */
+static bool f10h_dc_mce(u16 ec)
+{
+       bool l1_generic = ((ec >> 4) & 0xf) == R4_GEN && (ec & 0x3) == LL_L1;
+
+       if (!l1_generic)
+               return f12h_dc_mce(ec);
+
+       pr_cont("during data scrub.\n");
+       return true;
+}
+
+/*
+ * K8 data cache decoding: bus errors are reported as system linefill errors,
+ * the remaining signatures are shared with F10h.
+ */
+static bool k8_dc_mce(u16 ec)
+{
+       if (!BUS_ERROR(ec))
+               return f10h_dc_mce(ec);
+
+       pr_cont("during system linefill.\n");
+       return true;
+}
+
+/*
+ * F14h data cache decoding.
+ *
+ * Handles memory errors (data transactions at cache level L1 only) and bus
+ * errors (memory or I/O accesses at level LL_LG only). Returns true when the
+ * error code matched a known signature and its message was printed, false
+ * otherwise.
+ */
+static bool f14h_dc_mce(u16 ec)
+{
+       u8 r4    = (ec >> 4) & 0xf;
+       u8 ll    = ec & 0x3;
+       u8 tt    = (ec >> 2) & 0x3;
+       /* the II field of bus errors occupies the same bits as TT */
+       u8 ii    = tt;
+       bool ret = true;
+
+       if (MEM_ERROR(ec)) {
+
+               if (tt != TT_DATA || ll != LL_L1)
+                       return false;
+
+               switch (r4) {
+               case R4_DRD:
+               case R4_DWR:
+                       pr_cont("Data/Tag parity error due to %s.\n",
+                               (r4 == R4_DRD ? "load/hw prf" : "store"));
+                       break;
+               case R4_EVICT:
+                       pr_cont("Copyback parity error on a tag miss.\n");
+                       break;
+               case R4_SNOOP:
+                       pr_cont("Tag parity error during snoop.\n");
+                       break;
+               default:
+                       ret = false;
+               }
+       } else if (BUS_ERROR(ec)) {
+
+               if ((ii != II_MEM && ii != II_IO) || ll != LL_LG)
+                       return false;
+
+               /*
+                * NOTE(review): the prefix is emitted before r4 is validated,
+                * so an unknown r4 leaves this line unterminated.
+                */
+               pr_cont("System read data error on a ");
+
+               switch (r4) {
+               case R4_RD:
+                       pr_cont("TLB reload.\n");
+                       break;
+               case R4_DWR:
+                       pr_cont("store.\n");
+                       break;
+               case R4_DRD:
+                       pr_cont("load.\n");
+                       break;
+               default:
+                       ret = false;
+               }
+       } else {
+               ret = false;
+       }
+
+       return ret;
+}
+
+/*
+ * Decode a data cache MCE (bank 0 in amd_decode_mce()). TLB errors are
+ * decoded right here, every other signature goes through the per-family
+ * dc_mce() hook. Undecodable signatures are flagged as corrupted.
+ */
+static void amd_decode_dc_mce(struct mce *m)
+{
+       u16 ec = m->status & 0xffff;
+
+       pr_emerg(HW_ERR "Data Cache Error: ");
+
+       /* TLB error signatures are the same across families */
+       if (TLB_ERROR(ec)) {
+               u8 ext = (m->status >> 16) & 0xf;
+
+               /* only data TLB errors are valid in this bank */
+               if (((ec >> 2) & 0x3) != TT_DATA)
+                       goto wrong_dc_mce;
+
+               pr_cont("%s TLB %s.\n", LL_MSG(ec),
+                       (ext ? "multimatch" : "parity error"));
+               return;
+       }
+
+       if (fam_ops->dc_mce(ec))
+               return;
+
+wrong_dc_mce:
+       pr_emerg(HW_ERR "Corrupted DC MCE info?\n");
+}
+
+/*
+ * K8 instruction cache decoding (also installed for F10h, F11h and F12h by
+ * mce_amd_init()). Only memory errors at cache level L1 or L2 are known
+ * signatures. Returns true if a message was printed, false otherwise.
+ */
+static bool k8_ic_mce(u16 ec)
+{
+       if (!MEM_ERROR(ec))
+               return false;
+
+       switch (ec & 0x3) {
+       case LL_L2:
+               pr_cont("during a linefill from L2.\n");
+               return true;
+
+       case LL_L1:
+               /* decoded further by transaction type below */
+               break;
+
+       default:
+               return false;
+       }
+
+       switch ((ec >> 4) & 0xf) {
+       case R4_IRD:
+               pr_cont("Parity error during data load.\n");
+               return true;
+
+       case R4_EVICT:
+               pr_cont("Copyback Parity/Victim error.\n");
+               return true;
+
+       case R4_SNOOP:
+               pr_cont("Tag Snoop error.\n");
+               return true;
+
+       default:
+               return false;
+       }
+}
+
+/*
+ * F14h instruction cache decoding.
+ *
+ * The only known signatures are memory errors for instruction transactions
+ * at cache level L1, with an instruction read or snoop memory transaction
+ * type. Returns true if the error code was decoded and its message printed,
+ * false for everything else so that the caller flags the MCE as corrupted.
+ *
+ * Nothing is printed unless the whole signature matches: a TT/LL mismatch or
+ * a non-memory error code must not produce a decode message (or silently
+ * count as decoded), in line with f14h_dc_mce() and k8_ic_mce().
+ */
+static bool f14h_ic_mce(u16 ec)
+{
+       u8 ll = ec & 0x3;
+       u8 tt = (ec >> 2) & 0x3;
+       u8 r4 = (ec >> 4) & 0xf;
+
+       if (!MEM_ERROR(ec))
+               return false;
+
+       if (tt != TT_INSTR || ll != LL_L1)
+               return false;
+
+       switch (r4) {
+       case R4_IRD:
+               pr_cont("Data/tag array parity error for a tag hit.\n");
+               return true;
+
+       case R4_SNOOP:
+               pr_cont("Tag error during snoop/victimization.\n");
+               return true;
+
+       default:
+               return false;
+       }
+}
+
+/*
+ * Decode an instruction cache MCE (bank 1 in amd_decode_mce()). TLB and bus
+ * errors are decoded here, the rest goes through the per-family ic_mce()
+ * hook. Undecodable signatures are flagged as corrupted.
+ */
+static void amd_decode_ic_mce(struct mce *m)
+{
+       u16 ec = m->status & 0xffff;
+
+       pr_emerg(HW_ERR "Instruction Cache Error: ");
+
+       if (TLB_ERROR(ec)) {
+               u8 ext = (m->status >> 16) & 0xf;
+
+               pr_cont("%s TLB %s.\n", LL_MSG(ec),
+                       (ext ? "multimatch" : "parity error"));
+               return;
+       }
+
+       if (BUS_ERROR(ec)) {
+               const char *what = "NB data read";
+
+               /* family 0xf with status bit 58 set: system linefill */
+               if (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)))
+                       what = "system linefill";
+
+               pr_cont("during %s.\n", what);
+               return;
+       }
+
+       if (!fam_ops->ic_mce(ec))
+               pr_emerg(HW_ERR "Corrupted IC MCE info?\n");
+}
+
+/*
+ * Decode a bus unit MCE (bank 2 in amd_decode_mce()).
+ *
+ * The extended error code (status bits [19:16]) selects the reporting
+ * structure; for xec 0x0 the error code type (TLB, bus or memory) refines
+ * the message. Anything that matches no branch is flagged as corrupted.
+ */
+static void amd_decode_bu_mce(struct mce *m)
+{
+       u32 ec = m->status & 0xffff;
+       u32 xec = (m->status >> 16) & 0xf;
+
+       pr_emerg(HW_ERR "Bus Unit Error");
+
+       if (xec == 0x1)
+               pr_cont(" in the write data buffers.\n");
+       else if (xec == 0x3)
+               pr_cont(" in the victim data buffers.\n");
+       else if (xec == 0x2 && MEM_ERROR(ec))
+               pr_cont(": %s error in the L2 cache tags.\n", RRRR_MSG(ec));
+       else if (xec == 0x0) {
+               if (TLB_ERROR(ec))
+                       pr_cont(": %s error in a Page Descriptor Cache or "
+                               "Guest TLB.\n", TT_MSG(ec));
+               else if (BUS_ERROR(ec))
+                       pr_cont(": %s/ECC error in data read from NB: %s.\n",
+                               RRRR_MSG(ec), PP_MSG(ec));
+               else if (MEM_ERROR(ec)) {
+                       u8 rrrr = (ec >> 4) & 0xf;
+
+                       /*
+                        * rrrr 0x7 and up (R4_EVICT, R4_SNOOP and beyond) is
+                        * reported as a copyback error; values above 0x8 make
+                        * RRRR_MSG() print "Wrong R4!"
+                        */
+                       if (rrrr >= 0x7)
+                               pr_cont(": %s error during data copyback.\n",
+                                       RRRR_MSG(ec));
+                       /* R4_GEN and R4_RD: L2 data access */
+                       else if (rrrr <= 0x1)
+                               pr_cont(": %s parity/ECC error during data "
+                                       "access from L2.\n", RRRR_MSG(ec));
+                       else
+                               goto wrong_bu_mce;
+               } else
+                       goto wrong_bu_mce;
+       } else
+               goto wrong_bu_mce;
+
+       return;
+
+wrong_bu_mce:
+       pr_emerg(HW_ERR "Corrupted BU MCE info?\n");
+}
+
+/*
+ * Decode a load/store MCE (bank 3 in amd_decode_mce()). The only known
+ * signature is a bus error with extended error code 0x0 on a data read or
+ * data write; family 0x14 is not expected to raise LS MCEs at all.
+ */
+static void amd_decode_ls_mce(struct mce *m)
+{
+       u16 ec = m->status & 0xffff;
+       u8 ext = (m->status >> 16) & 0xf;
+       u8 r4  = (ec >> 4) & 0xf;
+
+       if (boot_cpu_data.x86 == 0x14) {
+               pr_emerg("You shouldn't be seeing an LS MCE on this cpu family,"
+                        " please report on LKML.\n");
+               return;
+       }
+
+       pr_emerg(HW_ERR "Load Store Error");
+
+       if (ext == 0x0 && BUS_ERROR(ec) && (r4 == R4_DRD || r4 == R4_DWR)) {
+               pr_cont(" during %s.\n", RRRR_MSG(ec));
+               return;
+       }
+
+       pr_emerg(HW_ERR "Corrupted LS MCE info?\n");
+}
+
+/*
+ * Decode the northbridge extended error codes known to K8. @ec is unused.
+ * Returns true if @xec was recognized and its message printed, false
+ * otherwise.
+ */
+static bool k8_nb_mce(u16 ec, u8 xec)
+{
+       switch (xec) {
+       case 0x1:
+               pr_cont("CRC error detected on HT link.\n");
+               break;
+
+       case 0x5:
+               pr_cont("Invalid GART PTE entry during GART table walk.\n");
+               break;
+
+       case 0x6:
+               pr_cont("Unsupported atomic RMW received from an IO link.\n");
+               break;
+
+       case 0x0:
+       case 0x8:
+               /* these codes are not decoded on family 0x11 */
+               if (boot_cpu_data.x86 == 0x11)
+                       return false;
+
+               pr_cont("DRAM ECC error detected on the NB.\n");
+               break;
+
+       case 0xd:
+               pr_cont("Parity error on the DRAM addr/ctl signals.\n");
+               break;
+
+       default:
+               return false;
+       }
+
+       return true;
+}
+
+/*
+ * Decode the F10h northbridge extended error codes: everything K8 knows
+ * plus the codes described in f10h_nb_mce_desc[] and the table walk errors
+ * (xec 0xf). Returns true if a message was printed, false otherwise.
+ */
+static bool f10h_nb_mce(u16 ec, u8 xec)
+{
+       u8 idx;
+
+       if (k8_nb_mce(ec, xec))
+               return true;
+
+       /* map the remaining codes onto f10h_nb_mce_desc[] */
+       switch (xec) {
+       case 0xa ... 0xc:
+               idx = xec - 10;
+               break;
+
+       case 0xe:
+               idx = xec - 11;
+               break;
+
+       case 0xf:
+               if (TLB_ERROR(ec)) {
+                       pr_cont("GART Table Walk data error.\n");
+                       return true;
+               }
+
+               if (BUS_ERROR(ec)) {
+                       pr_cont("DMA Exclusion Vector Table Walk error.\n");
+                       return true;
+               }
+
+               return false;
+
+       case 0x1c ... 0x1f:
+               idx = xec - 24;
+               break;
+
+       default:
+               return false;
+       }
+
+       pr_cont("%s.\n", f10h_nb_mce_desc[idx]);
+
+       return true;
+}
+
+/*
+ * nb_mce() hook installed for families 0x12 and 0x14 by mce_amd_init():
+ * decodes nothing, so amd_decode_nb_mce() reports every extended error code
+ * it does not handle itself as corrupted.
+ */
+static bool nb_noop_mce(u16 ec, u8 xec)
+{
+       return false;
+}
+
+/*
+ * Decode a northbridge MCE.
+ *
+ * @node_id: node number, printed and passed on to nb_bus_decoder
+ * @m:       the MCE record; only m->status is read here
+ * @nbcfg:   not interpreted here, passed on to nb_bus_decoder
+ *
+ * Extended error codes common to all families are decoded directly, the
+ * rest goes through the per-family nb_mce() hook. On families 0xf and 0x10
+ * the registered nb_bus_decoder, if any, is called in addition for extended
+ * error codes 0x0 and 0x8.
+ *
+ * NOTE(review): dereferences fam_ops unconditionally - presumably callers
+ * only get here after mce_amd_init() has set it up; confirm.
+ */
+void amd_decode_nb_mce(int node_id, struct mce *m, u32 nbcfg)
+{
+       u8 xec   = (m->status >> 16) & 0x1f;
+       u16 ec   = m->status & 0xffff;
+       /* upper 32 bits of the status word */
+       u32 nbsh = (u32)(m->status >> 32);
+
+       pr_emerg(HW_ERR "Northbridge Error, node %d: ", node_id);
+
+       /*
+        * F10h, revD can disable ErrCpu[3:0] so check that first and also the
+        * value encoding has changed so interpret those differently
+        */
+       if ((boot_cpu_data.x86 == 0x10) &&
+           (boot_cpu_data.x86_model > 7)) {
+               if (nbsh & K8_NBSH_ERR_CPU_VAL)
+                       pr_cont(", core: %u", (u8)(nbsh & nb_err_cpumask));
+       } else {
+               u8 assoc_cpus = nbsh & nb_err_cpumask;
+
+               /* report the highest core set in the mask */
+               if (assoc_cpus > 0)
+                       pr_cont(", core: %d", fls(assoc_cpus) - 1);
+       }
+
+       /* extended error codes decoded the same way on all families */
+       switch (xec) {
+       case 0x2:
+               pr_cont("Sync error (sync packets on HT link detected).\n");
+               return;
+
+       case 0x3:
+               pr_cont("HT Master abort.\n");
+               return;
+
+       case 0x4:
+               pr_cont("HT Target abort.\n");
+               return;
+
+       case 0x7:
+               pr_cont("NB Watchdog timeout.\n");
+               return;
+
+       case 0x9:
+               pr_cont("SVM DMA Exclusion Vector error.\n");
+               return;
+
+       default:
+               break;
+       }
+
+       if (!fam_ops->nb_mce(ec, xec))
+               goto wrong_nb_mce;
+
+       /* xec 0x0/0x8 is what k8_nb_mce() reports as a DRAM ECC error */
+       if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10)
+               if ((xec == 0x8 || xec == 0x0) && nb_bus_decoder)
+                       nb_bus_decoder(node_id, m, nbcfg);
+
+       return;
+
+wrong_nb_mce:
+       pr_emerg(HW_ERR "Corrupted NB MCE info?\n");
+}
+EXPORT_SYMBOL_GPL(amd_decode_nb_mce);
+
+/*
+ * Decode an FR MCE (bank 5 in amd_decode_mce()). Families 0xf and 0x11 are
+ * always treated as corrupted here; on the other families the only known
+ * signature is error code 0x0f0f.
+ */
+static void amd_decode_fr_mce(struct mce *m)
+{
+       bool fr_family = boot_cpu_data.x86 != 0xf &&
+                        boot_cpu_data.x86 != 0x11;
+
+       /* we have only one error signature so match all fields at once. */
+       if (fr_family && (m->status & 0xffff) == 0x0f0f) {
+               pr_emerg(HW_ERR "FR Error: CPU Watchdog timer expire.\n");
+               return;
+       }
+
+       pr_emerg(HW_ERR "Corrupted FR MCE info?\n");
+}
+
+/*
+ * Print the generic breakdown of the 16-bit MCA error code @ec: which fields
+ * get printed depends on whether it is a TLB, memory or bus error code.
+ */
+static inline void amd_decode_err_code(u16 ec)
+{
+       if (TLB_ERROR(ec)) {
+               pr_emerg(HW_ERR "Transaction: %s, Cache Level: %s\n",
+                        TT_MSG(ec), LL_MSG(ec));
+               return;
+       }
+
+       if (MEM_ERROR(ec)) {
+               pr_emerg(HW_ERR "Transaction: %s, Type: %s, Cache Level: %s\n",
+                        RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec));
+               return;
+       }
+
+       if (BUS_ERROR(ec)) {
+               pr_emerg(HW_ERR "Transaction: %s (%s), %s, Cache Level: %s, "
+                        "Participating Processor: %s\n",
+                         RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec),
+                         PP_MSG(ec));
+               return;
+       }
+
+       pr_emerg(HW_ERR "Huh? Unknown MCE error 0x%x\n", ec);
+}
+
+/*
+ * Filter out unwanted MCE signatures here. Returns true if @m is to be
+ * dropped without decoding.
+ */
+static bool amd_filter_mce(struct mce *m)
+{
+       u8 ext = (m->status >> 16) & 0x1f;
+       bool nb_gart_err = (m->bank == 4 && ext == 0x5);
+
+       /*
+        * NB GART TLB error reporting is disabled by default.
+        */
+       return nb_gart_err && !report_gart_errors;
+}
+
+/*
+ * Notifier callback (see amd_mce_dec_nb) which decodes one MCE.
+ *
+ * @nb and @val are unused; @data points to the struct mce to decode. Prints
+ * a summary of the status word, dispatches to the per-bank decoder and
+ * finishes with the generic error code breakdown. Always returns
+ * NOTIFY_STOP, also for MCEs dropped by amd_filter_mce().
+ */
+int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
+{
+       struct mce *m = (struct mce *)data;
+       int node, ecc;
+
+       if (amd_filter_mce(m))
+               return NOTIFY_STOP;
+
+       pr_emerg(HW_ERR "MC%d_STATUS: ", m->bank);
+
+       pr_cont("%sorrected error, other errors lost: %s, "
+                "CPU context corrupt: %s",
+                ((m->status & MCI_STATUS_UC) ? "Unc"  : "C"),
+                ((m->status & MCI_STATUS_OVER) ? "yes"  : "no"),
+                ((m->status & MCI_STATUS_PCC) ? "yes" : "no"));
+
+       /*
+        * do the two bits[14:13] together; these are bits [14:13] of the
+        * upper status half, i.e. status bits [46:45]. A value of 2 is
+        * printed as "CECC Error", any other non-zero value as "UECC Error".
+        */
+       ecc = (m->status >> 45) & 0x3;
+       if (ecc)
+               pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U"));
+
+       pr_cont("\n");
+
+       /* per-bank decoding; unknown banks only get the generic part below */
+       switch (m->bank) {
+       case 0:
+               amd_decode_dc_mce(m);
+               break;
+
+       case 1:
+               amd_decode_ic_mce(m);
+               break;
+
+       case 2:
+               amd_decode_bu_mce(m);
+               break;
+
+       case 3:
+               amd_decode_ls_mce(m);
+               break;
+
+       case 4:
+               /* nbcfg is passed as 0 on this path */
+               node = amd_get_nb_id(m->extcpu);
+               amd_decode_nb_mce(node, m, 0);
+               break;
+
+       case 5:
+               amd_decode_fr_mce(m);
+               break;
+
+       default:
+               break;
+       }
+
+       amd_decode_err_code(m->status & 0xffff);
+
+       return NOTIFY_STOP;
+}
+EXPORT_SYMBOL_GPL(amd_decode_mce);
+
+/* notifier block hooked into x86_mce_decoder_chain by mce_amd_init() */
+static struct notifier_block amd_mce_dec_nb = {
+       .notifier_call  = amd_decode_mce,
+};
+
+/*
+ * Select the per-family decoder ops and hook amd_decode_mce() into the MCE
+ * decoder notifier chain.
+ *
+ * Returns 0 without doing anything on non-AMD CPUs and on families other
+ * than 0xf-0x12 and 0x14 (models 0x0-0xf), 0 after successful registration,
+ * -ENOMEM if the ops structure cannot be allocated and -EINVAL for a family
+ * which passed the filter but has no ops assigned below.
+ */
+static int __init mce_amd_init(void)
+{
+       if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
+               return 0;
+
+       /* bail out unless family 0xf-0x12, or family 0x14 model <= 0xf */
+       if ((boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x12) &&
+           (boot_cpu_data.x86 != 0x14 || boot_cpu_data.x86_model > 0xf))
+               return 0;
+
+       fam_ops = kzalloc(sizeof(*fam_ops), GFP_KERNEL);
+       if (!fam_ops)
+               return -ENOMEM;
+
+       switch (boot_cpu_data.x86) {
+       case 0xf:
+               fam_ops->dc_mce = k8_dc_mce;
+               fam_ops->ic_mce = k8_ic_mce;
+               fam_ops->nb_mce = k8_nb_mce;
+               break;
+
+       case 0x10:
+               fam_ops->dc_mce = f10h_dc_mce;
+               fam_ops->ic_mce = k8_ic_mce;
+               fam_ops->nb_mce = f10h_nb_mce;
+               break;
+
+       case 0x11:
+               fam_ops->dc_mce = k8_dc_mce;
+               fam_ops->ic_mce = k8_ic_mce;
+               fam_ops->nb_mce = f10h_nb_mce;
+               break;
+
+       case 0x12:
+               fam_ops->dc_mce = f12h_dc_mce;
+               fam_ops->ic_mce = k8_ic_mce;
+               fam_ops->nb_mce = nb_noop_mce;
+               break;
+
+       case 0x14:
+               /* narrower ErrCpu mask for amd_decode_nb_mce() */
+               nb_err_cpumask  = 0x3;
+               fam_ops->dc_mce = f14h_dc_mce;
+               fam_ops->ic_mce = f14h_ic_mce;
+               fam_ops->nb_mce = nb_noop_mce;
+               break;
+
+       default:
+               printk(KERN_WARNING "Huh? What family is that: %d?!\n",
+                                   boot_cpu_data.x86);
+               kfree(fam_ops);
+               /* the decoders dereference fam_ops: don't leave it dangling */
+               fam_ops = NULL;
+               return -EINVAL;
+       }
+
+       pr_info("MCE: In-kernel MCE decoding enabled.\n");
+
+       atomic_notifier_chain_register(&x86_mce_decoder_chain, &amd_mce_dec_nb);
+
+       return 0;
+}
+early_initcall(mce_amd_init);
+
+#ifdef MODULE
+/*
+ * Module unload: unhook the decoder from the MCE decoder chain and free the
+ * per-family ops allocated by mce_amd_init().
+ */
+static void __exit mce_amd_exit(void)
+{
+       atomic_notifier_chain_unregister(&x86_mce_decoder_chain, &amd_mce_dec_nb);
+       kfree(fam_ops);
+}
+
+MODULE_DESCRIPTION("AMD MCE decoder");
+MODULE_ALIAS("edac-mce-amd");
+MODULE_LICENSE("GPL");
+module_exit(mce_amd_exit);
+#endif
diff --git a/drivers/edac/mce_amd.h b/drivers/edac/mce_amd.h

new file mode 100644 (file)

index 0000000..35f6e0e
--- /dev/null
+++ b/drivers/edac/mce_amd.h
@@ -0,0 +1,114 @@
+#ifndef _EDAC_MCE_AMD_H
+#define _EDAC_MCE_AMD_H
+
+#include <linux/notifier.h>
+
+#include <asm/mce.h>
+
+/* 64-bit wide single-bit mask */
+#define BIT_64(n)                      (U64_C(1) << (n))
+
+/* error code: bits [15:0]; extended error code: bits [20:16] */
+#define ERROR_CODE(x)                  ((x) & 0xffff)
+#define EXT_ERROR_CODE(x)              (((x) >> 16) & 0x1f)
+
+/* 8-bit syndrome fields at bits [22:15] and [31:24] of their argument */
+#define LOW_SYNDROME(x)                        (((x) >> 15) & 0xff)
+#define HIGH_SYNDROME(x)               (((x) >> 24) & 0xff)
+
+/* error code type, determined by the upper bits of the 16-bit error code */
+#define TLB_ERROR(x)                   (((x) & 0xFFF0) == 0x0010)
+#define MEM_ERROR(x)                   (((x) & 0xFF00) == 0x0100)
+#define BUS_ERROR(x)                   (((x) & 0xF800) == 0x0800)
+
+/*
+ * Error code fields and their string representation. Note that TT and II
+ * extract the same bits [3:2]; which one applies depends on the error code
+ * type.
+ */
+#define TT(x)                          (((x) >> 2) & 0x3)
+#define TT_MSG(x)                      tt_msgs[TT(x)]
+#define II(x)                          (((x) >> 2) & 0x3)
+#define II_MSG(x)                      ii_msgs[II(x)]
+#define LL(x)                          (((x) >> 0) & 0x3)
+#define LL_MSG(x)                      ll_msgs[LL(x)]
+#define TO(x)                          (((x) >> 8) & 0x1)
+#define TO_MSG(x)                      to_msgs[TO(x)]
+#define PP(x)                          (((x) >> 9) & 0x3)
+#define PP_MSG(x)                      pp_msgs[PP(x)]
+
+/*
+ * RRRR is 4 bits wide but only values below 9 have a message, hence the
+ * range check. The argument is evaluated more than once, so it must not
+ * have side effects.
+ */
+#define RRRR(x)                                (((x) >> 4) & 0xf)
+#define RRRR_MSG(x)                    ((RRRR(x) < 9) ?  rrrr_msgs[RRRR(x)] : "Wrong R4!")
+
+#define K8_NBSH                                0x4C
+
+#define K8_NBSH_VALID_BIT              BIT(31)
+#define K8_NBSH_OVERFLOW               BIT(30)
+#define K8_NBSH_UC_ERR                 BIT(29)
+#define K8_NBSH_ERR_EN                 BIT(28)
+#define K8_NBSH_MISCV                  BIT(27)
+#define K8_NBSH_VALID_ERROR_ADDR       BIT(26)
+#define K8_NBSH_PCC                    BIT(25)
+#define K8_NBSH_ERR_CPU_VAL            BIT(24)
+#define K8_NBSH_CECC                   BIT(14)
+#define K8_NBSH_UECC                   BIT(13)
+#define K8_NBSH_ERR_SCRUBER            BIT(8)
+
+/* values of the TT() field, in the order of tt_msgs[] */
+enum tt_ids {
+       TT_INSTR = 0,
+       TT_DATA,
+       TT_GEN,
+       TT_RESV,
+};
+
+/* values of the LL() field, in the order of ll_msgs[] */
+enum ll_ids {
+       LL_RESV = 0,
+       LL_L1,
+       LL_L2,
+       LL_LG,
+};
+
+/* values of the II() field, in the order of ii_msgs[] */
+enum ii_ids {
+       II_MEM = 0,
+       II_RESV,
+       II_IO,
+       II_GEN,
+};
+
+/* values of the RRRR() field, in the order of rrrr_msgs[] */
+enum rrrr_ids {
+       R4_GEN  = 0,
+       R4_RD,
+       R4_WR,
+       R4_DRD,
+       R4_DWR,
+       R4_IRD,
+       R4_PREF,
+       R4_EVICT,
+       R4_SNOOP,
+};
+
+extern const char *tt_msgs[];
+extern const char *ll_msgs[];
+extern const char *rrrr_msgs[];
+extern const char *pp_msgs[];
+extern const char *to_msgs[];
+extern const char *ii_msgs[];
+
+/*
+ * relevant NB regs
+ */
+struct err_regs {
+       u32 nbcfg;
+       u32 nbsh;
+       u32 nbsl;
+       u32 nbeah;
+       u32 nbeal;
+};
+
+/*
+ * per-family decoder ops
+ *
+ * Each hook returns true if it recognized the signature and printed a
+ * message for it, false otherwise.
+ */
+struct amd_decoder_ops {
+       bool (*dc_mce)(u16);            /* data cache: error code */
+       bool (*ic_mce)(u16);            /* instruction cache: error code */
+       bool (*nb_mce)(u16, u8);        /* northbridge: error code, extended error code */
+};
+
+void amd_report_gart_errors(bool);
+void amd_register_ecc_decoder(void (*f)(int, struct mce *, u32));
+void amd_unregister_ecc_decoder(void (*f)(int, struct mce *, u32));
+void amd_decode_nb_mce(int, struct mce *, u32);
+int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data);
+
+#endif /* _EDAC_MCE_AMD_H */
diff --git a/drivers/edac/mce_amd_inj.c b/drivers/edac/mce_amd_inj.c

new file mode 100644 (file)

index 0000000..8d0688f
--- /dev/null
+++ b/drivers/edac/mce_amd_inj.c
@@ -0,0 +1,171 @@
+/*
+ * A simple MCE injection facility for testing the MCE decoding code. This
+ * driver should be built as module so that it can be loaded on production
+ * kernels for testing purposes.
+ *
+ * This file may be distributed under the terms of the GNU General Public
+ * License version 2.
+ *
+ * Copyright (c) 2010:  Borislav Petkov <borislav.petkov@amd.com>
+ *                     Advanced Micro Devices Inc.
+ */
+
+#include <linux/kobject.h>
+#include <linux/sysdev.h>
+#include <linux/edac.h>
+#include <asm/mce.h>
+
+#include "mce_amd.h"
+
+/*
+ * A sysfs attribute of the injection interface together with its show and
+ * store handlers.
+ */
+struct edac_mce_attr {
+       struct attribute attr;
+       ssize_t (*show) (struct kobject *kobj, struct edac_mce_attr *attr, char *buf);
+       ssize_t (*store)(struct kobject *kobj, struct edac_mce_attr *attr,
+                        const char *buf, size_t count);
+};
+
+/* define the static struct edac_mce_attr mce_attr_<_name> */
+#define EDAC_MCE_ATTR(_name, _mode, _show, _store)                     \
+static struct edac_mce_attr mce_attr_##_name = __ATTR(_name, _mode, _show, _store)
+
+static struct kobject *mce_kobj;
+
+/*
+ * Collect all the MCi_XXX settings
+ */
+static struct mce i_mce;
+
+/*
+ * Generate the sysfs store handler edac_inject_<reg>_store() which parses
+ * the written string as a hexadecimal number and saves it in i_mce.<reg>.
+ *
+ * Returns @count on success. If the string cannot be parsed, the handler
+ * returns -EINVAL and leaves i_mce.<reg> alone: 'value' is not initialized
+ * in that case and must not be stored.
+ */
+#define MCE_INJECT_STORE(reg)                                          \
+static ssize_t edac_inject_##reg##_store(struct kobject *kobj,         \
+                                        struct edac_mce_attr *attr,    \
+                                        const char *data, size_t count)\
+{                                                                      \
+       int ret = 0;                                                    \
+       unsigned long value;                                            \
+                                                                       \
+       ret = strict_strtoul(data, 16, &value);                         \
+       if (ret < 0) {                                                  \
+               printk(KERN_ERR "Error writing MCE " #reg " field.\n"); \
+               return -EINVAL;                                         \
+       }                                                               \
+                                                                       \
+       i_mce.reg = value;                                              \
+                                                                       \
+       return count;                                                   \
+}
+
+MCE_INJECT_STORE(status);
+MCE_INJECT_STORE(misc);
+MCE_INJECT_STORE(addr);
+
+/*
+ * Generate the sysfs show handler edac_inject_<reg>_show() which prints
+ * i_mce.<reg> as a 0x-prefixed, zero-padded 16-digit hexadecimal number.
+ */
+#define MCE_INJECT_SHOW(reg)                                           \
+static ssize_t edac_inject_##reg##_show(struct kobject *kobj,          \
+                                       struct edac_mce_attr *attr,     \
+                                       char *buf)                      \
+{                                                                      \
+       return sprintf(buf, "0x%016llx\n", i_mce.reg);                  \
+}
+
+MCE_INJECT_SHOW(status);
+MCE_INJECT_SHOW(misc);
+MCE_INJECT_SHOW(addr);
+
+EDAC_MCE_ATTR(status, 0644, edac_inject_status_show, edac_inject_status_store);
+EDAC_MCE_ATTR(misc, 0644, edac_inject_misc_show, edac_inject_misc_store);
+EDAC_MCE_ATTR(addr, 0644, edac_inject_addr_show, edac_inject_addr_store);
+
+/*
+ * Writing a (decimal) bank number selects the bank to inject into and, at
+ * the same time, triggers the injection: the MCE collected in i_mce is run
+ * through amd_decode_mce(). Only banks 0-5 are accepted, everything else
+ * yields -EINVAL.
+ */
+static ssize_t edac_inject_bank_store(struct kobject *kobj,
+                                     struct edac_mce_attr *attr,
+                                     const char *data, size_t count)
+{
+       unsigned long bank;
+
+       if (strict_strtoul(data, 10, &bank) < 0) {
+               printk(KERN_ERR "Invalid bank value!\n");
+               return -EINVAL;
+       }
+
+       if (bank > 5) {
+               printk(KERN_ERR "Non-existant MCE bank: %lu\n", bank);
+               return -EINVAL;
+       }
+
+       i_mce.bank = bank;
+       amd_decode_mce(NULL, 0, &i_mce);
+
+       return count;
+}
+
+/* Print the bank number currently stored in i_mce, in decimal. */
+static ssize_t edac_inject_bank_show(struct kobject *kobj,
+                                    struct edac_mce_attr *attr, char *buf)
+{
+       return sprintf(buf, "%d\n", i_mce.bank);
+}
+
+EDAC_MCE_ATTR(bank, 0644, edac_inject_bank_show, edac_inject_bank_store);
+
+/* all attribute files created below the "mce" kobject at module init */
+static struct edac_mce_attr *sysfs_attrs[] = { &mce_attr_status, &mce_attr_misc,
+                                              &mce_attr_addr, &mce_attr_bank
+};
+
+/*
+ * Module init: create the "mce" kobject below the edac sysdev class and
+ * populate it with the attribute files listed in sysfs_attrs[].
+ *
+ * Returns 0 on success, -EINVAL if the edac class is not available, -ENOMEM
+ * if the kobject cannot be created, or the sysfs_create_file() error.
+ * Everything set up so far is torn down again on failure.
+ */
+static int __init edac_init_mce_inject(void)
+{
+       struct sysdev_class *edac_class = NULL;
+       int i, err = 0;
+
+       edac_class = edac_get_sysfs_class();
+       if (!edac_class)
+               return -EINVAL;
+
+       mce_kobj = kobject_create_and_add("mce", &edac_class->kset.kobj);
+       if (!mce_kobj) {
+               printk(KERN_ERR "Error creating a mce kset.\n");
+               err = -ENOMEM;
+               goto err_mce_kobj;
+       }
+
+       for (i = 0; i < ARRAY_SIZE(sysfs_attrs); i++) {
+               err = sysfs_create_file(mce_kobj, &sysfs_attrs[i]->attr);
+               if (err) {
+                       printk(KERN_ERR "Error creating %s in sysfs.\n",
+                                       sysfs_attrs[i]->attr.name);
+                       goto err_sysfs_create;
+               }
+       }
+       return 0;
+
+err_sysfs_create:
+       /*
+        * File i is the one which failed, so only files 0..i-1 exist.
+        * Pre-decrement so that the loop stops after index 0 instead of
+        * going on to touch sysfs_attrs[-1].
+        */
+       while (--i >= 0)
+               sysfs_remove_file(mce_kobj, &sysfs_attrs[i]->attr);
+
+       /*
+        * Drop the reference obtained from kobject_create_and_add(); this
+        * also removes the kobject from sysfs and frees it.
+        */
+       kobject_put(mce_kobj);
+
+err_mce_kobj:
+       edac_put_sysfs_class();
+
+       return err;
+}
+
+/*
+ * Module exit: remove the attribute files and the "mce" kobject and drop
+ * the reference on the edac sysdev class.
+ */
+static void __exit edac_exit_mce_inject(void)
+{
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(sysfs_attrs); i++)
+               sysfs_remove_file(mce_kobj, &sysfs_attrs[i]->attr);
+
+       /*
+        * A kobject from kobject_create_and_add() has to be released with
+        * kobject_put(): that removes it from sysfs and frees it, whereas
+        * kobject_del() alone would leak the structure.
+        */
+       kobject_put(mce_kobj);
+
+       edac_put_sysfs_class();
+}
+
+module_init(edac_init_mce_inject);
+module_exit(edac_exit_mce_inject);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Borislav Petkov <borislav.petkov@amd.com>");
+MODULE_AUTHOR("AMD Inc.");
+MODULE_DESCRIPTION("MCE injection facility for testing MCE decoding");
diff --git a/drivers/firmware/Kconfig b/drivers/firmware/Kconfig

index 280c9b5..88a3ae6 100644 (file)
--- a/drivers/firmware/Kconfig
+++ b/drivers/firmware/Kconfig
@@ -125,7 +125,7 @@ config ISCSI_IBFT_FIND
  config ISCSI_IBFT
         tristate "iSCSI Boot Firmware Table Attributes module"
         select ISCSI_BOOT_SYSFS
-       depends on ISCSI_IBFT_FIND && SCSI
+       depends on ISCSI_IBFT_FIND && SCSI && SCSI_LOWLEVEL
         default n
         help
           This option enables support for detection and exposing of iSCSI
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c

index c37ef64..cb3ccf3 100644 (file)
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -59,18 +59,11 @@
  #include <linux/hrtimer.h>     /* ktime_get_real() */
  #include <trace/events/power.h>
  #include <linux/sched.h>
+#include <asm/mwait.h>
  
  #define INTEL_IDLE_VERSION "0.4"
  #define PREFIX "intel_idle: "
  
-#define MWAIT_SUBSTATE_MASK    (0xf)
-#define MWAIT_CSTATE_MASK      (0xf)
-#define MWAIT_SUBSTATE_SIZE    (4)
-#define MWAIT_MAX_NUM_CSTATES  8
-#define CPUID_MWAIT_LEAF (5)
-#define CPUID5_ECX_EXTENSIONS_SUPPORTED (0x1)
-#define CPUID5_ECX_INTERRUPT_BREAK     (0x2)
-
  static struct cpuidle_driver intel_idle_driver = {
         .name = "intel_idle",
         .owner = THIS_MODULE,
diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c

index 9ddafc3..af9ee31 100644 (file)
--- a/drivers/input/evdev.c
+++ b/drivers/input/evdev.c
@@ -28,7 +28,7 @@ struct evdev {
         int minor;
         struct input_handle handle;
         wait_queue_head_t wait;
-       struct evdev_client *grab;
+       struct evdev_client __rcu *grab;
         struct list_head client_list;
         spinlock_t client_lock; /* protects client_list */
         struct mutex mutex;
diff --git a/drivers/input/misc/hp_sdc_rtc.c b/drivers/input/misc/hp_sdc_rtc.c

index c190664..7e2c12a 100644 (file)
--- a/drivers/input/misc/hp_sdc_rtc.c
+++ b/drivers/input/misc/hp_sdc_rtc.c
@@ -104,7 +104,7 @@ static int hp_sdc_rtc_do_read_bbrtc (struct rtc_time *rtctm)
         t.endidx =              91;
         t.seq =                 tseq;
         t.act.semaphore =       &tsem;
-       init_MUTEX_LOCKED(&tsem);
+       sema_init(&tsem, 0);
         
         if (hp_sdc_enqueue_transaction(&t)) return -1;
         
@@ -698,7 +698,7 @@ static int __init hp_sdc_rtc_init(void)
                 return -ENODEV;
  #endif
  
-       init_MUTEX(&i8042tregs);
+       sema_init(&i8042tregs, 1);
  
         if ((ret = hp_sdc_request_timer_irq(&hp_sdc_rtc_isr)))
                 return ret;
diff --git a/drivers/input/serio/hil_mlc.c b/drivers/input/serio/hil_mlc.c

index c92f4ed..e5624d8 100644 (file)
--- a/drivers/input/serio/hil_mlc.c
+++ b/drivers/input/serio/hil_mlc.c
@@ -915,15 +915,15 @@ int hil_mlc_register(hil_mlc *mlc)
         mlc->ostarted = 0;
  
         rwlock_init(&mlc->lock);
-       init_MUTEX(&mlc->osem);
+       sema_init(&mlc->osem, 1);
  
-       init_MUTEX(&mlc->isem);
+       sema_init(&mlc->isem, 1);
         mlc->icount = -1;
         mlc->imatch = 0;
  
         mlc->opercnt = 0;
  
-       init_MUTEX_LOCKED(&(mlc->csem));
+       sema_init(&(mlc->csem), 0);
  
         hil_mlc_clear_di_scratch(mlc);
         hil_mlc_clear_di_map(mlc, 0);
diff --git a/drivers/input/serio/hp_sdc.c b/drivers/input/serio/hp_sdc.c

index bcc2d30..8c0b51c 100644 (file)
--- a/drivers/input/serio/hp_sdc.c
+++ b/drivers/input/serio/hp_sdc.c
@@ -905,7 +905,7 @@ static int __init hp_sdc_init(void)
         ts_sync[1]      = 0x0f;
         ts_sync[2] = ts_sync[3] = ts_sync[4] = ts_sync[5] = 0;
         t_sync.act.semaphore = &s_sync;
-       init_MUTEX_LOCKED(&s_sync);
+       sema_init(&s_sync, 0);
         hp_sdc_enqueue_transaction(&t_sync);
         down(&s_sync); /* Wait for t_sync to complete */
  
@@ -1039,7 +1039,7 @@ static int __init hp_sdc_register(void)
                 return hp_sdc.dev_err;
         }
  
-       init_MUTEX_LOCKED(&tq_init_sem);
+       sema_init(&tq_init_sem, 0);
  
         tq_init.actidx          = 0;
         tq_init.idx             = 1;
diff --git a/drivers/isdn/act2000/act2000.h b/drivers/isdn/act2000/act2000.h

index d4c5051..88c9423 100644 (file)
--- a/drivers/isdn/act2000/act2000.h
+++ b/drivers/isdn/act2000/act2000.h
@@ -141,9 +141,9 @@ typedef struct irq_data_isa {
         __u8            rcvhdr[8];
  } irq_data_isa;
  
-typedef union irq_data {
+typedef union act2000_irq_data {
         irq_data_isa isa;
-} irq_data;
+} act2000_irq_data;
  
  /*
   * Per card driver data
@@ -176,7 +176,7 @@ typedef struct act2000_card {
         char   *status_buf_read;
         char   *status_buf_write;
         char   *status_buf_end;
-       irq_data idat;                  /* Data used for IRQ handler        */
+       act2000_irq_data idat;          /* Data used for IRQ handler        */
         isdn_if interface;              /* Interface to upper layer         */
         char regname[35];               /* Name used for request_region     */
  } act2000_card;
diff --git a/drivers/isdn/hisax/config.c b/drivers/isdn/hisax/config.c

index 6f9afcd..b133378 100644 (file)
--- a/drivers/isdn/hisax/config.c
+++ b/drivers/isdn/hisax/config.c
@@ -801,6 +801,16 @@ static void closecard(int cardnr)
         ll_unload(csta);
  }
  
+static irqreturn_t card_irq(int intno, void *dev_id)
+{
+       struct IsdnCardState *cs = dev_id;
+       irqreturn_t ret = cs->irq_func(intno, cs);
+
+       if (ret == IRQ_HANDLED)
+               cs->irq_cnt++;
+       return ret;
+}
+
  static int init_card(struct IsdnCardState *cs)
  {
         int     irq_cnt, cnt = 3, ret;
@@ -809,10 +819,10 @@ static int init_card(struct IsdnCardState *cs)
                 ret = cs->cardmsg(cs, CARD_INIT, NULL);
                 return(ret);
         }
-       irq_cnt = kstat_irqs(cs->irq);
+       irq_cnt = cs->irq_cnt = 0;
         printk(KERN_INFO "%s: IRQ %d count %d\n", CardType[cs->typ],
                cs->irq, irq_cnt);
-       if (request_irq(cs->irq, cs->irq_func, cs->irq_flags, "HiSax", cs)) {
+       if (request_irq(cs->irq, card_irq, cs->irq_flags, "HiSax", cs)) {
                 printk(KERN_WARNING "HiSax: couldn't get interrupt %d\n",
                        cs->irq);
                 return 1;
@@ -822,8 +832,8 @@ static int init_card(struct IsdnCardState *cs)
                 /* Timeout 10ms */
                 msleep(10);
                 printk(KERN_INFO "%s: IRQ %d count %d\n",
-                      CardType[cs->typ], cs->irq, kstat_irqs(cs->irq));
-               if (kstat_irqs(cs->irq) == irq_cnt) {
+                      CardType[cs->typ], cs->irq, cs->irq_cnt);
+               if (cs->irq_cnt == irq_cnt) {
                         printk(KERN_WARNING
                                "%s: IRQ(%d) getting no interrupts during init %d\n",
                                CardType[cs->typ], cs->irq, 4 - cnt);
diff --git a/drivers/isdn/hisax/hisax.h b/drivers/isdn/hisax/hisax.h

index 832a878..32ab392 100644 (file)
--- a/drivers/isdn/hisax/hisax.h
+++ b/drivers/isdn/hisax/hisax.h
@@ -959,6 +959,7 @@ struct IsdnCardState {
         u_long          event;
         struct work_struct tqueue;
         struct timer_list dbusytimer;
+       unsigned int    irq_cnt;
  #ifdef ERROR_STATISTIC
         int             err_crc;
         int             err_tx;
diff --git a/drivers/macintosh/adb.c b/drivers/macintosh/adb.c

index 1c4ee6e..bf64e49 100644 (file)
--- a/drivers/macintosh/adb.c
+++ b/drivers/macintosh/adb.c
@@ -83,7 +83,7 @@ static struct adb_driver *adb_controller;
  BLOCKING_NOTIFIER_HEAD(adb_client_list);
  static int adb_got_sleep;
  static int adb_inited;
-static DECLARE_MUTEX(adb_probe_mutex);
+static DEFINE_SEMAPHORE(adb_probe_mutex);
  static int sleepy_trackpad;
  static int autopoll_devs;
  int __adb_probe_sync;
diff --git a/drivers/mfd/twl4030-irq.c b/drivers/mfd/twl4030-irq.c

index 097f24d..b9fda70 100644 (file)
--- a/drivers/mfd/twl4030-irq.c
+++ b/drivers/mfd/twl4030-irq.c
@@ -78,7 +78,7 @@ struct sih {
         u8      irq_lines;              /* number of supported irq lines */
  
         /* SIR ignored -- set interrupt, for testing only */
-       struct irq_data {
+       struct sih_irq_data {
                 u8      isr_offset;
                 u8      imr_offset;
         } mask[2];
@@ -810,7 +810,7 @@ int twl4030_init_irq(int irq_num, unsigned irq_base, unsigned irq_end)
         twl4030_irq_chip = dummy_irq_chip;
         twl4030_irq_chip.name = "twl4030";
  
-       twl4030_sih_irq_chip.ack = dummy_irq_chip.ack;
+       twl4030_sih_irq_chip.irq_ack = dummy_irq_chip.irq_ack;
  
         for (i = irq_base; i < irq_end; i++) {
                 set_irq_chip_and_handler(i, &twl4030_irq_chip,
diff --git a/drivers/net/3c527.c b/drivers/net/3c527.c

index 70705d1..eca55c5 100644 (file)
--- a/drivers/net/3c527.c
+++ b/drivers/net/3c527.c
@@ -522,7 +522,7 @@ static int __init mc32_probe1(struct net_device *dev, int slot)
         lp->tx_len              = lp->exec_box->data[9];   /* Transmit list count */
         lp->rx_len              = lp->exec_box->data[11];  /* Receive list count */
  
-       init_MUTEX_LOCKED(&lp->cmd_mutex);
+       sema_init(&lp->cmd_mutex, 0);
         init_completion(&lp->execution_cmd);
         init_completion(&lp->xceiver_cmd);
  
diff --git a/drivers/net/hamradio/6pack.c b/drivers/net/hamradio/6pack.c

index 4b52c76..3e5d0b6 100644 (file)
--- a/drivers/net/hamradio/6pack.c
+++ b/drivers/net/hamradio/6pack.c
@@ -608,7 +608,7 @@ static int sixpack_open(struct tty_struct *tty)
  
         spin_lock_init(&sp->lock);
         atomic_set(&sp->refcnt, 1);
-       init_MUTEX_LOCKED(&sp->dead_sem);
+       sema_init(&sp->dead_sem, 0);
  
         /* !!! length of the buffers. MTU is IP MTU, not PACLEN!  */
  
diff --git a/drivers/net/hamradio/mkiss.c b/drivers/net/hamradio/mkiss.c

index 66e88bd..4c62839 100644 (file)
--- a/drivers/net/hamradio/mkiss.c
+++ b/drivers/net/hamradio/mkiss.c
@@ -747,7 +747,7 @@ static int mkiss_open(struct tty_struct *tty)
  
         spin_lock_init(&ax->buflock);
         atomic_set(&ax->refcnt, 1);
-       init_MUTEX_LOCKED(&ax->dead_sem);
+       sema_init(&ax->dead_sem, 0);
  
         ax->tty = tty;
         tty->disc_data = ax;
diff --git a/drivers/net/irda/sir_dev.c b/drivers/net/irda/sir_dev.c

index 1b051da..51d7444 100644 (file)
--- a/drivers/net/irda/sir_dev.c
+++ b/drivers/net/irda/sir_dev.c
@@ -909,7 +909,7 @@ struct sir_dev * sirdev_get_instance(const struct sir_driver *drv, const char *n
         dev->tx_skb = NULL;
  
         spin_lock_init(&dev->tx_lock);
-       init_MUTEX(&dev->fsm.sem);
+       sema_init(&dev->fsm.sem, 1);
  
         dev->drv = drv;
         dev->netdev = ndev;
diff --git a/drivers/net/ppp_async.c b/drivers/net/ppp_async.c

index af50a53..78d70a6 100644 (file)
--- a/drivers/net/ppp_async.c
+++ b/drivers/net/ppp_async.c
@@ -184,7 +184,7 @@ ppp_asynctty_open(struct tty_struct *tty)
         tasklet_init(&ap->tsk, ppp_async_process, (unsigned long) ap);
  
         atomic_set(&ap->refcnt, 1);
-       init_MUTEX_LOCKED(&ap->dead_sem);
+       sema_init(&ap->dead_sem, 0);
  
         ap->chan.private = ap;
         ap->chan.ops = &async_ops;
diff --git a/drivers/net/wan/cosa.c b/drivers/net/wan/cosa.c

index 04c6cd4..10bafd5 100644 (file)
--- a/drivers/net/wan/cosa.c
+++ b/drivers/net/wan/cosa.c
@@ -575,7 +575,7 @@ static int cosa_probe(int base, int irq, int dma)
  
                 /* Initialize the chardev data structures */
                 mutex_init(&chan->rlock);
-               init_MUTEX(&chan->wsem);
+               sema_init(&chan->wsem, 1);
  
                 /* Register the network interface */
                 if (!(chan->netdev = alloc_hdlcdev(chan))) {
diff --git a/drivers/parport/share.c b/drivers/parport/share.c

index dffa5d4..a2d9d1e 100644 (file)
--- a/drivers/parport/share.c
+++ b/drivers/parport/share.c
@@ -306,7 +306,7 @@ struct parport *parport_register_port(unsigned long base, int irq, int dma,
         spin_lock_init(&tmp->pardevice_lock);
         tmp->ieee1284.mode = IEEE1284_MODE_COMPAT;
         tmp->ieee1284.phase = IEEE1284_PH_FWD_IDLE;
-       init_MUTEX_LOCKED (&tmp->ieee1284.irq); /* actually a semaphore at 0 */
+       sema_init(&tmp->ieee1284.irq, 0);
         tmp->spintime = parport_default_spintime;
         atomic_set (&tmp->ref_count, 1);
         INIT_LIST_HEAD(&tmp->full_list);
diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c

index 0a19708..3de3a43 100644 (file)
--- a/drivers/pci/dmar.c
+++ b/drivers/pci/dmar.c
@@ -1221,9 +1221,9 @@ const char *dmar_get_fault_reason(u8 fault_reason, int *fault_type)
         }
  }
  
-void dmar_msi_unmask(unsigned int irq)
+void dmar_msi_unmask(struct irq_data *data)
  {
-       struct intel_iommu *iommu = get_irq_data(irq);
+       struct intel_iommu *iommu = irq_data_get_irq_data(data);
         unsigned long flag;
  
         /* unmask it */
@@ -1234,10 +1234,10 @@ void dmar_msi_unmask(unsigned int irq)
         spin_unlock_irqrestore(&iommu->register_lock, flag);
  }
  
-void dmar_msi_mask(unsigned int irq)
+void dmar_msi_mask(struct irq_data *data)
  {
         unsigned long flag;
-       struct intel_iommu *iommu = get_irq_data(irq);
+       struct intel_iommu *iommu = irq_data_get_irq_data(data);
  
         /* mask it */
         spin_lock_irqsave(&iommu->register_lock, flag);
diff --git a/drivers/pci/htirq.c b/drivers/pci/htirq.c

index 98abf8b..834842a 100644 (file)
--- a/drivers/pci/htirq.c
+++ b/drivers/pci/htirq.c
@@ -57,28 +57,22 @@ void fetch_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg)
         *msg = cfg->msg;
  }
  
-void mask_ht_irq(unsigned int irq)
+void mask_ht_irq(struct irq_data *data)
  {
-       struct ht_irq_cfg *cfg;
-       struct ht_irq_msg msg;
-
-       cfg = get_irq_data(irq);
+       struct ht_irq_cfg *cfg = irq_data_get_irq_data(data);
+       struct ht_irq_msg msg = cfg->msg;
  
-       msg = cfg->msg;
         msg.address_lo |= 1;
-       write_ht_irq_msg(irq, &msg);
+       write_ht_irq_msg(data->irq, &msg);
  }
  
-void unmask_ht_irq(unsigned int irq)
+void unmask_ht_irq(struct irq_data *data)
  {
-       struct ht_irq_cfg *cfg;
-       struct ht_irq_msg msg;
-
-       cfg = get_irq_data(irq);
+       struct ht_irq_cfg *cfg = irq_data_get_irq_data(data);
+       struct ht_irq_msg msg = cfg->msg;
  
-       msg = cfg->msg;
         msg.address_lo &= ~1;
-       write_ht_irq_msg(irq, &msg);
+       write_ht_irq_msg(data->irq, &msg);
  }
  
  /**
diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c

index fd1d286..ec87cd6 100644 (file)
--- a/drivers/pci/intr_remapping.c
+++ b/drivers/pci/intr_remapping.c
@@ -46,109 +46,24 @@ static __init int setup_intremap(char *str)
  }
  early_param("intremap", setup_intremap);
  
-struct irq_2_iommu {
-       struct intel_iommu *iommu;
-       u16 irte_index;
-       u16 sub_handle;
-       u8  irte_mask;
-};
-
-#ifdef CONFIG_GENERIC_HARDIRQS
-static struct irq_2_iommu *get_one_free_irq_2_iommu(int node)
-{
-       struct irq_2_iommu *iommu;
-
-       iommu = kzalloc_node(sizeof(*iommu), GFP_ATOMIC, node);
-       printk(KERN_DEBUG "alloc irq_2_iommu on node %d\n", node);
-
-       return iommu;
-}
-
-static struct irq_2_iommu *irq_2_iommu(unsigned int irq)
-{
-       struct irq_desc *desc;
-
-       desc = irq_to_desc(irq);
-
-       if (WARN_ON_ONCE(!desc))
-               return NULL;
-
-       return desc->irq_2_iommu;
-}
-
-static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq)
-{
-       struct irq_desc *desc;
-       struct irq_2_iommu *irq_iommu;
-
-       desc = irq_to_desc(irq);
-       if (!desc) {
-               printk(KERN_INFO "can not get irq_desc for %d\n", irq);
-               return NULL;
-       }
-
-       irq_iommu = desc->irq_2_iommu;
-
-       if (!irq_iommu)
-               desc->irq_2_iommu = get_one_free_irq_2_iommu(irq_node(irq));
-
-       return desc->irq_2_iommu;
-}
-
-#else /* !CONFIG_SPARSE_IRQ */
-
-static struct irq_2_iommu irq_2_iommuX[NR_IRQS];
-
-static struct irq_2_iommu *irq_2_iommu(unsigned int irq)
-{
-       if (irq < nr_irqs)
-               return &irq_2_iommuX[irq];
-
-       return NULL;
-}
-static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq)
-{
-       return irq_2_iommu(irq);
-}
-#endif
-
  static DEFINE_SPINLOCK(irq_2_ir_lock);
  
-static struct irq_2_iommu *valid_irq_2_iommu(unsigned int irq)
-{
-       struct irq_2_iommu *irq_iommu;
-
-       irq_iommu = irq_2_iommu(irq);
-
-       if (!irq_iommu)
-               return NULL;
-
-       if (!irq_iommu->iommu)
-               return NULL;
-
-       return irq_iommu;
-}
-
-int irq_remapped(int irq)
+static struct irq_2_iommu *irq_2_iommu(unsigned int irq)
  {
-       return valid_irq_2_iommu(irq) != NULL;
+       struct irq_cfg *cfg = get_irq_chip_data(irq);
+       return cfg ? &cfg->irq_2_iommu : NULL;
  }
  
  int get_irte(int irq, struct irte *entry)
  {
-       int index;
-       struct irq_2_iommu *irq_iommu;
+       struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
         unsigned long flags;
+       int index;
  
-       if (!entry)
+       if (!entry || !irq_iommu)
                 return -1;
  
         spin_lock_irqsave(&irq_2_ir_lock, flags);
-       irq_iommu = valid_irq_2_iommu(irq);
-       if (!irq_iommu) {
-               spin_unlock_irqrestore(&irq_2_ir_lock, flags);
-               return -1;
-       }
  
         index = irq_iommu->irte_index + irq_iommu->sub_handle;
         *entry = *(irq_iommu->iommu->ir_table->base + index);
@@ -160,20 +75,14 @@ int get_irte(int irq, struct irte *entry)
  int alloc_irte(struct intel_iommu *iommu, int irq, u16 count)
  {
         struct ir_table *table = iommu->ir_table;
-       struct irq_2_iommu *irq_iommu;
+       struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
         u16 index, start_index;
         unsigned int mask = 0;
         unsigned long flags;
         int i;
  
-       if (!count)
-               return -1;
-
-#ifndef CONFIG_SPARSE_IRQ
-       /* protect irq_2_iommu_alloc later */
-       if (irq >= nr_irqs)
+       if (!count || !irq_iommu)
                 return -1;
-#endif
  
         /*
          * start the IRTE search from index 0.
@@ -214,13 +123,6 @@ int alloc_irte(struct intel_iommu *iommu, int irq, u16 count)
         for (i = index; i < index + count; i++)
                 table->base[i].present = 1;
  
-       irq_iommu = irq_2_iommu_alloc(irq);
-       if (!irq_iommu) {
-               spin_unlock_irqrestore(&irq_2_ir_lock, flags);
-               printk(KERN_ERR "can't allocate irq_2_iommu\n");
-               return -1;
-       }
-
         irq_iommu->iommu = iommu;
         irq_iommu->irte_index =  index;
         irq_iommu->sub_handle = 0;
@@ -244,17 +146,14 @@ static int qi_flush_iec(struct intel_iommu *iommu, int index, int mask)
  
  int map_irq_to_irte_handle(int irq, u16 *sub_handle)
  {
-       int index;
-       struct irq_2_iommu *irq_iommu;
+       struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
         unsigned long flags;
+       int index;
  
-       spin_lock_irqsave(&irq_2_ir_lock, flags);
-       irq_iommu = valid_irq_2_iommu(irq);
-       if (!irq_iommu) {
-               spin_unlock_irqrestore(&irq_2_ir_lock, flags);
+       if (!irq_iommu)
                 return -1;
-       }
  
+       spin_lock_irqsave(&irq_2_ir_lock, flags);
         *sub_handle = irq_iommu->sub_handle;
         index = irq_iommu->irte_index;
         spin_unlock_irqrestore(&irq_2_ir_lock, flags);
@@ -263,18 +162,13 @@ int map_irq_to_irte_handle(int irq, u16 *sub_handle)
  
  int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index, u16 subhandle)
  {
-       struct irq_2_iommu *irq_iommu;
+       struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
         unsigned long flags;
  
-       spin_lock_irqsave(&irq_2_ir_lock, flags);
-
-       irq_iommu = irq_2_iommu_alloc(irq);
-
-       if (!irq_iommu) {
-               spin_unlock_irqrestore(&irq_2_ir_lock, flags);
-               printk(KERN_ERR "can't allocate irq_2_iommu\n");
+       if (!irq_iommu)
                 return -1;
-       }
+
+       spin_lock_irqsave(&irq_2_ir_lock, flags);
  
         irq_iommu->iommu = iommu;
         irq_iommu->irte_index = index;
@@ -286,43 +180,18 @@ int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index, u16 subhandle)
         return 0;
  }
  
-int clear_irte_irq(int irq, struct intel_iommu *iommu, u16 index)
-{
-       struct irq_2_iommu *irq_iommu;
-       unsigned long flags;
-
-       spin_lock_irqsave(&irq_2_ir_lock, flags);
-       irq_iommu = valid_irq_2_iommu(irq);
-       if (!irq_iommu) {
-               spin_unlock_irqrestore(&irq_2_ir_lock, flags);
-               return -1;
-       }
-
-       irq_iommu->iommu = NULL;
-       irq_iommu->irte_index = 0;
-       irq_iommu->sub_handle = 0;
-       irq_2_iommu(irq)->irte_mask = 0;
-
-       spin_unlock_irqrestore(&irq_2_ir_lock, flags);
-
-       return 0;
-}
-
  int modify_irte(int irq, struct irte *irte_modified)
  {
-       int rc;
-       int index;
-       struct irte *irte;
+       struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
         struct intel_iommu *iommu;
-       struct irq_2_iommu *irq_iommu;
         unsigned long flags;
+       struct irte *irte;
+       int rc, index;
  
-       spin_lock_irqsave(&irq_2_ir_lock, flags);
-       irq_iommu = valid_irq_2_iommu(irq);
-       if (!irq_iommu) {
-               spin_unlock_irqrestore(&irq_2_ir_lock, flags);
+       if (!irq_iommu)
                 return -1;
-       }
+
+       spin_lock_irqsave(&irq_2_ir_lock, flags);
  
         iommu = irq_iommu->iommu;
  
@@ -339,31 +208,6 @@ int modify_irte(int irq, struct irte *irte_modified)
         return rc;
  }
  
-int flush_irte(int irq)
-{
-       int rc;
-       int index;
-       struct intel_iommu *iommu;
-       struct irq_2_iommu *irq_iommu;
-       unsigned long flags;
-
-       spin_lock_irqsave(&irq_2_ir_lock, flags);
-       irq_iommu = valid_irq_2_iommu(irq);
-       if (!irq_iommu) {
-               spin_unlock_irqrestore(&irq_2_ir_lock, flags);
-               return -1;
-       }
-
-       iommu = irq_iommu->iommu;
-
-       index = irq_iommu->irte_index + irq_iommu->sub_handle;
-
-       rc = qi_flush_iec(iommu, index, irq_iommu->irte_mask);
-       spin_unlock_irqrestore(&irq_2_ir_lock, flags);
-
-       return rc;
-}
-
  struct intel_iommu *map_hpet_to_ir(u8 hpet_id)
  {
         int i;
@@ -420,16 +264,14 @@ static int clear_entries(struct irq_2_iommu *irq_iommu)
  
  int free_irte(int irq)
  {
-       int rc = 0;
-       struct irq_2_iommu *irq_iommu;
+       struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
         unsigned long flags;
+       int rc;
  
-       spin_lock_irqsave(&irq_2_ir_lock, flags);
-       irq_iommu = valid_irq_2_iommu(irq);
-       if (!irq_iommu) {
-               spin_unlock_irqrestore(&irq_2_ir_lock, flags);
+       if (!irq_iommu)
                 return -1;
-       }
+
+       spin_lock_irqsave(&irq_2_ir_lock, flags);
  
         rc = clear_entries(irq_iommu);
  
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c

index 69b7be3..5fcf5ae 100644 (file)
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -170,33 +170,31 @@ static void msix_mask_irq(struct msi_desc *desc, u32 flag)
         desc->masked = __msix_mask_irq(desc, flag);
  }
  
-static void msi_set_mask_bit(unsigned irq, u32 flag)
+static void msi_set_mask_bit(struct irq_data *data, u32 flag)
  {
-       struct msi_desc *desc = get_irq_msi(irq);
+       struct msi_desc *desc = irq_data_get_msi(data);
  
         if (desc->msi_attrib.is_msix) {
                 msix_mask_irq(desc, flag);
                 readl(desc->mask_base);         /* Flush write to device */
         } else {
-               unsigned offset = irq - desc->dev->irq;
+               unsigned offset = data->irq - desc->dev->irq;
                 msi_mask_irq(desc, 1 << offset, flag << offset);
         }
  }
  
-void mask_msi_irq(unsigned int irq)
+void mask_msi_irq(struct irq_data *data)
  {
-       msi_set_mask_bit(irq, 1);
+       msi_set_mask_bit(data, 1);
  }
  
-void unmask_msi_irq(unsigned int irq)
+void unmask_msi_irq(struct irq_data *data)
  {
-       msi_set_mask_bit(irq, 0);
+       msi_set_mask_bit(data, 0);
  }
  
-void read_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg)
+void __read_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
  {
-       struct msi_desc *entry = get_irq_desc_msi(desc);
-
         BUG_ON(entry->dev->current_state != PCI_D0);
  
         if (entry->msi_attrib.is_msix) {
@@ -227,15 +225,13 @@ void read_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg)
  
  void read_msi_msg(unsigned int irq, struct msi_msg *msg)
  {
-       struct irq_desc *desc = irq_to_desc(irq);
+       struct msi_desc *entry = get_irq_msi(irq);
  
-       read_msi_msg_desc(desc, msg);
+       __read_msi_msg(entry, msg);
  }
  
-void get_cached_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg)
+void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
  {
-       struct msi_desc *entry = get_irq_desc_msi(desc);
-
         /* Assert that the cache is valid, assuming that
          * valid messages are not all-zeroes. */
         BUG_ON(!(entry->msg.address_hi | entry->msg.address_lo |
@@ -246,15 +242,13 @@ void get_cached_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg)
  
  void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg)
  {
-       struct irq_desc *desc = irq_to_desc(irq);
+       struct msi_desc *entry = get_irq_msi(irq);
  
-       get_cached_msi_msg_desc(desc, msg);
+       __get_cached_msi_msg(entry, msg);
  }
  
-void write_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg)
+void __write_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
  {
-       struct msi_desc *entry = get_irq_desc_msi(desc);
-
         if (entry->dev->current_state != PCI_D0) {
                 /* Don't touch the hardware now */
         } else if (entry->msi_attrib.is_msix) {
@@ -292,9 +286,9 @@ void write_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg)
  
  void write_msi_msg(unsigned int irq, struct msi_msg *msg)
  {
-       struct irq_desc *desc = irq_to_desc(irq);
+       struct msi_desc *entry = get_irq_msi(irq);
  
-       write_msi_msg_desc(desc, msg);
+       __write_msi_msg(entry, msg);
  }
  
  static void free_msi_irqs(struct pci_dev *dev)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c

index 7c80082..17927b1 100644 (file)
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -127,7 +127,10 @@ static void handle_tx(struct vhost_net *net)
         size_t len, total_len = 0;
         int err, wmem;
         size_t hdr_size;
-       struct socket *sock = rcu_dereference(vq->private_data);
+       struct socket *sock;
+
+       sock = rcu_dereference_check(vq->private_data,
+                                    lockdep_is_held(&vq->mutex));
         if (!sock)
                 return;
  
@@ -582,7 +585,10 @@ static void vhost_net_disable_vq(struct vhost_net *n,
  static void vhost_net_enable_vq(struct vhost_net *n,
                                 struct vhost_virtqueue *vq)
  {
-       struct socket *sock = vq->private_data;
+       struct socket *sock;
+
+       sock = rcu_dereference_protected(vq->private_data,
+                                        lockdep_is_held(&vq->mutex));
         if (!sock)
                 return;
         if (vq == n->vqs + VHOST_NET_VQ_TX) {
@@ -598,7 +604,8 @@ static struct socket *vhost_net_stop_vq(struct vhost_net *n,
         struct socket *sock;
  
         mutex_lock(&vq->mutex);
-       sock = vq->private_data;
+       sock = rcu_dereference_protected(vq->private_data,
+                                        lockdep_is_held(&vq->mutex));
         vhost_net_disable_vq(n, vq);
         rcu_assign_pointer(vq->private_data, NULL);
         mutex_unlock(&vq->mutex);
@@ -736,7 +743,8 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
         }
  
         /* start polling new socket */
-       oldsock = vq->private_data;
+       oldsock = rcu_dereference_protected(vq->private_data,
+                                           lockdep_is_held(&vq->mutex));
         if (sock != oldsock) {
                  vhost_net_disable_vq(n, vq);
                  rcu_assign_pointer(vq->private_data, sock);
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c

index dd3d6f7..8b5a1b3 100644 (file)
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -320,7 +320,7 @@ long vhost_dev_reset_owner(struct vhost_dev *dev)
         vhost_dev_cleanup(dev);
  
         memory->nregions = 0;
-       dev->memory = memory;
+       RCU_INIT_POINTER(dev->memory, memory);
         return 0;
  }
  
@@ -352,8 +352,9 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
                 fput(dev->log_file);
         dev->log_file = NULL;
         /* No one will access memory at this point */
-       kfree(dev->memory);
-       dev->memory = NULL;
+       kfree(rcu_dereference_protected(dev->memory,
+                                       lockdep_is_held(&dev->mutex)));
+       RCU_INIT_POINTER(dev->memory, NULL);
         if (dev->mm)
                 mmput(dev->mm);
         dev->mm = NULL;
@@ -440,14 +441,22 @@ static int vq_access_ok(unsigned int num,
  /* Caller should have device mutex but not vq mutex */
  int vhost_log_access_ok(struct vhost_dev *dev)
  {
-       return memory_access_ok(dev, dev->memory, 1);
+       struct vhost_memory *mp;
+
+       mp = rcu_dereference_protected(dev->memory,
+                                      lockdep_is_held(&dev->mutex));
+       return memory_access_ok(dev, mp, 1);
  }
  
  /* Verify access for write logging. */
  /* Caller should have vq mutex and device mutex */
  static int vq_log_access_ok(struct vhost_virtqueue *vq, void __user *log_base)
  {
-       return vq_memory_access_ok(log_base, vq->dev->memory,
+       struct vhost_memory *mp;
+
+       mp = rcu_dereference_protected(vq->dev->memory,
+                                      lockdep_is_held(&vq->mutex));
+       return vq_memory_access_ok(log_base, mp,
                             vhost_has_feature(vq->dev, VHOST_F_LOG_ALL)) &&
                 (!vq->log_used || log_access_ok(log_base, vq->log_addr,
                                         sizeof *vq->used +
@@ -487,7 +496,8 @@ static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
                 kfree(newmem);
                 return -EFAULT;
         }
-       oldmem = d->memory;
+       oldmem = rcu_dereference_protected(d->memory,
+                                          lockdep_is_held(&d->mutex));
         rcu_assign_pointer(d->memory, newmem);
         synchronize_rcu();
         kfree(oldmem);
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h

index afd7729..af3c11d 100644 (file)
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -106,7 +106,7 @@ struct vhost_virtqueue {
          * vhost_work execution acts instead of rcu_read_lock() and the end of
          * vhost_work execution acts instead of rcu_read_lock().
          * Writers use virtqueue mutex. */
-       void *private_data;
+       void __rcu *private_data;
         /* Log write descriptors */
         void __user *log_base;
         struct vhost_log log[VHOST_NET_MAX_SG];
@@ -116,7 +116,7 @@ struct vhost_dev {
         /* Readers use RCU to access memory table pointer
          * log base pointer and features.
          * Writers use mutex below.*/
-       struct vhost_memory *memory;
+       struct vhost_memory __rcu *memory;
         struct mm_struct *mm;
         struct mutex mutex;
         unsigned acked_features;
@@ -173,7 +173,11 @@ enum {
  
  static inline int vhost_has_feature(struct vhost_dev *dev, int bit)
  {
-       unsigned acked_features = rcu_dereference(dev->acked_features);
+       unsigned acked_features;
+
+       acked_features =
+               rcu_dereference_index_check(dev->acked_features,
+                                           lockdep_is_held(&dev->mutex));
         return acked_features & (1 << bit);
  }
  
diff --git a/drivers/xen/events.c b/drivers/xen/events.c

index 13365ba..7d24b0d 100644 (file)
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -338,30 +338,29 @@ static void unmask_evtchn(int port)
  
  static int find_unbound_irq(void)
  {
-       int irq;
-       struct irq_desc *desc;
+       struct irq_data *data;
+       int irq, res;
  
         for (irq = 0; irq < nr_irqs; irq++) {
-               desc = irq_to_desc(irq);
+               data = irq_get_irq_data(irq);
                 /* only 0->15 have init'd desc; handle irq > 16 */
-               if (desc == NULL)
+               if (!data)
                         break;
-               if (desc->chip == &no_irq_chip)
+               if (data->chip == &no_irq_chip)
                         break;
-               if (desc->chip != &xen_dynamic_chip)
+               if (data->chip != &xen_dynamic_chip)
                         continue;
                 if (irq_info[irq].type == IRQT_UNBOUND)
-                       break;
+                       return irq;
         }
  
         if (irq == nr_irqs)
                 panic("No available IRQ to bind to: increase nr_irqs!\n");
  
-       desc = irq_to_desc_alloc_node(irq, 0);
-       if (WARN_ON(desc == NULL))
-               return -1;
+       res = irq_alloc_desc_at(irq, 0);
  
-       dynamic_irq_init_keep_chip_data(irq);
+       if (WARN_ON(res != irq))
+               return -1;
  
         return irq;
  }
@@ -495,7 +494,7 @@ static void unbind_from_irq(unsigned int irq)
         if (irq_info[irq].type != IRQT_UNBOUND) {
                 irq_info[irq] = mk_unbound_info();
  
-               dynamic_irq_cleanup(irq);
+               irq_free_desc(irq);
         }
  
         spin_unlock(&irq_mapping_update_lock);
diff --git a/fs/affs/super.c b/fs/affs/super.c

index 33c4e7e..9581ea9 100644 (file)
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -109,8 +109,8 @@ static void init_once(void *foo)
  {
         struct affs_inode_info *ei = (struct affs_inode_info *) foo;
  
-       init_MUTEX(&ei->i_link_lock);
-       init_MUTEX(&ei->i_ext_lock);
+       sema_init(&ei->i_link_lock, 1);
+       sema_init(&ei->i_ext_lock, 1);
         inode_init_once(&ei->vfs_inode);
  }
  
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig

index 0fcd264..9eb134e 100644 (file)
--- a/fs/ceph/Kconfig
+++ b/fs/ceph/Kconfig
@@ -1,9 +1,11 @@
  config CEPH_FS
          tristate "Ceph distributed file system (EXPERIMENTAL)"
         depends on INET && EXPERIMENTAL
+       select CEPH_LIB
         select LIBCRC32C
         select CRYPTO_AES
         select CRYPTO
+       default n
         help
           Choose Y or M here to include support for mounting the
           experimental Ceph distributed file system.  Ceph is an extremely
@@ -14,15 +16,3 @@ config CEPH_FS
  
           If unsure, say N.
  
-config CEPH_FS_PRETTYDEBUG
-       bool "Include file:line in ceph debug output"
-       depends on CEPH_FS
-       default n
-       help
-         If you say Y here, debug output will include a filename and
-         line to aid debugging.  This icnreases kernel size and slows
-         execution slightly when debug call sites are enabled (e.g.,
-         via CONFIG_DYNAMIC_DEBUG).
-
-         If unsure, say N.
-
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile

index 278e117..9e6c4f2 100644 (file)
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -8,15 +8,8 @@ obj-$(CONFIG_CEPH_FS) += ceph.o
  
  ceph-objs := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
         export.o caps.o snap.o xattr.o \
-       messenger.o msgpool.o buffer.o pagelist.o \
-       mds_client.o mdsmap.o \
-       mon_client.o \
-       osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
-       debugfs.o \
-       auth.o auth_none.o \
-       crypto.o armor.o \
-       auth_x.o \
-       ceph_fs.o ceph_strings.o ceph_hash.o ceph_frag.o
+       mds_client.o mdsmap.o strings.o ceph_frag.o \
+       debugfs.o
  
  else
  #Otherwise we were called directly from the command
diff --git a/fs/ceph/README b/fs/ceph/README

deleted file mode 100644 (file)

index 18352fa..0000000
--- a/fs/ceph/README
+++ /dev/null
@@ -1,20 +0,0 @@
-#
-# The following files are shared by (and manually synchronized
-# between) the Ceph userland and kernel client.
-#
-# userland                  kernel
-src/include/ceph_fs.h      fs/ceph/ceph_fs.h
-src/include/ceph_fs.cc     fs/ceph/ceph_fs.c
-src/include/msgr.h         fs/ceph/msgr.h
-src/include/rados.h        fs/ceph/rados.h
-src/include/ceph_strings.cc fs/ceph/ceph_strings.c
-src/include/ceph_frag.h            fs/ceph/ceph_frag.h
-src/include/ceph_frag.cc    fs/ceph/ceph_frag.c
-src/include/ceph_hash.h            fs/ceph/ceph_hash.h
-src/include/ceph_hash.cc    fs/ceph/ceph_hash.c
-src/crush/crush.c          fs/ceph/crush/crush.c
-src/crush/crush.h          fs/ceph/crush/crush.h
-src/crush/mapper.c         fs/ceph/crush/mapper.c
-src/crush/mapper.h         fs/ceph/crush/mapper.h
-src/crush/hash.h           fs/ceph/crush/hash.h
-src/crush/hash.c           fs/ceph/crush/hash.c
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c

index efbc604..51bcc5c 100644 (file)
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
  
  #include <linux/backing-dev.h>
  #include <linux/fs.h>
@@ -10,7 +10,8 @@
  #include <linux/task_io_accounting_ops.h>
  
  #include "super.h"
-#include "osd_client.h"
+#include "mds_client.h"
+#include <linux/ceph/osd_client.h>
  
  /*
   * Ceph address space ops.
@@ -193,7 +194,8 @@ static int readpage_nounlock(struct file *filp, struct page *page)
  {
         struct inode *inode = filp->f_dentry->d_inode;
         struct ceph_inode_info *ci = ceph_inode(inode);
-       struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
+       struct ceph_osd_client *osdc = 
+               &ceph_inode_to_client(inode)->client->osdc;
         int err = 0;
         u64 len = PAGE_CACHE_SIZE;
  
@@ -265,7 +267,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
  {
         struct inode *inode = file->f_dentry->d_inode;
         struct ceph_inode_info *ci = ceph_inode(inode);
-       struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
+       struct ceph_osd_client *osdc =
+               &ceph_inode_to_client(inode)->client->osdc;
         int rc = 0;
         struct page **pages;
         loff_t offset;
@@ -365,7 +368,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
  {
         struct inode *inode;
         struct ceph_inode_info *ci;
-       struct ceph_client *client;
+       struct ceph_fs_client *fsc;
         struct ceph_osd_client *osdc;
         loff_t page_off = page->index << PAGE_CACHE_SHIFT;
         int len = PAGE_CACHE_SIZE;
@@ -383,8 +386,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
         }
         inode = page->mapping->host;
         ci = ceph_inode(inode);
-       client = ceph_inode_to_client(inode);
-       osdc = &client->osdc;
+       fsc = ceph_inode_to_client(inode);
+       osdc = &fsc->client->osdc;
  
         /* verify this is a writeable snap context */
         snapc = (void *)page->private;
@@ -414,10 +417,10 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
         dout("writepage %p page %p index %lu on %llu~%u snapc %p\n",
              inode, page, page->index, page_off, len, snapc);
  
-       writeback_stat = atomic_long_inc_return(&client->writeback_count);
+       writeback_stat = atomic_long_inc_return(&fsc->writeback_count);
         if (writeback_stat >
-           CONGESTION_ON_THRESH(client->mount_args->congestion_kb))
-               set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC);
+           CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
+               set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC);
  
         set_page_writeback(page);
         err = ceph_osdc_writepages(osdc, ceph_vino(inode),
@@ -496,7 +499,7 @@ static void writepages_finish(struct ceph_osd_request *req,
         struct address_space *mapping = inode->i_mapping;
         __s32 rc = -EIO;
         u64 bytes = 0;
-       struct ceph_client *client = ceph_inode_to_client(inode);
+       struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
         long writeback_stat;
         unsigned issued = ceph_caps_issued(ci);
  
@@ -529,10 +532,10 @@ static void writepages_finish(struct ceph_osd_request *req,
                 WARN_ON(!PageUptodate(page));
  
                 writeback_stat =
-                       atomic_long_dec_return(&client->writeback_count);
+                       atomic_long_dec_return(&fsc->writeback_count);
                 if (writeback_stat <
-                   CONGESTION_OFF_THRESH(client->mount_args->congestion_kb))
-                       clear_bdi_congested(&client->backing_dev_info,
+                   CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb))
+                       clear_bdi_congested(&fsc->backing_dev_info,
                                             BLK_RW_ASYNC);
  
                 ceph_put_snap_context((void *)page->private);
@@ -569,13 +572,13 @@ static void writepages_finish(struct ceph_osd_request *req,
   * mempool.  we avoid the mempool if we can because req->r_num_pages
   * may be less than the maximum write size.
   */
-static void alloc_page_vec(struct ceph_client *client,
+static void alloc_page_vec(struct ceph_fs_client *fsc,
                            struct ceph_osd_request *req)
  {
         req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages,
                                GFP_NOFS);
         if (!req->r_pages) {
-               req->r_pages = mempool_alloc(client->wb_pagevec_pool, GFP_NOFS);
+               req->r_pages = mempool_alloc(fsc->wb_pagevec_pool, GFP_NOFS);
                 req->r_pages_from_pool = 1;
                 WARN_ON(!req->r_pages);
         }
@@ -590,7 +593,7 @@ static int ceph_writepages_start(struct address_space *mapping,
         struct inode *inode = mapping->host;
         struct backing_dev_info *bdi = mapping->backing_dev_info;
         struct ceph_inode_info *ci = ceph_inode(inode);
-       struct ceph_client *client;
+       struct ceph_fs_client *fsc;
         pgoff_t index, start, end;
         int range_whole = 0;
         int should_loop = 1;
@@ -617,13 +620,13 @@ static int ceph_writepages_start(struct address_space *mapping,
              wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
              (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
  
-       client = ceph_inode_to_client(inode);
-       if (client->mount_state == CEPH_MOUNT_SHUTDOWN) {
+       fsc = ceph_inode_to_client(inode);
+       if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {
                 pr_warning("writepage_start %p on forced umount\n", inode);
                 return -EIO; /* we're in a forced umount, don't write! */
         }
-       if (client->mount_args->wsize && client->mount_args->wsize < wsize)
-               wsize = client->mount_args->wsize;
+       if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize)
+               wsize = fsc->mount_options->wsize;
         if (wsize < PAGE_CACHE_SIZE)
                 wsize = PAGE_CACHE_SIZE;
         max_pages_ever = wsize >> PAGE_CACHE_SHIFT;
@@ -769,7 +772,7 @@ get_more_pages:
                                 offset = (unsigned long long)page->index
                                         << PAGE_CACHE_SHIFT;
                                 len = wsize;
-                               req = ceph_osdc_new_request(&client->osdc,
+                               req = ceph_osdc_new_request(&fsc->client->osdc,
                                             &ci->i_layout,
                                             ceph_vino(inode),
                                             offset, &len,
@@ -782,7 +785,7 @@ get_more_pages:
                                             &inode->i_mtime, true, 1);
                                 max_pages = req->r_num_pages;
  
-                               alloc_page_vec(client, req);
+                               alloc_page_vec(fsc, req);
                                 req->r_callback = writepages_finish;
                                 req->r_inode = inode;
                         }
@@ -794,10 +797,10 @@ get_more_pages:
                              inode, page, page->index);
  
                         writeback_stat =
-                              atomic_long_inc_return(&client->writeback_count);
+                              atomic_long_inc_return(&fsc->writeback_count);
                         if (writeback_stat > CONGESTION_ON_THRESH(
-                                   client->mount_args->congestion_kb)) {
-                               set_bdi_congested(&client->backing_dev_info,
+                                   fsc->mount_options->congestion_kb)) {
+                               set_bdi_congested(&fsc->backing_dev_info,
                                                   BLK_RW_ASYNC);
                         }
  
@@ -846,7 +849,7 @@ get_more_pages:
                 op->payload_len = cpu_to_le32(len);
                 req->r_request->hdr.data_len = cpu_to_le32(len);
  
-               ceph_osdc_start_request(&client->osdc, req, true);
+               ceph_osdc_start_request(&fsc->client->osdc, req, true);
                 req = NULL;
  
                 /* continue? */
@@ -915,7 +918,7 @@ static int ceph_update_writeable_page(struct file *file,
  {
         struct inode *inode = file->f_dentry->d_inode;
         struct ceph_inode_info *ci = ceph_inode(inode);
-       struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+       struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
         loff_t page_off = pos & PAGE_CACHE_MASK;
         int pos_in_page = pos & ~PAGE_CACHE_MASK;
         int end_in_page = pos_in_page + len;
@@ -1053,8 +1056,8 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
                           struct page *page, void *fsdata)
  {
         struct inode *inode = file->f_dentry->d_inode;
-       struct ceph_client *client = ceph_inode_to_client(inode);
-       struct ceph_mds_client *mdsc = &client->mdsc;
+       struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+       struct ceph_mds_client *mdsc = fsc->mdsc;
         unsigned from = pos & (PAGE_CACHE_SIZE - 1);
         int check_cap = 0;
  
@@ -1123,7 +1126,7 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
  {
         struct inode *inode = vma->vm_file->f_dentry->d_inode;
         struct page *page = vmf->page;
-       struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+       struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
         loff_t off = page->index << PAGE_CACHE_SHIFT;
         loff_t size, len;
         int ret;
diff --git a/fs/ceph/armor.c b/fs/ceph/armor.c

deleted file mode 100644 (file)

index eb2a666..0000000
--- a/fs/ceph/armor.c
+++ /dev/null
@@ -1,103 +0,0 @@
-
-#include <linux/errno.h>
-
-int ceph_armor(char *dst, const char *src, const char *end);
-int ceph_unarmor(char *dst, const char *src, const char *end);
-
-/*
- * base64 encode/decode.
- */
-
-static const char *pem_key =
-       "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
-
-static int encode_bits(int c)
-{
-       return pem_key[c];
-}
-
-static int decode_bits(char c)
-{
-       if (c >= 'A' && c <= 'Z')
-               return c - 'A';
-       if (c >= 'a' && c <= 'z')
-               return c - 'a' + 26;
-       if (c >= '0' && c <= '9')
-               return c - '0' + 52;
-       if (c == '+')
-               return 62;
-       if (c == '/')
-               return 63;
-       if (c == '=')
-               return 0; /* just non-negative, please */
-       return -EINVAL;
-}
-
-int ceph_armor(char *dst, const char *src, const char *end)
-{
-       int olen = 0;
-       int line = 0;
-
-       while (src < end) {
-               unsigned char a, b, c;
-
-               a = *src++;
-               *dst++ = encode_bits(a >> 2);
-               if (src < end) {
-                       b = *src++;
-                       *dst++ = encode_bits(((a & 3) << 4) | (b >> 4));
-                       if (src < end) {
-                               c = *src++;
-                               *dst++ = encode_bits(((b & 15) << 2) |
-                                                    (c >> 6));
-                               *dst++ = encode_bits(c & 63);
-                       } else {
-                               *dst++ = encode_bits((b & 15) << 2);
-                               *dst++ = '=';
-                       }
-               } else {
-                       *dst++ = encode_bits(((a & 3) << 4));
-                       *dst++ = '=';
-                       *dst++ = '=';
-               }
-               olen += 4;
-               line += 4;
-               if (line == 64) {
-                       line = 0;
-                       *(dst++) = '\n';
-                       olen++;
-               }
-       }
-       return olen;
-}
-
-int ceph_unarmor(char *dst, const char *src, const char *end)
-{
-       int olen = 0;
-
-       while (src < end) {
-               int a, b, c, d;
-
-               if (src < end && src[0] == '\n')
-                       src++;
-               if (src + 4 > end)
-                       return -EINVAL;
-               a = decode_bits(src[0]);
-               b = decode_bits(src[1]);
-               c = decode_bits(src[2]);
-               d = decode_bits(src[3]);
-               if (a < 0 || b < 0 || c < 0 || d < 0)
-                       return -EINVAL;
-
-               *dst++ = (a << 2) | (b >> 4);
-               if (src[2] == '=')
-                       return olen + 1;
-               *dst++ = ((b & 15) << 4) | (c >> 2);
-               if (src[3] == '=')
-                       return olen + 2;
-               *dst++ = ((c & 3) << 6) | d;
-               olen += 3;
-               src += 4;
-       }
-       return olen;
-}
diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c

deleted file mode 100644 (file)

index 6d2e306..0000000
--- a/fs/ceph/auth.c
+++ /dev/null
@@ -1,259 +0,0 @@
-#include "ceph_debug.h"
-
-#include <linux/module.h>
-#include <linux/err.h>
-#include <linux/slab.h>
-
-#include "types.h"
-#include "auth_none.h"
-#include "auth_x.h"
-#include "decode.h"
-#include "super.h"
-
-#include "messenger.h"
-
-/*
- * get protocol handler
- */
-static u32 supported_protocols[] = {
-       CEPH_AUTH_NONE,
-       CEPH_AUTH_CEPHX
-};
-
-static int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol)
-{
-       switch (protocol) {
-       case CEPH_AUTH_NONE:
-               return ceph_auth_none_init(ac);
-       case CEPH_AUTH_CEPHX:
-               return ceph_x_init(ac);
-       default:
-               return -ENOENT;
-       }
-}
-
-/*
- * setup, teardown.
- */
-struct ceph_auth_client *ceph_auth_init(const char *name, const char *secret)
-{
-       struct ceph_auth_client *ac;
-       int ret;
-
-       dout("auth_init name '%s' secret '%s'\n", name, secret);
-
-       ret = -ENOMEM;
-       ac = kzalloc(sizeof(*ac), GFP_NOFS);
-       if (!ac)
-               goto out;
-
-       ac->negotiating = true;
-       if (name)
-               ac->name = name;
-       else
-               ac->name = CEPH_AUTH_NAME_DEFAULT;
-       dout("auth_init name %s secret %s\n", ac->name, secret);
-       ac->secret = secret;
-       return ac;
-
-out:
-       return ERR_PTR(ret);
-}
-
-void ceph_auth_destroy(struct ceph_auth_client *ac)
-{
-       dout("auth_destroy %p\n", ac);
-       if (ac->ops)
-               ac->ops->destroy(ac);
-       kfree(ac);
-}
-
-/*
- * Reset occurs when reconnecting to the monitor.
- */
-void ceph_auth_reset(struct ceph_auth_client *ac)
-{
-       dout("auth_reset %p\n", ac);
-       if (ac->ops && !ac->negotiating)
-               ac->ops->reset(ac);
-       ac->negotiating = true;
-}
-
-int ceph_entity_name_encode(const char *name, void **p, void *end)
-{
-       int len = strlen(name);
-
-       if (*p + 2*sizeof(u32) + len > end)
-               return -ERANGE;
-       ceph_encode_32(p, CEPH_ENTITY_TYPE_CLIENT);
-       ceph_encode_32(p, len);
-       ceph_encode_copy(p, name, len);
-       return 0;
-}
-
-/*
- * Initiate protocol negotiation with monitor.  Include entity name
- * and list supported protocols.
- */
-int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)
-{
-       struct ceph_mon_request_header *monhdr = buf;
-       void *p = monhdr + 1, *end = buf + len, *lenp;
-       int i, num;
-       int ret;
-
-       dout("auth_build_hello\n");
-       monhdr->have_version = 0;
-       monhdr->session_mon = cpu_to_le16(-1);
-       monhdr->session_mon_tid = 0;
-
-       ceph_encode_32(&p, 0);  /* no protocol, yet */
-
-       lenp = p;
-       p += sizeof(u32);
-
-       ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
-       ceph_encode_8(&p, 1);
-       num = ARRAY_SIZE(supported_protocols);
-       ceph_encode_32(&p, num);
-       ceph_decode_need(&p, end, num * sizeof(u32), bad);
-       for (i = 0; i < num; i++)
-               ceph_encode_32(&p, supported_protocols[i]);
-
-       ret = ceph_entity_name_encode(ac->name, &p, end);
-       if (ret < 0)
-               return ret;
-       ceph_decode_need(&p, end, sizeof(u64), bad);
-       ceph_encode_64(&p, ac->global_id);
-
-       ceph_encode_32(&lenp, p - lenp - sizeof(u32));
-       return p - buf;
-
-bad:
-       return -ERANGE;
-}
-
-static int ceph_build_auth_request(struct ceph_auth_client *ac,
-                                  void *msg_buf, size_t msg_len)
-{
-       struct ceph_mon_request_header *monhdr = msg_buf;
-       void *p = monhdr + 1;
-       void *end = msg_buf + msg_len;
-       int ret;
-
-       monhdr->have_version = 0;
-       monhdr->session_mon = cpu_to_le16(-1);
-       monhdr->session_mon_tid = 0;
-
-       ceph_encode_32(&p, ac->protocol);
-
-       ret = ac->ops->build_request(ac, p + sizeof(u32), end);
-       if (ret < 0) {
-               pr_err("error %d building auth method %s request\n", ret,
-                      ac->ops->name);
-               return ret;
-       }
-       dout(" built request %d bytes\n", ret);
-       ceph_encode_32(&p, ret);
-       return p + ret - msg_buf;
-}
-
-/*
- * Handle auth message from monitor.
- */
-int ceph_handle_auth_reply(struct ceph_auth_client *ac,
-                          void *buf, size_t len,
-                          void *reply_buf, size_t reply_len)
-{
-       void *p = buf;
-       void *end = buf + len;
-       int protocol;
-       s32 result;
-       u64 global_id;
-       void *payload, *payload_end;
-       int payload_len;
-       char *result_msg;
-       int result_msg_len;
-       int ret = -EINVAL;
-
-       dout("handle_auth_reply %p %p\n", p, end);
-       ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad);
-       protocol = ceph_decode_32(&p);
-       result = ceph_decode_32(&p);
-       global_id = ceph_decode_64(&p);
-       payload_len = ceph_decode_32(&p);
-       payload = p;
-       p += payload_len;
-       ceph_decode_need(&p, end, sizeof(u32), bad);
-       result_msg_len = ceph_decode_32(&p);
-       result_msg = p;
-       p += result_msg_len;
-       if (p != end)
-               goto bad;
-
-       dout(" result %d '%.*s' gid %llu len %d\n", result, result_msg_len,
-            result_msg, global_id, payload_len);
-
-       payload_end = payload + payload_len;
-
-       if (global_id && ac->global_id != global_id) {
-               dout(" set global_id %lld -> %lld\n", ac->global_id, global_id);
-               ac->global_id = global_id;
-       }
-
-       if (ac->negotiating) {
-               /* server does not support our protocols? */
-               if (!protocol && result < 0) {
-                       ret = result;
-                       goto out;
-               }
-               /* set up (new) protocol handler? */
-               if (ac->protocol && ac->protocol != protocol) {
-                       ac->ops->destroy(ac);
-                       ac->protocol = 0;
-                       ac->ops = NULL;
-               }
-               if (ac->protocol != protocol) {
-                       ret = ceph_auth_init_protocol(ac, protocol);
-                       if (ret) {
-                               pr_err("error %d on auth protocol %d init\n",
-                                      ret, protocol);
-                               goto out;
-                       }
-               }
-
-               ac->negotiating = false;
-       }
-
-       ret = ac->ops->handle_reply(ac, result, payload, payload_end);
-       if (ret == -EAGAIN) {
-               return ceph_build_auth_request(ac, reply_buf, reply_len);
-       } else if (ret) {
-               pr_err("auth method '%s' error %d\n", ac->ops->name, ret);
-               return ret;
-       }
-       return 0;
-
-bad:
-       pr_err("failed to decode auth msg\n");
-out:
-       return ret;
-}
-
-int ceph_build_auth(struct ceph_auth_client *ac,
-                   void *msg_buf, size_t msg_len)
-{
-       if (!ac->protocol)
-               return ceph_auth_build_hello(ac, msg_buf, msg_len);
-       BUG_ON(!ac->ops);
-       if (ac->ops->should_authenticate(ac))
-               return ceph_build_auth_request(ac, msg_buf, msg_len);
-       return 0;
-}
-
-int ceph_auth_is_authenticated(struct ceph_auth_client *ac)
-{
-       if (!ac->ops)
-               return 0;
-       return ac->ops->is_authenticated(ac);
-}
diff --git a/fs/ceph/auth.h b/fs/ceph/auth.h

deleted file mode 100644 (file)

index d38a2fb..0000000
--- a/fs/ceph/auth.h
+++ /dev/null
@@ -1,92 +0,0 @@
-#ifndef _FS_CEPH_AUTH_H
-#define _FS_CEPH_AUTH_H
-
-#include "types.h"
-#include "buffer.h"
-
-/*
- * Abstract interface for communicating with the authenticate module.
- * There is some handshake that takes place between us and the monitor
- * to acquire the necessary keys.  These are used to generate an
- * 'authorizer' that we use when connecting to a service (mds, osd).
- */
-
-struct ceph_auth_client;
-struct ceph_authorizer;
-
-struct ceph_auth_client_ops {
-       const char *name;
-
-       /*
-        * true if we are authenticated and can connect to
-        * services.
-        */
-       int (*is_authenticated)(struct ceph_auth_client *ac);
-
-       /*
-        * true if we should (re)authenticate, e.g., when our tickets
-        * are getting old and crusty.
-        */
-       int (*should_authenticate)(struct ceph_auth_client *ac);
-
-       /*
-        * build requests and process replies during monitor
-        * handshake.  if handle_reply returns -EAGAIN, we build
-        * another request.
-        */
-       int (*build_request)(struct ceph_auth_client *ac, void *buf, void *end);
-       int (*handle_reply)(struct ceph_auth_client *ac, int result,
-                           void *buf, void *end);
-
-       /*
-        * Create authorizer for connecting to a service, and verify
-        * the response to authenticate the service.
-        */
-       int (*create_authorizer)(struct ceph_auth_client *ac, int peer_type,
-                                struct ceph_authorizer **a,
-                                void **buf, size_t *len,
-                                void **reply_buf, size_t *reply_len);
-       int (*verify_authorizer_reply)(struct ceph_auth_client *ac,
-                                      struct ceph_authorizer *a, size_t len);
-       void (*destroy_authorizer)(struct ceph_auth_client *ac,
-                                  struct ceph_authorizer *a);
-       void (*invalidate_authorizer)(struct ceph_auth_client *ac,
-                                     int peer_type);
-
-       /* reset when we (re)connect to a monitor */
-       void (*reset)(struct ceph_auth_client *ac);
-
-       void (*destroy)(struct ceph_auth_client *ac);
-};
-
-struct ceph_auth_client {
-       u32 protocol;           /* CEPH_AUTH_* */
-       void *private;          /* for use by protocol implementation */
-       const struct ceph_auth_client_ops *ops;  /* null iff protocol==0 */
-
-       bool negotiating;       /* true if negotiating protocol */
-       const char *name;       /* entity name */
-       u64 global_id;          /* our unique id in system */
-       const char *secret;     /* our secret key */
-       unsigned want_keys;     /* which services we want */
-};
-
-extern struct ceph_auth_client *ceph_auth_init(const char *name,
-                                              const char *secret);
-extern void ceph_auth_destroy(struct ceph_auth_client *ac);
-
-extern void ceph_auth_reset(struct ceph_auth_client *ac);
-
-extern int ceph_auth_build_hello(struct ceph_auth_client *ac,
-                                void *buf, size_t len);
-extern int ceph_handle_auth_reply(struct ceph_auth_client *ac,
-                                 void *buf, size_t len,
-                                 void *reply_buf, size_t reply_len);
-extern int ceph_entity_name_encode(const char *name, void **p, void *end);
-
-extern int ceph_build_auth(struct ceph_auth_client *ac,
-                   void *msg_buf, size_t msg_len);
-
-extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac);
-
-#endif
diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c

deleted file mode 100644 (file)

index ad1dc21..0000000
--- a/fs/ceph/auth_none.c
+++ /dev/null
@@ -1,131 +0,0 @@
-
-#include "ceph_debug.h"
-
-#include <linux/err.h>
-#include <linux/module.h>
-#include <linux/random.h>
-#include <linux/slab.h>
-
-#include "auth_none.h"
-#include "auth.h"
-#include "decode.h"
-
-static void reset(struct ceph_auth_client *ac)
-{
-       struct ceph_auth_none_info *xi = ac->private;
-
-       xi->starting = true;
-       xi->built_authorizer = false;
-}
-
-static void destroy(struct ceph_auth_client *ac)
-{
-       kfree(ac->private);
-       ac->private = NULL;
-}
-
-static int is_authenticated(struct ceph_auth_client *ac)
-{
-       struct ceph_auth_none_info *xi = ac->private;
-
-       return !xi->starting;
-}
-
-static int should_authenticate(struct ceph_auth_client *ac)
-{
-       struct ceph_auth_none_info *xi = ac->private;
-
-       return xi->starting;
-}
-
-/*
- * the generic auth code decode the global_id, and we carry no actual
- * authenticate state, so nothing happens here.
- */
-static int handle_reply(struct ceph_auth_client *ac, int result,
-                       void *buf, void *end)
-{
-       struct ceph_auth_none_info *xi = ac->private;
-
-       xi->starting = false;
-       return result;
-}
-
-/*
- * build an 'authorizer' with our entity_name and global_id.  we can
- * reuse a single static copy since it is identical for all services
- * we connect to.
- */
-static int ceph_auth_none_create_authorizer(
-       struct ceph_auth_client *ac, int peer_type,
-       struct ceph_authorizer **a,
-       void **buf, size_t *len,
-       void **reply_buf, size_t *reply_len)
-{
-       struct ceph_auth_none_info *ai = ac->private;
-       struct ceph_none_authorizer *au = &ai->au;
-       void *p, *end;
-       int ret;
-
-       if (!ai->built_authorizer) {
-               p = au->buf;
-               end = p + sizeof(au->buf);
-               ceph_encode_8(&p, 1);
-               ret = ceph_entity_name_encode(ac->name, &p, end - 8);
-               if (ret < 0)
-                       goto bad;
-               ceph_decode_need(&p, end, sizeof(u64), bad2);
-               ceph_encode_64(&p, ac->global_id);
-               au->buf_len = p - (void *)au->buf;
-               ai->built_authorizer = true;
-               dout("built authorizer len %d\n", au->buf_len);
-       }
-
-       *a = (struct ceph_authorizer *)au;
-       *buf = au->buf;
-       *len = au->buf_len;
-       *reply_buf = au->reply_buf;
-       *reply_len = sizeof(au->reply_buf);
-       return 0;
-
-bad2:
-       ret = -ERANGE;
-bad:
-       return ret;
-}
-
-static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
-                                     struct ceph_authorizer *a)
-{
-       /* nothing to do */
-}
-
-static const struct ceph_auth_client_ops ceph_auth_none_ops = {
-       .name = "none",
-       .reset = reset,
-       .destroy = destroy,
-       .is_authenticated = is_authenticated,
-       .should_authenticate = should_authenticate,
-       .handle_reply = handle_reply,
-       .create_authorizer = ceph_auth_none_create_authorizer,
-       .destroy_authorizer = ceph_auth_none_destroy_authorizer,
-};
-
-int ceph_auth_none_init(struct ceph_auth_client *ac)
-{
-       struct ceph_auth_none_info *xi;
-
-       dout("ceph_auth_none_init %p\n", ac);
-       xi = kzalloc(sizeof(*xi), GFP_NOFS);
-       if (!xi)
-               return -ENOMEM;
-
-       xi->starting = true;
-       xi->built_authorizer = false;
-
-       ac->protocol = CEPH_AUTH_NONE;
-       ac->private = xi;
-       ac->ops = &ceph_auth_none_ops;
-       return 0;
-}
-
diff --git a/fs/ceph/auth_none.h b/fs/ceph/auth_none.h

deleted file mode 100644 (file)

index 8164df1..0000000
--- a/fs/ceph/auth_none.h
+++ /dev/null
@@ -1,30 +0,0 @@
-#ifndef _FS_CEPH_AUTH_NONE_H
-#define _FS_CEPH_AUTH_NONE_H
-
-#include <linux/slab.h>
-
-#include "auth.h"
-
-/*
- * null security mode.
- *
- * we use a single static authorizer that simply encodes our entity name
- * and global id.
- */
-
-struct ceph_none_authorizer {
-       char buf[128];
-       int buf_len;
-       char reply_buf[0];
-};
-
-struct ceph_auth_none_info {
-       bool starting;
-       bool built_authorizer;
-       struct ceph_none_authorizer au;   /* we only need one; it's static */
-};
-
-extern int ceph_auth_none_init(struct ceph_auth_client *ac);
-
-#endif
-
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c

deleted file mode 100644 (file)

index a2d002c..0000000
--- a/fs/ceph/auth_x.c
+++ /dev/null
@@ -1,687 +0,0 @@
-
-#include "ceph_debug.h"
-
-#include <linux/err.h>
-#include <linux/module.h>
-#include <linux/random.h>
-#include <linux/slab.h>
-
-#include "auth_x.h"
-#include "auth_x_protocol.h"
-#include "crypto.h"
-#include "auth.h"
-#include "decode.h"
-
-#define TEMP_TICKET_BUF_LEN    256
-
-static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed);
-
-static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
-{
-       struct ceph_x_info *xi = ac->private;
-       int need;
-
-       ceph_x_validate_tickets(ac, &need);
-       dout("ceph_x_is_authenticated want=%d need=%d have=%d\n",
-            ac->want_keys, need, xi->have_keys);
-       return (ac->want_keys & xi->have_keys) == ac->want_keys;
-}
-
-static int ceph_x_should_authenticate(struct ceph_auth_client *ac)
-{
-       struct ceph_x_info *xi = ac->private;
-       int need;
-
-       ceph_x_validate_tickets(ac, &need);
-       dout("ceph_x_should_authenticate want=%d need=%d have=%d\n",
-            ac->want_keys, need, xi->have_keys);
-       return need != 0;
-}
-
-static int ceph_x_encrypt_buflen(int ilen)
-{
-       return sizeof(struct ceph_x_encrypt_header) + ilen + 16 +
-               sizeof(u32);
-}
-
-static int ceph_x_encrypt(struct ceph_crypto_key *secret,
-                         void *ibuf, int ilen, void *obuf, size_t olen)
-{
-       struct ceph_x_encrypt_header head = {
-               .struct_v = 1,
-               .magic = cpu_to_le64(CEPHX_ENC_MAGIC)
-       };
-       size_t len = olen - sizeof(u32);
-       int ret;
-
-       ret = ceph_encrypt2(secret, obuf + sizeof(u32), &len,
-                           &head, sizeof(head), ibuf, ilen);
-       if (ret)
-               return ret;
-       ceph_encode_32(&obuf, len);
-       return len + sizeof(u32);
-}
-
-static int ceph_x_decrypt(struct ceph_crypto_key *secret,
-                         void **p, void *end, void *obuf, size_t olen)
-{
-       struct ceph_x_encrypt_header head;
-       size_t head_len = sizeof(head);
-       int len, ret;
-
-       len = ceph_decode_32(p);
-       if (*p + len > end)
-               return -EINVAL;
-
-       dout("ceph_x_decrypt len %d\n", len);
-       ret = ceph_decrypt2(secret, &head, &head_len, obuf, &olen,
-                           *p, len);
-       if (ret)
-               return ret;
-       if (head.struct_v != 1 || le64_to_cpu(head.magic) != CEPHX_ENC_MAGIC)
-               return -EPERM;
-       *p += len;
-       return olen;
-}
-
-/*
- * get existing (or insert new) ticket handler
- */
-static struct ceph_x_ticket_handler *
-get_ticket_handler(struct ceph_auth_client *ac, int service)
-{
-       struct ceph_x_ticket_handler *th;
-       struct ceph_x_info *xi = ac->private;
-       struct rb_node *parent = NULL, **p = &xi->ticket_handlers.rb_node;
-
-       while (*p) {
-               parent = *p;
-               th = rb_entry(parent, struct ceph_x_ticket_handler, node);
-               if (service < th->service)
-                       p = &(*p)->rb_left;
-               else if (service > th->service)
-                       p = &(*p)->rb_right;
-               else
-                       return th;
-       }
-
-       /* add it */
-       th = kzalloc(sizeof(*th), GFP_NOFS);
-       if (!th)
-               return ERR_PTR(-ENOMEM);
-       th->service = service;
-       rb_link_node(&th->node, parent, p);
-       rb_insert_color(&th->node, &xi->ticket_handlers);
-       return th;
-}
-
-static void remove_ticket_handler(struct ceph_auth_client *ac,
-                                 struct ceph_x_ticket_handler *th)
-{
-       struct ceph_x_info *xi = ac->private;
-
-       dout("remove_ticket_handler %p %d\n", th, th->service);
-       rb_erase(&th->node, &xi->ticket_handlers);
-       ceph_crypto_key_destroy(&th->session_key);
-       if (th->ticket_blob)
-               ceph_buffer_put(th->ticket_blob);
-       kfree(th);
-}
-
-static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
-                                   struct ceph_crypto_key *secret,
-                                   void *buf, void *end)
-{
-       struct ceph_x_info *xi = ac->private;
-       int num;
-       void *p = buf;
-       int ret;
-       char *dbuf;
-       char *ticket_buf;
-       u8 reply_struct_v;
-
-       dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
-       if (!dbuf)
-               return -ENOMEM;
-
-       ret = -ENOMEM;
-       ticket_buf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
-       if (!ticket_buf)
-               goto out_dbuf;
-
-       ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
-       reply_struct_v = ceph_decode_8(&p);
-       if (reply_struct_v != 1)
-               goto bad;
-       num = ceph_decode_32(&p);
-       dout("%d tickets\n", num);
-       while (num--) {
-               int type;
-               u8 tkt_struct_v, blob_struct_v;
-               struct ceph_x_ticket_handler *th;
-               void *dp, *dend;
-               int dlen;
-               char is_enc;
-               struct timespec validity;
-               struct ceph_crypto_key old_key;
-               void *tp, *tpend;
-               struct ceph_timespec new_validity;
-               struct ceph_crypto_key new_session_key;
-               struct ceph_buffer *new_ticket_blob;
-               unsigned long new_expires, new_renew_after;
-               u64 new_secret_id;
-
-               ceph_decode_need(&p, end, sizeof(u32) + 1, bad);
-
-               type = ceph_decode_32(&p);
-               dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));
-
-               tkt_struct_v = ceph_decode_8(&p);
-               if (tkt_struct_v != 1)
-                       goto bad;
-
-               th = get_ticket_handler(ac, type);
-               if (IS_ERR(th)) {
-                       ret = PTR_ERR(th);
-                       goto out;
-               }
-
-               /* blob for me */
-               dlen = ceph_x_decrypt(secret, &p, end, dbuf,
-                                     TEMP_TICKET_BUF_LEN);
-               if (dlen <= 0) {
-                       ret = dlen;
-                       goto out;
-               }
-               dout(" decrypted %d bytes\n", dlen);
-               dend = dbuf + dlen;
-               dp = dbuf;
-
-               tkt_struct_v = ceph_decode_8(&dp);
-               if (tkt_struct_v != 1)
-                       goto bad;
-
-               memcpy(&old_key, &th->session_key, sizeof(old_key));
-               ret = ceph_crypto_key_decode(&new_session_key, &dp, dend);
-               if (ret)
-                       goto out;
-
-               ceph_decode_copy(&dp, &new_validity, sizeof(new_validity));
-               ceph_decode_timespec(&validity, &new_validity);
-               new_expires = get_seconds() + validity.tv_sec;
-               new_renew_after = new_expires - (validity.tv_sec / 4);
-               dout(" expires=%lu renew_after=%lu\n", new_expires,
-                    new_renew_after);
-
-               /* ticket blob for service */
-               ceph_decode_8_safe(&p, end, is_enc, bad);
-               tp = ticket_buf;
-               if (is_enc) {
-                       /* encrypted */
-                       dout(" encrypted ticket\n");
-                       dlen = ceph_x_decrypt(&old_key, &p, end, ticket_buf,
-                                             TEMP_TICKET_BUF_LEN);
-                       if (dlen < 0) {
-                               ret = dlen;
-                               goto out;
-                       }
-                       dlen = ceph_decode_32(&tp);
-               } else {
-                       /* unencrypted */
-                       ceph_decode_32_safe(&p, end, dlen, bad);
-                       ceph_decode_need(&p, end, dlen, bad);
-                       ceph_decode_copy(&p, ticket_buf, dlen);
-               }
-               tpend = tp + dlen;
-               dout(" ticket blob is %d bytes\n", dlen);
-               ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
-               blob_struct_v = ceph_decode_8(&tp);
-               new_secret_id = ceph_decode_64(&tp);
-               ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend);
-               if (ret)
-                       goto out;
-
-               /* all is well, update our ticket */
-               ceph_crypto_key_destroy(&th->session_key);
-               if (th->ticket_blob)
-                       ceph_buffer_put(th->ticket_blob);
-               th->session_key = new_session_key;
-               th->ticket_blob = new_ticket_blob;
-               th->validity = new_validity;
-               th->secret_id = new_secret_id;
-               th->expires = new_expires;
-               th->renew_after = new_renew_after;
-               dout(" got ticket service %d (%s) secret_id %lld len %d\n",
-                    type, ceph_entity_type_name(type), th->secret_id,
-                    (int)th->ticket_blob->vec.iov_len);
-               xi->have_keys |= th->service;
-       }
-
-       ret = 0;
-out:
-       kfree(ticket_buf);
-out_dbuf:
-       kfree(dbuf);
-       return ret;
-
-bad:
-       ret = -EINVAL;
-       goto out;
-}
-
-static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
-                                  struct ceph_x_ticket_handler *th,
-                                  struct ceph_x_authorizer *au)
-{
-       int maxlen;
-       struct ceph_x_authorize_a *msg_a;
-       struct ceph_x_authorize_b msg_b;
-       void *p, *end;
-       int ret;
-       int ticket_blob_len =
-               (th->ticket_blob ? th->ticket_blob->vec.iov_len : 0);
-
-       dout("build_authorizer for %s %p\n",
-            ceph_entity_type_name(th->service), au);
-
-       maxlen = sizeof(*msg_a) + sizeof(msg_b) +
-               ceph_x_encrypt_buflen(ticket_blob_len);
-       dout("  need len %d\n", maxlen);
-       if (au->buf && au->buf->alloc_len < maxlen) {
-               ceph_buffer_put(au->buf);
-               au->buf = NULL;
-       }
-       if (!au->buf) {
-               au->buf = ceph_buffer_new(maxlen, GFP_NOFS);
-               if (!au->buf)
-                       return -ENOMEM;
-       }
-       au->service = th->service;
-
-       msg_a = au->buf->vec.iov_base;
-       msg_a->struct_v = 1;
-       msg_a->global_id = cpu_to_le64(ac->global_id);
-       msg_a->service_id = cpu_to_le32(th->service);
-       msg_a->ticket_blob.struct_v = 1;
-       msg_a->ticket_blob.secret_id = cpu_to_le64(th->secret_id);
-       msg_a->ticket_blob.blob_len = cpu_to_le32(ticket_blob_len);
-       if (ticket_blob_len) {
-               memcpy(msg_a->ticket_blob.blob, th->ticket_blob->vec.iov_base,
-                      th->ticket_blob->vec.iov_len);
-       }
-       dout(" th %p secret_id %lld %lld\n", th, th->secret_id,
-            le64_to_cpu(msg_a->ticket_blob.secret_id));
-
-       p = msg_a + 1;
-       p += ticket_blob_len;
-       end = au->buf->vec.iov_base + au->buf->vec.iov_len;
-
-       get_random_bytes(&au->nonce, sizeof(au->nonce));
-       msg_b.struct_v = 1;
-       msg_b.nonce = cpu_to_le64(au->nonce);
-       ret = ceph_x_encrypt(&th->session_key, &msg_b, sizeof(msg_b),
-                            p, end - p);
-       if (ret < 0)
-               goto out_buf;
-       p += ret;
-       au->buf->vec.iov_len = p - au->buf->vec.iov_base;
-       dout(" built authorizer nonce %llx len %d\n", au->nonce,
-            (int)au->buf->vec.iov_len);
-       BUG_ON(au->buf->vec.iov_len > maxlen);
-       return 0;
-
-out_buf:
-       ceph_buffer_put(au->buf);
-       au->buf = NULL;
-       return ret;
-}
-
-static int ceph_x_encode_ticket(struct ceph_x_ticket_handler *th,
-                               void **p, void *end)
-{
-       ceph_decode_need(p, end, 1 + sizeof(u64), bad);
-       ceph_encode_8(p, 1);
-       ceph_encode_64(p, th->secret_id);
-       if (th->ticket_blob) {
-               const char *buf = th->ticket_blob->vec.iov_base;
-               u32 len = th->ticket_blob->vec.iov_len;
-
-               ceph_encode_32_safe(p, end, len, bad);
-               ceph_encode_copy_safe(p, end, buf, len, bad);
-       } else {
-               ceph_encode_32_safe(p, end, 0, bad);
-       }
-
-       return 0;
-bad:
-       return -ERANGE;
-}
-
-static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
-{
-       int want = ac->want_keys;
-       struct ceph_x_info *xi = ac->private;
-       int service;
-
-       *pneed = ac->want_keys & ~(xi->have_keys);
-
-       for (service = 1; service <= want; service <<= 1) {
-               struct ceph_x_ticket_handler *th;
-
-               if (!(ac->want_keys & service))
-                       continue;
-
-               if (*pneed & service)
-                       continue;
-
-               th = get_ticket_handler(ac, service);
-
-               if (IS_ERR(th)) {
-                       *pneed |= service;
-                       continue;
-               }
-
-               if (get_seconds() >= th->renew_after)
-                       *pneed |= service;
-               if (get_seconds() >= th->expires)
-                       xi->have_keys &= ~service;
-       }
-}
-
-
-static int ceph_x_build_request(struct ceph_auth_client *ac,
-                               void *buf, void *end)
-{
-       struct ceph_x_info *xi = ac->private;
-       int need;
-       struct ceph_x_request_header *head = buf;
-       int ret;
-       struct ceph_x_ticket_handler *th =
-               get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
-
-       if (IS_ERR(th))
-               return PTR_ERR(th);
-
-       ceph_x_validate_tickets(ac, &need);
-
-       dout("build_request want %x have %x need %x\n",
-            ac->want_keys, xi->have_keys, need);
-
-       if (need & CEPH_ENTITY_TYPE_AUTH) {
-               struct ceph_x_authenticate *auth = (void *)(head + 1);
-               void *p = auth + 1;
-               struct ceph_x_challenge_blob tmp;
-               char tmp_enc[40];
-               u64 *u;
-
-               if (p > end)
-                       return -ERANGE;
-
-               dout(" get_auth_session_key\n");
-               head->op = cpu_to_le16(CEPHX_GET_AUTH_SESSION_KEY);
-
-               /* encrypt and hash */
-               get_random_bytes(&auth->client_challenge, sizeof(u64));
-               tmp.client_challenge = auth->client_challenge;
-               tmp.server_challenge = cpu_to_le64(xi->server_challenge);
-               ret = ceph_x_encrypt(&xi->secret, &tmp, sizeof(tmp),
-                                    tmp_enc, sizeof(tmp_enc));
-               if (ret < 0)
-                       return ret;
-
-               auth->struct_v = 1;
-               auth->key = 0;
-               for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++)
-                       auth->key ^= *(__le64 *)u;
-               dout(" server_challenge %llx client_challenge %llx key %llx\n",
-                    xi->server_challenge, le64_to_cpu(auth->client_challenge),
-                    le64_to_cpu(auth->key));
-
-               /* now encode the old ticket if exists */
-               ret = ceph_x_encode_ticket(th, &p, end);
-               if (ret < 0)
-                       return ret;
-
-               return p - buf;
-       }
-
-       if (need) {
-               void *p = head + 1;
-               struct ceph_x_service_ticket_request *req;
-
-               if (p > end)
-                       return -ERANGE;
-               head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY);
-
-               ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer);
-               if (ret)
-                       return ret;
-               ceph_encode_copy(&p, xi->auth_authorizer.buf->vec.iov_base,
-                                xi->auth_authorizer.buf->vec.iov_len);
-
-               req = p;
-               req->keys = cpu_to_le32(need);
-               p += sizeof(*req);
-               return p - buf;
-       }
-
-       return 0;
-}
-
-static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
-                              void *buf, void *end)
-{
-       struct ceph_x_info *xi = ac->private;
-       struct ceph_x_reply_header *head = buf;
-       struct ceph_x_ticket_handler *th;
-       int len = end - buf;
-       int op;
-       int ret;
-
-       if (result)
-               return result;  /* XXX hmm? */
-
-       if (xi->starting) {
-               /* it's a hello */
-               struct ceph_x_server_challenge *sc = buf;
-
-               if (len != sizeof(*sc))
-                       return -EINVAL;
-               xi->server_challenge = le64_to_cpu(sc->server_challenge);
-               dout("handle_reply got server challenge %llx\n",
-                    xi->server_challenge);
-               xi->starting = false;
-               xi->have_keys &= ~CEPH_ENTITY_TYPE_AUTH;
-               return -EAGAIN;
-       }
-
-       op = le16_to_cpu(head->op);
-       result = le32_to_cpu(head->result);
-       dout("handle_reply op %d result %d\n", op, result);
-       switch (op) {
-       case CEPHX_GET_AUTH_SESSION_KEY:
-               /* verify auth key */
-               ret = ceph_x_proc_ticket_reply(ac, &xi->secret,
-                                              buf + sizeof(*head), end);
-               break;
-
-       case CEPHX_GET_PRINCIPAL_SESSION_KEY:
-               th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
-               if (IS_ERR(th))
-                       return PTR_ERR(th);
-               ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
-                                              buf + sizeof(*head), end);
-               break;
-
-       default:
-               return -EINVAL;
-       }
-       if (ret)
-               return ret;
-       if (ac->want_keys == xi->have_keys)
-               return 0;
-       return -EAGAIN;
-}
-
-static int ceph_x_create_authorizer(
-       struct ceph_auth_client *ac, int peer_type,
-       struct ceph_authorizer **a,
-       void **buf, size_t *len,
-       void **reply_buf, size_t *reply_len)
-{
-       struct ceph_x_authorizer *au;
-       struct ceph_x_ticket_handler *th;
-       int ret;
-
-       th = get_ticket_handler(ac, peer_type);
-       if (IS_ERR(th))
-               return PTR_ERR(th);
-
-       au = kzalloc(sizeof(*au), GFP_NOFS);
-       if (!au)
-               return -ENOMEM;
-
-       ret = ceph_x_build_authorizer(ac, th, au);
-       if (ret) {
-               kfree(au);
-               return ret;
-       }
-
-       *a = (struct ceph_authorizer *)au;
-       *buf = au->buf->vec.iov_base;
-       *len = au->buf->vec.iov_len;
-       *reply_buf = au->reply_buf;
-       *reply_len = sizeof(au->reply_buf);
-       return 0;
-}
-
-static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
-                                         struct ceph_authorizer *a, size_t len)
-{
-       struct ceph_x_authorizer *au = (void *)a;
-       struct ceph_x_ticket_handler *th;
-       int ret = 0;
-       struct ceph_x_authorize_reply reply;
-       void *p = au->reply_buf;
-       void *end = p + sizeof(au->reply_buf);
-
-       th = get_ticket_handler(ac, au->service);
-       if (IS_ERR(th))
-               return PTR_ERR(th);
-       ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply));
-       if (ret < 0)
-               return ret;
-       if (ret != sizeof(reply))
-               return -EPERM;
-
-       if (au->nonce + 1 != le64_to_cpu(reply.nonce_plus_one))
-               ret = -EPERM;
-       else
-               ret = 0;
-       dout("verify_authorizer_reply nonce %llx got %llx ret %d\n",
-            au->nonce, le64_to_cpu(reply.nonce_plus_one), ret);
-       return ret;
-}
-
-static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac,
-                                     struct ceph_authorizer *a)
-{
-       struct ceph_x_authorizer *au = (void *)a;
-
-       ceph_buffer_put(au->buf);
-       kfree(au);
-}
-
-
-static void ceph_x_reset(struct ceph_auth_client *ac)
-{
-       struct ceph_x_info *xi = ac->private;
-
-       dout("reset\n");
-       xi->starting = true;
-       xi->server_challenge = 0;
-}
-
-static void ceph_x_destroy(struct ceph_auth_client *ac)
-{
-       struct ceph_x_info *xi = ac->private;
-       struct rb_node *p;
-
-       dout("ceph_x_destroy %p\n", ac);
-       ceph_crypto_key_destroy(&xi->secret);
-
-       while ((p = rb_first(&xi->ticket_handlers)) != NULL) {
-               struct ceph_x_ticket_handler *th =
-                       rb_entry(p, struct ceph_x_ticket_handler, node);
-               remove_ticket_handler(ac, th);
-       }
-
-       if (xi->auth_authorizer.buf)
-               ceph_buffer_put(xi->auth_authorizer.buf);
-
-       kfree(ac->private);
-       ac->private = NULL;
-}
-
-static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
-                                  int peer_type)
-{
-       struct ceph_x_ticket_handler *th;
-
-       th = get_ticket_handler(ac, peer_type);
-       if (!IS_ERR(th))
-               remove_ticket_handler(ac, th);
-}
-
-
-static const struct ceph_auth_client_ops ceph_x_ops = {
-       .name = "x",
-       .is_authenticated = ceph_x_is_authenticated,
-       .should_authenticate = ceph_x_should_authenticate,
-       .build_request = ceph_x_build_request,
-       .handle_reply = ceph_x_handle_reply,
-       .create_authorizer = ceph_x_create_authorizer,
-       .verify_authorizer_reply = ceph_x_verify_authorizer_reply,
-       .destroy_authorizer = ceph_x_destroy_authorizer,
-       .invalidate_authorizer = ceph_x_invalidate_authorizer,
-       .reset =  ceph_x_reset,
-       .destroy = ceph_x_destroy,
-};
-
-
-int ceph_x_init(struct ceph_auth_client *ac)
-{
-       struct ceph_x_info *xi;
-       int ret;
-
-       dout("ceph_x_init %p\n", ac);
-       ret = -ENOMEM;
-       xi = kzalloc(sizeof(*xi), GFP_NOFS);
-       if (!xi)
-               goto out;
-
-       ret = -EINVAL;
-       if (!ac->secret) {
-               pr_err("no secret set (for auth_x protocol)\n");
-               goto out_nomem;
-       }
-
-       ret = ceph_crypto_key_unarmor(&xi->secret, ac->secret);
-       if (ret)
-               goto out_nomem;
-
-       xi->starting = true;
-       xi->ticket_handlers = RB_ROOT;
-
-       ac->protocol = CEPH_AUTH_CEPHX;
-       ac->private = xi;
-       ac->ops = &ceph_x_ops;
-       return 0;
-
-out_nomem:
-       kfree(xi);
-out:
-       return ret;
-}
-
-
diff --git a/fs/ceph/auth_x.h b/fs/ceph/auth_x.h

deleted file mode 100644 (file)

index ff6f818..0000000
--- a/fs/ceph/auth_x.h
+++ /dev/null
@@ -1,49 +0,0 @@
-#ifndef _FS_CEPH_AUTH_X_H
-#define _FS_CEPH_AUTH_X_H
-
-#include <linux/rbtree.h>
-
-#include "crypto.h"
-#include "auth.h"
-#include "auth_x_protocol.h"
-
-/*
- * Handle ticket for a single service.
- */
-struct ceph_x_ticket_handler {
-       struct rb_node node;
-       unsigned service;
-
-       struct ceph_crypto_key session_key;
-       struct ceph_timespec validity;
-
-       u64 secret_id;
-       struct ceph_buffer *ticket_blob;
-
-       unsigned long renew_after, expires;
-};
-
-
-struct ceph_x_authorizer {
-       struct ceph_buffer *buf;
-       unsigned service;
-       u64 nonce;
-       char reply_buf[128];  /* big enough for encrypted blob */
-};
-
-struct ceph_x_info {
-       struct ceph_crypto_key secret;
-
-       bool starting;
-       u64 server_challenge;
-
-       unsigned have_keys;
-       struct rb_root ticket_handlers;
-
-       struct ceph_x_authorizer auth_authorizer;
-};
-
-extern int ceph_x_init(struct ceph_auth_client *ac);
-
-#endif
-
diff --git a/fs/ceph/auth_x_protocol.h b/fs/ceph/auth_x_protocol.h

deleted file mode 100644 (file)

index 671d305..0000000
--- a/fs/ceph/auth_x_protocol.h
+++ /dev/null
@@ -1,90 +0,0 @@
-#ifndef __FS_CEPH_AUTH_X_PROTOCOL
-#define __FS_CEPH_AUTH_X_PROTOCOL
-
-#define CEPHX_GET_AUTH_SESSION_KEY      0x0100
-#define CEPHX_GET_PRINCIPAL_SESSION_KEY 0x0200
-#define CEPHX_GET_ROTATING_KEY          0x0400
-
-/* common bits */
-struct ceph_x_ticket_blob {
-       __u8 struct_v;
-       __le64 secret_id;
-       __le32 blob_len;
-       char blob[];
-} __attribute__ ((packed));
-
-
-/* common request/reply headers */
-struct ceph_x_request_header {
-       __le16 op;
-} __attribute__ ((packed));
-
-struct ceph_x_reply_header {
-       __le16 op;
-       __le32 result;
-} __attribute__ ((packed));
-
-
-/* authenticate handshake */
-
-/* initial hello (no reply header) */
-struct ceph_x_server_challenge {
-       __u8 struct_v;
-       __le64 server_challenge;
-} __attribute__ ((packed));
-
-struct ceph_x_authenticate {
-       __u8 struct_v;
-       __le64 client_challenge;
-       __le64 key;
-       /* ticket blob */
-} __attribute__ ((packed));
-
-struct ceph_x_service_ticket_request {
-       __u8 struct_v;
-       __le32 keys;
-} __attribute__ ((packed));
-
-struct ceph_x_challenge_blob {
-       __le64 server_challenge;
-       __le64 client_challenge;
-} __attribute__ ((packed));
-
-
-
-/* authorize handshake */
-
-/*
- * The authorizer consists of two pieces:
- *  a - service id, ticket blob
- *  b - encrypted with session key
- */
-struct ceph_x_authorize_a {
-       __u8 struct_v;
-       __le64 global_id;
-       __le32 service_id;
-       struct ceph_x_ticket_blob ticket_blob;
-} __attribute__ ((packed));
-
-struct ceph_x_authorize_b {
-       __u8 struct_v;
-       __le64 nonce;
-} __attribute__ ((packed));
-
-struct ceph_x_authorize_reply {
-       __u8 struct_v;
-       __le64 nonce_plus_one;
-} __attribute__ ((packed));
-
-
-/*
- * encyption bundle
- */
-#define CEPHX_ENC_MAGIC 0xff009cad8826aa55ull
-
-struct ceph_x_encrypt_header {
-       __u8 struct_v;
-       __le64 magic;
-} __attribute__ ((packed));
-
-#endif
diff --git a/fs/ceph/buffer.c b/fs/ceph/buffer.c

deleted file mode 100644 (file)

index cd39f17..0000000
--- a/fs/ceph/buffer.c
+++ /dev/null
@@ -1,65 +0,0 @@
-
-#include "ceph_debug.h"
-
-#include <linux/slab.h>
-
-#include "buffer.h"
-#include "decode.h"
-
-struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
-{
-       struct ceph_buffer *b;
-
-       b = kmalloc(sizeof(*b), gfp);
-       if (!b)
-               return NULL;
-
-       b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
-       if (b->vec.iov_base) {
-               b->is_vmalloc = false;
-       } else {
-               b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
-               if (!b->vec.iov_base) {
-                       kfree(b);
-                       return NULL;
-               }
-               b->is_vmalloc = true;
-       }
-
-       kref_init(&b->kref);
-       b->alloc_len = len;
-       b->vec.iov_len = len;
-       dout("buffer_new %p\n", b);
-       return b;
-}
-
-void ceph_buffer_release(struct kref *kref)
-{
-       struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref);
-
-       dout("buffer_release %p\n", b);
-       if (b->vec.iov_base) {
-               if (b->is_vmalloc)
-                       vfree(b->vec.iov_base);
-               else
-                       kfree(b->vec.iov_base);
-       }
-       kfree(b);
-}
-
-int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end)
-{
-       size_t len;
-
-       ceph_decode_need(p, end, sizeof(u32), bad);
-       len = ceph_decode_32(p);
-       dout("decode_buffer len %d\n", (int)len);
-       ceph_decode_need(p, end, len, bad);
-       *b = ceph_buffer_new(len, GFP_NOFS);
-       if (!*b)
-               return -ENOMEM;
-       ceph_decode_copy(p, (*b)->vec.iov_base, len);
-       return 0;
-bad:
-       return -EINVAL;
-}
diff --git a/fs/ceph/buffer.h b/fs/ceph/buffer.h

deleted file mode 100644 (file)

index 58d1901..0000000
--- a/fs/ceph/buffer.h
+++ /dev/null
@@ -1,39 +0,0 @@
-#ifndef __FS_CEPH_BUFFER_H
-#define __FS_CEPH_BUFFER_H
-
-#include <linux/kref.h>
-#include <linux/mm.h>
-#include <linux/vmalloc.h>
-#include <linux/types.h>
-#include <linux/uio.h>
-
-/*
- * a simple reference counted buffer.
- *
- * use kmalloc for small sizes (<= one page), vmalloc for larger
- * sizes.
- */
-struct ceph_buffer {
-       struct kref kref;
-       struct kvec vec;
-       size_t alloc_len;
-       bool is_vmalloc;
-};
-
-extern struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp);
-extern void ceph_buffer_release(struct kref *kref);
-
-static inline struct ceph_buffer *ceph_buffer_get(struct ceph_buffer *b)
-{
-       kref_get(&b->kref);
-       return b;
-}
-
-static inline void ceph_buffer_put(struct ceph_buffer *b)
-{
-       kref_put(&b->kref, ceph_buffer_release);
-}
-
-extern int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end);
-
-#endif
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c

index 5e9da99..98ab13e 100644 (file)
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
  
  #include <linux/fs.h>
  #include <linux/kernel.h>
@@ -9,8 +9,9 @@
  #include <linux/writeback.h>
  
  #include "super.h"
-#include "decode.h"
-#include "messenger.h"
+#include "mds_client.h"
+#include <linux/ceph/decode.h>
+#include <linux/ceph/messenger.h>
  
  /*
   * Capability management
@@ -287,11 +288,11 @@ void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap)
         spin_unlock(&mdsc->caps_list_lock);
  }
  
-void ceph_reservation_status(struct ceph_client *client,
+void ceph_reservation_status(struct ceph_fs_client *fsc,
                              int *total, int *avail, int *used, int *reserved,
                              int *min)
  {
-       struct ceph_mds_client *mdsc = &client->mdsc;
+       struct ceph_mds_client *mdsc = fsc->mdsc;
  
         if (total)
                 *total = mdsc->caps_total_count;
@@ -399,7 +400,7 @@ static void __insert_cap_node(struct ceph_inode_info *ci,
  static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
                                struct ceph_inode_info *ci)
  {
-       struct ceph_mount_args *ma = mdsc->client->mount_args;
+       struct ceph_mount_options *ma = mdsc->fsc->mount_options;
  
         ci->i_hold_caps_min = round_jiffies(jiffies +
                                             ma->caps_wanted_delay_min * HZ);
@@ -515,7 +516,7 @@ int ceph_add_cap(struct inode *inode,
                  unsigned seq, unsigned mseq, u64 realmino, int flags,
                  struct ceph_cap_reservation *caps_reservation)
  {
-       struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+       struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
         struct ceph_inode_info *ci = ceph_inode(inode);
         struct ceph_cap *new_cap = NULL;
         struct ceph_cap *cap;
@@ -873,7 +874,7 @@ void __ceph_remove_cap(struct ceph_cap *cap)
         struct ceph_mds_session *session = cap->session;
         struct ceph_inode_info *ci = cap->ci;
         struct ceph_mds_client *mdsc =
-               &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
+               ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
         int removed = 0;
  
         dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
@@ -1210,7 +1211,7 @@ void __ceph_flush_snaps(struct ceph_inode_info *ci,
         int mds;
         struct ceph_cap_snap *capsnap;
         u32 mseq;
-       struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+       struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
         struct ceph_mds_session *session = NULL; /* if session != NULL, we hold
                                                     session->s_mutex */
         u64 next_follows = 0;  /* keep track of how far we've gotten through the
@@ -1336,7 +1337,7 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci)
  void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
  {
         struct ceph_mds_client *mdsc =
-               &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
+               ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
         struct inode *inode = &ci->vfs_inode;
         int was = ci->i_dirty_caps;
         int dirty = 0;
@@ -1378,7 +1379,7 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
  static int __mark_caps_flushing(struct inode *inode,
                                  struct ceph_mds_session *session)
  {
-       struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+       struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
         struct ceph_inode_info *ci = ceph_inode(inode);
         int flushing;
  
@@ -1416,17 +1417,6 @@ static int __mark_caps_flushing(struct inode *inode,
  /*
   * try to invalidate mapping pages without blocking.
   */
-static int mapping_is_empty(struct address_space *mapping)
-{
-       struct page *page = find_get_page(mapping, 0);
-
-       if (!page)
-               return 1;
-
-       put_page(page);
-       return 0;
-}
-
  static int try_nonblocking_invalidate(struct inode *inode)
  {
         struct ceph_inode_info *ci = ceph_inode(inode);
@@ -1436,7 +1426,7 @@ static int try_nonblocking_invalidate(struct inode *inode)
         invalidate_mapping_pages(&inode->i_data, 0, -1);
         spin_lock(&inode->i_lock);
  
-       if (mapping_is_empty(&inode->i_data) &&
+       if (inode->i_data.nrpages == 0 &&
             invalidating_gen == ci->i_rdcache_gen) {
                 /* success. */
                 dout("try_nonblocking_invalidate %p success\n", inode);
@@ -1462,8 +1452,8 @@ static int try_nonblocking_invalidate(struct inode *inode)
  void ceph_check_caps(struct ceph_inode_info *ci, int flags,
                      struct ceph_mds_session *session)
  {
-       struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode);
-       struct ceph_mds_client *mdsc = &client->mdsc;
+       struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
+       struct ceph_mds_client *mdsc = fsc->mdsc;
         struct inode *inode = &ci->vfs_inode;
         struct ceph_cap *cap;
         int file_wanted, used;
@@ -1533,7 +1523,7 @@ retry_locked:
          */
         if ((!is_delayed || mdsc->stopping) &&
             ci->i_wrbuffer_ref == 0 &&               /* no dirty pages... */
-           ci->i_rdcache_gen &&                     /* may have cached pages */
+           inode->i_data.nrpages &&                 /* have cached pages */
             (file_wanted == 0 ||                     /* no open files */
              (revoking & (CEPH_CAP_FILE_CACHE|
                           CEPH_CAP_FILE_LAZYIO))) && /*  or revoking cache */
@@ -1706,7 +1696,7 @@ ack:
  static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
                           unsigned *flush_tid)
  {
-       struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+       struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
         struct ceph_inode_info *ci = ceph_inode(inode);
         int unlock_session = session ? 0 : 1;
         int flushing = 0;
@@ -1872,7 +1862,7 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
                                        caps_are_flushed(inode, flush_tid));
         } else {
                 struct ceph_mds_client *mdsc =
-                       &ceph_sb_to_client(inode->i_sb)->mdsc;
+                       ceph_sb_to_client(inode->i_sb)->mdsc;
  
                 spin_lock(&inode->i_lock);
                 if (__ceph_caps_dirty(ci))
@@ -2465,7 +2455,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
         __releases(inode->i_lock)
  {
         struct ceph_inode_info *ci = ceph_inode(inode);
-       struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+       struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
         unsigned seq = le32_to_cpu(m->seq);
         int dirty = le32_to_cpu(m->dirty);
         int cleaned = 0;
@@ -2713,7 +2703,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
                       struct ceph_msg *msg)
  {
         struct ceph_mds_client *mdsc = session->s_mdsc;
-       struct super_block *sb = mdsc->client->sb;
+       struct super_block *sb = mdsc->fsc->sb;
         struct inode *inode;
         struct ceph_cap *cap;
         struct ceph_mds_caps *h;
diff --git a/fs/ceph/ceph_debug.h b/fs/ceph/ceph_debug.h

deleted file mode 100644 (file)

index 1818c23..0000000
--- a/fs/ceph/ceph_debug.h
+++ /dev/null
@@ -1,37 +0,0 @@
-#ifndef _FS_CEPH_DEBUG_H
-#define _FS_CEPH_DEBUG_H
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#ifdef CONFIG_CEPH_FS_PRETTYDEBUG
-
-/*
- * wrap pr_debug to include a filename:lineno prefix on each line.
- * this incurs some overhead (kernel size and execution time) due to
- * the extra function call at each call site.
- */
-
-# if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)
-extern const char *ceph_file_part(const char *s, int len);
-#  define dout(fmt, ...)                                               \
-       pr_debug(" %12.12s:%-4d : " fmt,                                \
-                ceph_file_part(__FILE__, sizeof(__FILE__)),            \
-                __LINE__, ##__VA_ARGS__)
-# else
-/* faux printk call just to see any compiler warnings. */
-#  define dout(fmt, ...)       do {                            \
-               if (0)                                          \
-                       printk(KERN_DEBUG fmt, ##__VA_ARGS__);  \
-       } while (0)
-# endif
-
-#else
-
-/*
- * or, just wrap pr_debug
- */
-# define dout(fmt, ...)        pr_debug(" " fmt, ##__VA_ARGS__)
-
-#endif
-
-#endif
diff --git a/fs/ceph/ceph_frag.c b/fs/ceph/ceph_frag.c

index ab6cf35..bdce8b1 100644 (file)
--- a/fs/ceph/ceph_frag.c
+++ b/fs/ceph/ceph_frag.c
@@ -1,7 +1,8 @@
  /*
   * Ceph 'frag' type
   */
-#include "types.h"
+#include <linux/module.h>
+#include <linux/ceph/types.h>
  
  int ceph_frag_compare(__u32 a, __u32 b)
  {
diff --git a/fs/ceph/ceph_frag.h b/fs/ceph/ceph_frag.h

deleted file mode 100644 (file)

index 5babb8e..0000000
--- a/fs/ceph/ceph_frag.h
+++ /dev/null
@@ -1,109 +0,0 @@
-#ifndef FS_CEPH_FRAG_H
-#define FS_CEPH_FRAG_H
-
-/*
- * "Frags" are a way to describe a subset of a 32-bit number space,
- * using a mask and a value to match against that mask.  Any given frag
- * (subset of the number space) can be partitioned into 2^n sub-frags.
- *
- * Frags are encoded into a 32-bit word:
- *   8 upper bits = "bits"
- *  24 lower bits = "value"
- * (We could go to 5+27 bits, but who cares.)
- *
- * We use the _most_ significant bits of the 24 bit value.  This makes
- * values logically sort.
- *
- * Unfortunately, because the "bits" field is still in the high bits, we
- * can't sort encoded frags numerically.  However, it does allow you
- * to feed encoded frags as values into frag_contains_value.
- */
-static inline __u32 ceph_frag_make(__u32 b, __u32 v)
-{
-       return (b << 24) |
-               (v & (0xffffffu << (24-b)) & 0xffffffu);
-}
-static inline __u32 ceph_frag_bits(__u32 f)
-{
-       return f >> 24;
-}
-static inline __u32 ceph_frag_value(__u32 f)
-{
-       return f & 0xffffffu;
-}
-static inline __u32 ceph_frag_mask(__u32 f)
-{
-       return (0xffffffu << (24-ceph_frag_bits(f))) & 0xffffffu;
-}
-static inline __u32 ceph_frag_mask_shift(__u32 f)
-{
-       return 24 - ceph_frag_bits(f);
-}
-
-static inline int ceph_frag_contains_value(__u32 f, __u32 v)
-{
-       return (v & ceph_frag_mask(f)) == ceph_frag_value(f);
-}
-static inline int ceph_frag_contains_frag(__u32 f, __u32 sub)
-{
-       /* is sub as specific as us, and contained by us? */
-       return ceph_frag_bits(sub) >= ceph_frag_bits(f) &&
-              (ceph_frag_value(sub) & ceph_frag_mask(f)) == ceph_frag_value(f);
-}
-
-static inline __u32 ceph_frag_parent(__u32 f)
-{
-       return ceph_frag_make(ceph_frag_bits(f) - 1,
-                        ceph_frag_value(f) & (ceph_frag_mask(f) << 1));
-}
-static inline int ceph_frag_is_left_child(__u32 f)
-{
-       return ceph_frag_bits(f) > 0 &&
-               (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 0;
-}
-static inline int ceph_frag_is_right_child(__u32 f)
-{
-       return ceph_frag_bits(f) > 0 &&
-               (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 1;
-}
-static inline __u32 ceph_frag_sibling(__u32 f)
-{
-       return ceph_frag_make(ceph_frag_bits(f),
-                     ceph_frag_value(f) ^ (0x1000000 >> ceph_frag_bits(f)));
-}
-static inline __u32 ceph_frag_left_child(__u32 f)
-{
-       return ceph_frag_make(ceph_frag_bits(f)+1, ceph_frag_value(f));
-}
-static inline __u32 ceph_frag_right_child(__u32 f)
-{
-       return ceph_frag_make(ceph_frag_bits(f)+1,
-             ceph_frag_value(f) | (0x1000000 >> (1+ceph_frag_bits(f))));
-}
-static inline __u32 ceph_frag_make_child(__u32 f, int by, int i)
-{
-       int newbits = ceph_frag_bits(f) + by;
-       return ceph_frag_make(newbits,
-                        ceph_frag_value(f) | (i << (24 - newbits)));
-}
-static inline int ceph_frag_is_leftmost(__u32 f)
-{
-       return ceph_frag_value(f) == 0;
-}
-static inline int ceph_frag_is_rightmost(__u32 f)
-{
-       return ceph_frag_value(f) == ceph_frag_mask(f);
-}
-static inline __u32 ceph_frag_next(__u32 f)
-{
-       return ceph_frag_make(ceph_frag_bits(f),
-                        ceph_frag_value(f) + (0x1000000 >> ceph_frag_bits(f)));
-}
-
-/*
- * comparator to sort frags logically, as when traversing the
- * number space in ascending order...
- */
-int ceph_frag_compare(__u32 a, __u32 b);
-
-#endif
diff --git a/fs/ceph/ceph_fs.c b/fs/ceph/ceph_fs.c

deleted file mode 100644 (file)

index 3ac6cc7..0000000
--- a/fs/ceph/ceph_fs.c
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Some non-inline ceph helpers
- */
-#include "types.h"
-
-/*
- * return true if @layout appears to be valid
- */
-int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
-{
-       __u32 su = le32_to_cpu(layout->fl_stripe_unit);
-       __u32 sc = le32_to_cpu(layout->fl_stripe_count);
-       __u32 os = le32_to_cpu(layout->fl_object_size);
-
-       /* stripe unit, object size must be non-zero, 64k increment */
-       if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1)))
-               return 0;
-       if (!os || (os & (CEPH_MIN_STRIPE_UNIT-1)))
-               return 0;
-       /* object size must be a multiple of stripe unit */
-       if (os < su || os % su)
-               return 0;
-       /* stripe count must be non-zero */
-       if (!sc)
-               return 0;
-       return 1;
-}
-
-
-int ceph_flags_to_mode(int flags)
-{
-       int mode;
-
-#ifdef O_DIRECTORY  /* fixme */
-       if ((flags & O_DIRECTORY) == O_DIRECTORY)
-               return CEPH_FILE_MODE_PIN;
-#endif
-       if ((flags & O_APPEND) == O_APPEND)
-               flags |= O_WRONLY;
-
-       if ((flags & O_ACCMODE) == O_RDWR)
-               mode = CEPH_FILE_MODE_RDWR;
-       else if ((flags & O_ACCMODE) == O_WRONLY)
-               mode = CEPH_FILE_MODE_WR;
-       else
-               mode = CEPH_FILE_MODE_RD;
-
-#ifdef O_LAZY
-       if (flags & O_LAZY)
-               mode |= CEPH_FILE_MODE_LAZY;
-#endif
-
-       return mode;
-}
-
-int ceph_caps_for_mode(int mode)
-{
-       int caps = CEPH_CAP_PIN;
-
-       if (mode & CEPH_FILE_MODE_RD)
-               caps |= CEPH_CAP_FILE_SHARED |
-                       CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE;
-       if (mode & CEPH_FILE_MODE_WR)
-               caps |= CEPH_CAP_FILE_EXCL |
-                       CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
-                       CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
-                       CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
-       if (mode & CEPH_FILE_MODE_LAZY)
-               caps |= CEPH_CAP_FILE_LAZYIO;
-
-       return caps;
-}
diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h

deleted file mode 100644 (file)

index d5619ac..0000000
--- a/fs/ceph/ceph_fs.h
+++ /dev/null
@@ -1,728 +0,0 @@
-/*
- * ceph_fs.h - Ceph constants and data types to share between kernel and
- * user space.
- *
- * Most types in this file are defined as little-endian, and are
- * primarily intended to describe data structures that pass over the
- * wire or that are stored on disk.
- *
- * LGPL2
- */
-
-#ifndef CEPH_FS_H
-#define CEPH_FS_H
-
-#include "msgr.h"
-#include "rados.h"
-
-/*
- * subprotocol versions.  when specific messages types or high-level
- * protocols change, bump the affected components.  we keep rev
- * internal cluster protocols separately from the public,
- * client-facing protocol.
- */
-#define CEPH_OSD_PROTOCOL     8 /* cluster internal */
-#define CEPH_MDS_PROTOCOL    12 /* cluster internal */
-#define CEPH_MON_PROTOCOL     5 /* cluster internal */
-#define CEPH_OSDC_PROTOCOL   24 /* server/client */
-#define CEPH_MDSC_PROTOCOL   32 /* server/client */
-#define CEPH_MONC_PROTOCOL   15 /* server/client */
-
-
-#define CEPH_INO_ROOT  1
-#define CEPH_INO_CEPH  2        /* hidden .ceph dir */
-
-/* arbitrary limit on max # of monitors (cluster of 3 is typical) */
-#define CEPH_MAX_MON   31
-
-
-/*
- * feature bits
- */
-#define CEPH_FEATURE_UID            (1<<0)
-#define CEPH_FEATURE_NOSRCADDR      (1<<1)
-#define CEPH_FEATURE_MONCLOCKCHECK  (1<<2)
-#define CEPH_FEATURE_FLOCK          (1<<3)
-
-
-/*
- * ceph_file_layout - describe data layout for a file/inode
- */
-struct ceph_file_layout {
-       /* file -> object mapping */
-       __le32 fl_stripe_unit;     /* stripe unit, in bytes.  must be multiple
-                                     of page size. */
-       __le32 fl_stripe_count;    /* over this many objects */
-       __le32 fl_object_size;     /* until objects are this big, then move to
-                                     new objects */
-       __le32 fl_cas_hash;        /* 0 = none; 1 = sha256 */
-
-       /* pg -> disk layout */
-       __le32 fl_object_stripe_unit;  /* for per-object parity, if any */
-
-       /* object -> pg layout */
-       __le32 fl_pg_preferred; /* preferred primary for pg (-1 for none) */
-       __le32 fl_pg_pool;      /* namespace, crush ruleset, rep level */
-} __attribute__ ((packed));
-
-#define CEPH_MIN_STRIPE_UNIT 65536
-
-int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
-
-
-/* crypto algorithms */
-#define CEPH_CRYPTO_NONE 0x0
-#define CEPH_CRYPTO_AES  0x1
-
-#define CEPH_AES_IV "cephsageyudagreg"
-
-/* security/authentication protocols */
-#define CEPH_AUTH_UNKNOWN      0x0
-#define CEPH_AUTH_NONE         0x1
-#define CEPH_AUTH_CEPHX                0x2
-
-#define CEPH_AUTH_UID_DEFAULT ((__u64) -1)
-
-
-/*********************************************
- * message layer
- */
-
-/*
- * message types
- */
-
-/* misc */
-#define CEPH_MSG_SHUTDOWN               1
-#define CEPH_MSG_PING                   2
-
-/* client <-> monitor */
-#define CEPH_MSG_MON_MAP                4
-#define CEPH_MSG_MON_GET_MAP            5
-#define CEPH_MSG_STATFS                 13
-#define CEPH_MSG_STATFS_REPLY           14
-#define CEPH_MSG_MON_SUBSCRIBE          15
-#define CEPH_MSG_MON_SUBSCRIBE_ACK      16
-#define CEPH_MSG_AUTH                  17
-#define CEPH_MSG_AUTH_REPLY            18
-
-/* client <-> mds */
-#define CEPH_MSG_MDS_MAP                21
-
-#define CEPH_MSG_CLIENT_SESSION         22
-#define CEPH_MSG_CLIENT_RECONNECT       23
-
-#define CEPH_MSG_CLIENT_REQUEST         24
-#define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
-#define CEPH_MSG_CLIENT_REPLY           26
-#define CEPH_MSG_CLIENT_CAPS            0x310
-#define CEPH_MSG_CLIENT_LEASE           0x311
-#define CEPH_MSG_CLIENT_SNAP            0x312
-#define CEPH_MSG_CLIENT_CAPRELEASE      0x313
-
-/* pool ops */
-#define CEPH_MSG_POOLOP_REPLY           48
-#define CEPH_MSG_POOLOP                 49
-
-
-/* osd */
-#define CEPH_MSG_OSD_MAP          41
-#define CEPH_MSG_OSD_OP           42
-#define CEPH_MSG_OSD_OPREPLY      43
-
-/* pool operations */
-enum {
-  POOL_OP_CREATE                       = 0x01,
-  POOL_OP_DELETE                       = 0x02,
-  POOL_OP_AUID_CHANGE                  = 0x03,
-  POOL_OP_CREATE_SNAP                  = 0x11,
-  POOL_OP_DELETE_SNAP                  = 0x12,
-  POOL_OP_CREATE_UNMANAGED_SNAP                = 0x21,
-  POOL_OP_DELETE_UNMANAGED_SNAP                = 0x22,
-};
-
-struct ceph_mon_request_header {
-       __le64 have_version;
-       __le16 session_mon;
-       __le64 session_mon_tid;
-} __attribute__ ((packed));
-
-struct ceph_mon_statfs {
-       struct ceph_mon_request_header monhdr;
-       struct ceph_fsid fsid;
-} __attribute__ ((packed));
-
-struct ceph_statfs {
-       __le64 kb, kb_used, kb_avail;
-       __le64 num_objects;
-} __attribute__ ((packed));
-
-struct ceph_mon_statfs_reply {
-       struct ceph_fsid fsid;
-       __le64 version;
-       struct ceph_statfs st;
-} __attribute__ ((packed));
-
-const char *ceph_pool_op_name(int op);
-
-struct ceph_mon_poolop {
-       struct ceph_mon_request_header monhdr;
-       struct ceph_fsid fsid;
-       __le32 pool;
-       __le32 op;
-       __le64 auid;
-       __le64 snapid;
-       __le32 name_len;
-} __attribute__ ((packed));
-
-struct ceph_mon_poolop_reply {
-       struct ceph_mon_request_header monhdr;
-       struct ceph_fsid fsid;
-       __le32 reply_code;
-       __le32 epoch;
-       char has_data;
-       char data[0];
-} __attribute__ ((packed));
-
-struct ceph_mon_unmanaged_snap {
-       __le64 snapid;
-} __attribute__ ((packed));
-
-struct ceph_osd_getmap {
-       struct ceph_mon_request_header monhdr;
-       struct ceph_fsid fsid;
-       __le32 start;
-} __attribute__ ((packed));
-
-struct ceph_mds_getmap {
-       struct ceph_mon_request_header monhdr;
-       struct ceph_fsid fsid;
-} __attribute__ ((packed));
-
-struct ceph_client_mount {
-       struct ceph_mon_request_header monhdr;
-} __attribute__ ((packed));
-
-struct ceph_mon_subscribe_item {
-       __le64 have_version;    __le64 have;
-       __u8 onetime;
-} __attribute__ ((packed));
-
-struct ceph_mon_subscribe_ack {
-       __le32 duration;         /* seconds */
-       struct ceph_fsid fsid;
-} __attribute__ ((packed));
-
-/*
- * mds states
- *   > 0 -> in
- *  <= 0 -> out
- */
-#define CEPH_MDS_STATE_DNE          0  /* down, does not exist. */
-#define CEPH_MDS_STATE_STOPPED     -1  /* down, once existed, but no subtrees.
-                                         empty log. */
-#define CEPH_MDS_STATE_BOOT        -4  /* up, boot announcement. */
-#define CEPH_MDS_STATE_STANDBY     -5  /* up, idle.  waiting for assignment. */
-#define CEPH_MDS_STATE_CREATING    -6  /* up, creating MDS instance. */
-#define CEPH_MDS_STATE_STARTING    -7  /* up, starting previously stopped mds */
-#define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */
-
-#define CEPH_MDS_STATE_REPLAY       8  /* up, replaying journal. */
-#define CEPH_MDS_STATE_RESOLVE      9  /* up, disambiguating distributed
-                                         operations (import, rename, etc.) */
-#define CEPH_MDS_STATE_RECONNECT    10 /* up, reconnect to clients */
-#define CEPH_MDS_STATE_REJOIN       11 /* up, rejoining distributed cache */
-#define CEPH_MDS_STATE_CLIENTREPLAY 12 /* up, replaying client operations */
-#define CEPH_MDS_STATE_ACTIVE       13 /* up, active */
-#define CEPH_MDS_STATE_STOPPING     14 /* up, but exporting metadata */
-
-extern const char *ceph_mds_state_name(int s);
-
-
-/*
- * metadata lock types.
- *  - these are bitmasks.. we can compose them
- *  - they also define the lock ordering by the MDS
- *  - a few of these are internal to the mds
- */
-#define CEPH_LOCK_DVERSION    1
-#define CEPH_LOCK_DN          2
-#define CEPH_LOCK_ISNAP       16
-#define CEPH_LOCK_IVERSION    32    /* mds internal */
-#define CEPH_LOCK_IFILE       64
-#define CEPH_LOCK_IAUTH       128
-#define CEPH_LOCK_ILINK       256
-#define CEPH_LOCK_IDFT        512   /* dir frag tree */
-#define CEPH_LOCK_INEST       1024  /* mds internal */
-#define CEPH_LOCK_IXATTR      2048
-#define CEPH_LOCK_IFLOCK      4096  /* advisory file locks */
-#define CEPH_LOCK_INO         8192  /* immutable inode bits; not a lock */
-
-/* client_session ops */
-enum {
-       CEPH_SESSION_REQUEST_OPEN,
-       CEPH_SESSION_OPEN,
-       CEPH_SESSION_REQUEST_CLOSE,
-       CEPH_SESSION_CLOSE,
-       CEPH_SESSION_REQUEST_RENEWCAPS,
-       CEPH_SESSION_RENEWCAPS,
-       CEPH_SESSION_STALE,
-       CEPH_SESSION_RECALL_STATE,
-};
-
-extern const char *ceph_session_op_name(int op);
-
-struct ceph_mds_session_head {
-       __le32 op;
-       __le64 seq;
-       struct ceph_timespec stamp;
-       __le32 max_caps, max_leases;
-} __attribute__ ((packed));
-
-/* client_request */
-/*
- * metadata ops.
- *  & 0x001000 -> write op
- *  & 0x010000 -> follow symlink (e.g. stat(), not lstat()).
- &  & 0x100000 -> use weird ino/path trace
- */
-#define CEPH_MDS_OP_WRITE        0x001000
-enum {
-       CEPH_MDS_OP_LOOKUP     = 0x00100,
-       CEPH_MDS_OP_GETATTR    = 0x00101,
-       CEPH_MDS_OP_LOOKUPHASH = 0x00102,
-       CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
-
-       CEPH_MDS_OP_SETXATTR   = 0x01105,
-       CEPH_MDS_OP_RMXATTR    = 0x01106,
-       CEPH_MDS_OP_SETLAYOUT  = 0x01107,
-       CEPH_MDS_OP_SETATTR    = 0x01108,
-       CEPH_MDS_OP_SETFILELOCK= 0x01109,
-       CEPH_MDS_OP_GETFILELOCK= 0x00110,
-
-       CEPH_MDS_OP_MKNOD      = 0x01201,
-       CEPH_MDS_OP_LINK       = 0x01202,
-       CEPH_MDS_OP_UNLINK     = 0x01203,
-       CEPH_MDS_OP_RENAME     = 0x01204,
-       CEPH_MDS_OP_MKDIR      = 0x01220,
-       CEPH_MDS_OP_RMDIR      = 0x01221,
-       CEPH_MDS_OP_SYMLINK    = 0x01222,
-
-       CEPH_MDS_OP_CREATE     = 0x01301,
-       CEPH_MDS_OP_OPEN       = 0x00302,
-       CEPH_MDS_OP_READDIR    = 0x00305,
-
-       CEPH_MDS_OP_LOOKUPSNAP = 0x00400,
-       CEPH_MDS_OP_MKSNAP     = 0x01400,
-       CEPH_MDS_OP_RMSNAP     = 0x01401,
-       CEPH_MDS_OP_LSSNAP     = 0x00402,
-};
-
-extern const char *ceph_mds_op_name(int op);
-
-
-#define CEPH_SETATTR_MODE   1
-#define CEPH_SETATTR_UID    2
-#define CEPH_SETATTR_GID    4
-#define CEPH_SETATTR_MTIME  8
-#define CEPH_SETATTR_ATIME 16
-#define CEPH_SETATTR_SIZE  32
-#define CEPH_SETATTR_CTIME 64
-
-union ceph_mds_request_args {
-       struct {
-               __le32 mask;                 /* CEPH_CAP_* */
-       } __attribute__ ((packed)) getattr;
-       struct {
-               __le32 mode;
-               __le32 uid;
-               __le32 gid;
-               struct ceph_timespec mtime;
-               struct ceph_timespec atime;
-               __le64 size, old_size;       /* old_size needed by truncate */
-               __le32 mask;                 /* CEPH_SETATTR_* */
-       } __attribute__ ((packed)) setattr;
-       struct {
-               __le32 frag;                 /* which dir fragment */
-               __le32 max_entries;          /* how many dentries to grab */
-               __le32 max_bytes;
-       } __attribute__ ((packed)) readdir;
-       struct {
-               __le32 mode;
-               __le32 rdev;
-       } __attribute__ ((packed)) mknod;
-       struct {
-               __le32 mode;
-       } __attribute__ ((packed)) mkdir;
-       struct {
-               __le32 flags;
-               __le32 mode;
-               __le32 stripe_unit;          /* layout for newly created file */
-               __le32 stripe_count;         /* ... */
-               __le32 object_size;
-               __le32 file_replication;
-               __le32 preferred;
-       } __attribute__ ((packed)) open;
-       struct {
-               __le32 flags;
-       } __attribute__ ((packed)) setxattr;
-       struct {
-               struct ceph_file_layout layout;
-       } __attribute__ ((packed)) setlayout;
-       struct {
-               __u8 rule; /* currently fcntl or flock */
-               __u8 type; /* shared, exclusive, remove*/
-               __le64 pid; /* process id requesting the lock */
-               __le64 pid_namespace;
-               __le64 start; /* initial location to lock */
-               __le64 length; /* num bytes to lock from start */
-               __u8 wait; /* will caller wait for lock to become available? */
-       } __attribute__ ((packed)) filelock_change;
-} __attribute__ ((packed));
-
-#define CEPH_MDS_FLAG_REPLAY        1  /* this is a replayed op */
-#define CEPH_MDS_FLAG_WANT_DENTRY   2  /* want dentry in reply */
-
-struct ceph_mds_request_head {
-       __le64 oldest_client_tid;
-       __le32 mdsmap_epoch;           /* on client */
-       __le32 flags;                  /* CEPH_MDS_FLAG_* */
-       __u8 num_retry, num_fwd;       /* count retry, fwd attempts */
-       __le16 num_releases;           /* # include cap/lease release records */
-       __le32 op;                     /* mds op code */
-       __le32 caller_uid, caller_gid;
-       __le64 ino;                    /* use this ino for openc, mkdir, mknod,
-                                         etc. (if replaying) */
-       union ceph_mds_request_args args;
-} __attribute__ ((packed));
-
-/* cap/lease release record */
-struct ceph_mds_request_release {
-       __le64 ino, cap_id;            /* ino and unique cap id */
-       __le32 caps, wanted;           /* new issued, wanted */
-       __le32 seq, issue_seq, mseq;
-       __le32 dname_seq;              /* if releasing a dentry lease, a */
-       __le32 dname_len;              /* string follows. */
-} __attribute__ ((packed));
-
-/* client reply */
-struct ceph_mds_reply_head {
-       __le32 op;
-       __le32 result;
-       __le32 mdsmap_epoch;
-       __u8 safe;                     /* true if committed to disk */
-       __u8 is_dentry, is_target;     /* true if dentry, target inode records
-                                         are included with reply */
-} __attribute__ ((packed));
-
-/* one for each node split */
-struct ceph_frag_tree_split {
-       __le32 frag;                   /* this frag splits... */
-       __le32 by;                     /* ...by this many bits */
-} __attribute__ ((packed));
-
-struct ceph_frag_tree_head {
-       __le32 nsplits;                /* num ceph_frag_tree_split records */
-       struct ceph_frag_tree_split splits[];
-} __attribute__ ((packed));
-
-/* capability issue, for bundling with mds reply */
-struct ceph_mds_reply_cap {
-       __le32 caps, wanted;           /* caps issued, wanted */
-       __le64 cap_id;
-       __le32 seq, mseq;
-       __le64 realm;                  /* snap realm */
-       __u8 flags;                    /* CEPH_CAP_FLAG_* */
-} __attribute__ ((packed));
-
-#define CEPH_CAP_FLAG_AUTH  1          /* cap is issued by auth mds */
-
-/* inode record, for bundling with mds reply */
-struct ceph_mds_reply_inode {
-       __le64 ino;
-       __le64 snapid;
-       __le32 rdev;
-       __le64 version;                /* inode version */
-       __le64 xattr_version;          /* version for xattr blob */
-       struct ceph_mds_reply_cap cap; /* caps issued for this inode */
-       struct ceph_file_layout layout;
-       struct ceph_timespec ctime, mtime, atime;
-       __le32 time_warp_seq;
-       __le64 size, max_size, truncate_size;
-       __le32 truncate_seq;
-       __le32 mode, uid, gid;
-       __le32 nlink;
-       __le64 files, subdirs, rbytes, rfiles, rsubdirs;  /* dir stats */
-       struct ceph_timespec rctime;
-       struct ceph_frag_tree_head fragtree;  /* (must be at end of struct) */
-} __attribute__ ((packed));
-/* followed by frag array, then symlink string, then xattr blob */
-
-/* reply_lease follows dname, and reply_inode */
-struct ceph_mds_reply_lease {
-       __le16 mask;            /* lease type(s) */
-       __le32 duration_ms;     /* lease duration */
-       __le32 seq;
-} __attribute__ ((packed));
-
-struct ceph_mds_reply_dirfrag {
-       __le32 frag;            /* fragment */
-       __le32 auth;            /* auth mds, if this is a delegation point */
-       __le32 ndist;           /* number of mds' this is replicated on */
-       __le32 dist[];
-} __attribute__ ((packed));
-
-#define CEPH_LOCK_FCNTL    1
-#define CEPH_LOCK_FLOCK    2
-
-#define CEPH_LOCK_SHARED   1
-#define CEPH_LOCK_EXCL     2
-#define CEPH_LOCK_UNLOCK   4
-
-struct ceph_filelock {
-       __le64 start;/* file offset to start lock at */
-       __le64 length; /* num bytes to lock; 0 for all following start */
-       __le64 client; /* which client holds the lock */
-       __le64 pid; /* process id holding the lock on the client */
-       __le64 pid_namespace;
-       __u8 type; /* shared lock, exclusive lock, or unlock */
-} __attribute__ ((packed));
-
-
-/* file access modes */
-#define CEPH_FILE_MODE_PIN        0
-#define CEPH_FILE_MODE_RD         1
-#define CEPH_FILE_MODE_WR         2
-#define CEPH_FILE_MODE_RDWR       3  /* RD | WR */
-#define CEPH_FILE_MODE_LAZY       4  /* lazy io */
-#define CEPH_FILE_MODE_NUM        8  /* bc these are bit fields.. mostly */
-
-int ceph_flags_to_mode(int flags);
-
-
-/* capability bits */
-#define CEPH_CAP_PIN         1  /* no specific capabilities beyond the pin */
-
-/* generic cap bits */
-#define CEPH_CAP_GSHARED     1  /* client can reads */
-#define CEPH_CAP_GEXCL       2  /* client can read and update */
-#define CEPH_CAP_GCACHE      4  /* (file) client can cache reads */
-#define CEPH_CAP_GRD         8  /* (file) client can read */
-#define CEPH_CAP_GWR        16  /* (file) client can write */
-#define CEPH_CAP_GBUFFER    32  /* (file) client can buffer writes */
-#define CEPH_CAP_GWREXTEND  64  /* (file) client can extend EOF */
-#define CEPH_CAP_GLAZYIO   128  /* (file) client can perform lazy io */
-
-/* per-lock shift */
-#define CEPH_CAP_SAUTH      2
-#define CEPH_CAP_SLINK      4
-#define CEPH_CAP_SXATTR     6
-#define CEPH_CAP_SFILE      8
-#define CEPH_CAP_SFLOCK    20 
-
-#define CEPH_CAP_BITS       22
-
-/* composed values */
-#define CEPH_CAP_AUTH_SHARED  (CEPH_CAP_GSHARED  << CEPH_CAP_SAUTH)
-#define CEPH_CAP_AUTH_EXCL     (CEPH_CAP_GEXCL     << CEPH_CAP_SAUTH)
-#define CEPH_CAP_LINK_SHARED  (CEPH_CAP_GSHARED  << CEPH_CAP_SLINK)
-#define CEPH_CAP_LINK_EXCL     (CEPH_CAP_GEXCL     << CEPH_CAP_SLINK)
-#define CEPH_CAP_XATTR_SHARED (CEPH_CAP_GSHARED  << CEPH_CAP_SXATTR)
-#define CEPH_CAP_XATTR_EXCL    (CEPH_CAP_GEXCL     << CEPH_CAP_SXATTR)
-#define CEPH_CAP_FILE(x)    (x << CEPH_CAP_SFILE)
-#define CEPH_CAP_FILE_SHARED   (CEPH_CAP_GSHARED   << CEPH_CAP_SFILE)
-#define CEPH_CAP_FILE_EXCL     (CEPH_CAP_GEXCL     << CEPH_CAP_SFILE)
-#define CEPH_CAP_FILE_CACHE    (CEPH_CAP_GCACHE    << CEPH_CAP_SFILE)
-#define CEPH_CAP_FILE_RD       (CEPH_CAP_GRD       << CEPH_CAP_SFILE)
-#define CEPH_CAP_FILE_WR       (CEPH_CAP_GWR       << CEPH_CAP_SFILE)
-#define CEPH_CAP_FILE_BUFFER   (CEPH_CAP_GBUFFER   << CEPH_CAP_SFILE)
-#define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE)
-#define CEPH_CAP_FILE_LAZYIO   (CEPH_CAP_GLAZYIO   << CEPH_CAP_SFILE)
-#define CEPH_CAP_FLOCK_SHARED  (CEPH_CAP_GSHARED   << CEPH_CAP_SFLOCK)
-#define CEPH_CAP_FLOCK_EXCL    (CEPH_CAP_GEXCL     << CEPH_CAP_SFLOCK)
-
-
-/* cap masks (for getattr) */
-#define CEPH_STAT_CAP_INODE    CEPH_CAP_PIN
-#define CEPH_STAT_CAP_TYPE     CEPH_CAP_PIN  /* mode >> 12 */
-#define CEPH_STAT_CAP_SYMLINK  CEPH_CAP_PIN
-#define CEPH_STAT_CAP_UID      CEPH_CAP_AUTH_SHARED
-#define CEPH_STAT_CAP_GID      CEPH_CAP_AUTH_SHARED
-#define CEPH_STAT_CAP_MODE     CEPH_CAP_AUTH_SHARED
-#define CEPH_STAT_CAP_NLINK    CEPH_CAP_LINK_SHARED
-#define CEPH_STAT_CAP_LAYOUT   CEPH_CAP_FILE_SHARED
-#define CEPH_STAT_CAP_MTIME    CEPH_CAP_FILE_SHARED
-#define CEPH_STAT_CAP_SIZE     CEPH_CAP_FILE_SHARED
-#define CEPH_STAT_CAP_ATIME    CEPH_CAP_FILE_SHARED  /* fixme */
-#define CEPH_STAT_CAP_XATTR    CEPH_CAP_XATTR_SHARED
-#define CEPH_STAT_CAP_INODE_ALL (CEPH_CAP_PIN |                        \
-                                CEPH_CAP_AUTH_SHARED | \
-                                CEPH_CAP_LINK_SHARED | \
-                                CEPH_CAP_FILE_SHARED | \
-                                CEPH_CAP_XATTR_SHARED)
-
-#define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED |                    \
-                             CEPH_CAP_LINK_SHARED |                    \
-                             CEPH_CAP_XATTR_SHARED |                   \
-                             CEPH_CAP_FILE_SHARED)
-#define CEPH_CAP_ANY_RD   (CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_RD |    \
-                          CEPH_CAP_FILE_CACHE)
-
-#define CEPH_CAP_ANY_EXCL (CEPH_CAP_AUTH_EXCL |                \
-                          CEPH_CAP_LINK_EXCL |         \
-                          CEPH_CAP_XATTR_EXCL |        \
-                          CEPH_CAP_FILE_EXCL)
-#define CEPH_CAP_ANY_FILE_WR (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |        \
-                             CEPH_CAP_FILE_EXCL)
-#define CEPH_CAP_ANY_WR   (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR)
-#define CEPH_CAP_ANY      (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \
-                          CEPH_CAP_ANY_FILE_WR | CEPH_CAP_FILE_LAZYIO | \
-                          CEPH_CAP_PIN)
-
-#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \
-                       CEPH_LOCK_IXATTR)
-
-int ceph_caps_for_mode(int mode);
-
-enum {
-       CEPH_CAP_OP_GRANT,         /* mds->client grant */
-       CEPH_CAP_OP_REVOKE,        /* mds->client revoke */
-       CEPH_CAP_OP_TRUNC,         /* mds->client trunc notify */
-       CEPH_CAP_OP_EXPORT,        /* mds has exported the cap */
-       CEPH_CAP_OP_IMPORT,        /* mds has imported the cap */
-       CEPH_CAP_OP_UPDATE,        /* client->mds update */
-       CEPH_CAP_OP_DROP,          /* client->mds drop cap bits */
-       CEPH_CAP_OP_FLUSH,         /* client->mds cap writeback */
-       CEPH_CAP_OP_FLUSH_ACK,     /* mds->client flushed */
-       CEPH_CAP_OP_FLUSHSNAP,     /* client->mds flush snapped metadata */
-       CEPH_CAP_OP_FLUSHSNAP_ACK, /* mds->client flushed snapped metadata */
-       CEPH_CAP_OP_RELEASE,       /* client->mds release (clean) cap */
-       CEPH_CAP_OP_RENEW,         /* client->mds renewal request */
-};
-
-extern const char *ceph_cap_op_name(int op);
-
-/*
- * caps message, used for capability callbacks, acks, requests, etc.
- */
-struct ceph_mds_caps {
-       __le32 op;                  /* CEPH_CAP_OP_* */
-       __le64 ino, realm;
-       __le64 cap_id;
-       __le32 seq, issue_seq;
-       __le32 caps, wanted, dirty; /* latest issued/wanted/dirty */
-       __le32 migrate_seq;
-       __le64 snap_follows;
-       __le32 snap_trace_len;
-
-       /* authlock */
-       __le32 uid, gid, mode;
-
-       /* linklock */
-       __le32 nlink;
-
-       /* xattrlock */
-       __le32 xattr_len;
-       __le64 xattr_version;
-
-       /* filelock */
-       __le64 size, max_size, truncate_size;
-       __le32 truncate_seq;
-       struct ceph_timespec mtime, atime, ctime;
-       struct ceph_file_layout layout;
-       __le32 time_warp_seq;
-} __attribute__ ((packed));
-
-/* cap release msg head */
-struct ceph_mds_cap_release {
-       __le32 num;                /* number of cap_items that follow */
-} __attribute__ ((packed));
-
-struct ceph_mds_cap_item {
-       __le64 ino;
-       __le64 cap_id;
-       __le32 migrate_seq, seq;
-} __attribute__ ((packed));
-
-#define CEPH_MDS_LEASE_REVOKE           1  /*    mds  -> client */
-#define CEPH_MDS_LEASE_RELEASE          2  /* client  -> mds    */
-#define CEPH_MDS_LEASE_RENEW            3  /* client <-> mds    */
-#define CEPH_MDS_LEASE_REVOKE_ACK       4  /* client  -> mds    */
-
-extern const char *ceph_lease_op_name(int o);
-
-/* lease msg header */
-struct ceph_mds_lease {
-       __u8 action;            /* CEPH_MDS_LEASE_* */
-       __le16 mask;            /* which lease */
-       __le64 ino;
-       __le64 first, last;     /* snap range */
-       __le32 seq;
-       __le32 duration_ms;     /* duration of renewal */
-} __attribute__ ((packed));
-/* followed by a __le32+string for dname */
-
-/* client reconnect */
-struct ceph_mds_cap_reconnect {
-       __le64 cap_id;
-       __le32 wanted;
-       __le32 issued;
-       __le64 snaprealm;
-       __le64 pathbase;        /* base ino for our path to this ino */
-       __le32 flock_len;       /* size of flock state blob, if any */
-} __attribute__ ((packed));
-/* followed by flock blob */
-
-struct ceph_mds_cap_reconnect_v1 {
-       __le64 cap_id;
-       __le32 wanted;
-       __le32 issued;
-       __le64 size;
-       struct ceph_timespec mtime, atime;
-       __le64 snaprealm;
-       __le64 pathbase;        /* base ino for our path to this ino */
-} __attribute__ ((packed));
-
-struct ceph_mds_snaprealm_reconnect {
-       __le64 ino;     /* snap realm base */
-       __le64 seq;     /* snap seq for this snap realm */
-       __le64 parent;  /* parent realm */
-} __attribute__ ((packed));
-
-/*
- * snaps
- */
-enum {
-       CEPH_SNAP_OP_UPDATE,  /* CREATE or DESTROY */
-       CEPH_SNAP_OP_CREATE,
-       CEPH_SNAP_OP_DESTROY,
-       CEPH_SNAP_OP_SPLIT,
-};
-
-extern const char *ceph_snap_op_name(int o);
-
-/* snap msg header */
-struct ceph_mds_snap_head {
-       __le32 op;                /* CEPH_SNAP_OP_* */
-       __le64 split;             /* ino to split off, if any */
-       __le32 num_split_inos;    /* # inos belonging to new child realm */
-       __le32 num_split_realms;  /* # child realms udner new child realm */
-       __le32 trace_len;         /* size of snap trace blob */
-} __attribute__ ((packed));
-/* followed by split ino list, then split realms, then the trace blob */
-
-/*
- * encode info about a snaprealm, as viewed by a client
- */
-struct ceph_mds_snap_realm {
-       __le64 ino;           /* ino */
-       __le64 created;       /* snap: when created */
-       __le64 parent;        /* ino: parent realm */
-       __le64 parent_since;  /* snap: same parent since */
-       __le64 seq;           /* snap: version */
-       __le32 num_snaps;
-       __le32 num_prior_parent_snaps;
-} __attribute__ ((packed));
-/* followed by my snap list, then prior parent snap list */
-
-#endif
diff --git a/fs/ceph/ceph_hash.c b/fs/ceph/ceph_hash.c

deleted file mode 100644 (file)

index bd57001..0000000
--- a/fs/ceph/ceph_hash.c
+++ /dev/null
@@ -1,118 +0,0 @@
-
-#include "types.h"
-
-/*
- * Robert Jenkin's hash function.
- * http://burtleburtle.net/bob/hash/evahash.html
- * This is in the public domain.
- */
-#define mix(a, b, c)                                           \
-       do {                                                    \
-               a = a - b;  a = a - c;  a = a ^ (c >> 13);      \
-               b = b - c;  b = b - a;  b = b ^ (a << 8);       \
-               c = c - a;  c = c - b;  c = c ^ (b >> 13);      \
-               a = a - b;  a = a - c;  a = a ^ (c >> 12);      \
-               b = b - c;  b = b - a;  b = b ^ (a << 16);      \
-               c = c - a;  c = c - b;  c = c ^ (b >> 5);       \
-               a = a - b;  a = a - c;  a = a ^ (c >> 3);       \
-               b = b - c;  b = b - a;  b = b ^ (a << 10);      \
-               c = c - a;  c = c - b;  c = c ^ (b >> 15);      \
-       } while (0)
-
-unsigned ceph_str_hash_rjenkins(const char *str, unsigned length)
-{
-       const unsigned char *k = (const unsigned char *)str;
-       __u32 a, b, c;  /* the internal state */
-       __u32 len;      /* how many key bytes still need mixing */
-
-       /* Set up the internal state */
-       len = length;
-       a = 0x9e3779b9;      /* the golden ratio; an arbitrary value */
-       b = a;
-       c = 0;               /* variable initialization of internal state */
-
-       /* handle most of the key */
-       while (len >= 12) {
-               a = a + (k[0] + ((__u32)k[1] << 8) + ((__u32)k[2] << 16) +
-                        ((__u32)k[3] << 24));
-               b = b + (k[4] + ((__u32)k[5] << 8) + ((__u32)k[6] << 16) +
-                        ((__u32)k[7] << 24));
-               c = c + (k[8] + ((__u32)k[9] << 8) + ((__u32)k[10] << 16) +
-                        ((__u32)k[11] << 24));
-               mix(a, b, c);
-               k = k + 12;
-               len = len - 12;
-       }
-
-       /* handle the last 11 bytes */
-       c = c + length;
-       switch (len) {            /* all the case statements fall through */
-       case 11:
-               c = c + ((__u32)k[10] << 24);
-       case 10:
-               c = c + ((__u32)k[9] << 16);
-       case 9:
-               c = c + ((__u32)k[8] << 8);
-               /* the first byte of c is reserved for the length */
-       case 8:
-               b = b + ((__u32)k[7] << 24);
-       case 7:
-               b = b + ((__u32)k[6] << 16);
-       case 6:
-               b = b + ((__u32)k[5] << 8);
-       case 5:
-               b = b + k[4];
-       case 4:
-               a = a + ((__u32)k[3] << 24);
-       case 3:
-               a = a + ((__u32)k[2] << 16);
-       case 2:
-               a = a + ((__u32)k[1] << 8);
-       case 1:
-               a = a + k[0];
-               /* case 0: nothing left to add */
-       }
-       mix(a, b, c);
-
-       return c;
-}
-
-/*
- * linux dcache hash
- */
-unsigned ceph_str_hash_linux(const char *str, unsigned length)
-{
-       unsigned long hash = 0;
-       unsigned char c;
-
-       while (length--) {
-               c = *str++;
-               hash = (hash + (c << 4) + (c >> 4)) * 11;
-       }
-       return hash;
-}
-
-
-unsigned ceph_str_hash(int type, const char *s, unsigned len)
-{
-       switch (type) {
-       case CEPH_STR_HASH_LINUX:
-               return ceph_str_hash_linux(s, len);
-       case CEPH_STR_HASH_RJENKINS:
-               return ceph_str_hash_rjenkins(s, len);
-       default:
-               return -1;
-       }
-}
-
-const char *ceph_str_hash_name(int type)
-{
-       switch (type) {
-       case CEPH_STR_HASH_LINUX:
-               return "linux";
-       case CEPH_STR_HASH_RJENKINS:
-               return "rjenkins";
-       default:
-               return "unknown";
-       }
-}
diff --git a/fs/ceph/ceph_hash.h b/fs/ceph/ceph_hash.h

deleted file mode 100644 (file)

index d099c3f..0000000
--- a/fs/ceph/ceph_hash.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef FS_CEPH_HASH_H
-#define FS_CEPH_HASH_H
-
-#define CEPH_STR_HASH_LINUX      0x1  /* linux dcache hash */
-#define CEPH_STR_HASH_RJENKINS   0x2  /* robert jenkins' */
-
-extern unsigned ceph_str_hash_linux(const char *s, unsigned len);
-extern unsigned ceph_str_hash_rjenkins(const char *s, unsigned len);
-
-extern unsigned ceph_str_hash(int type, const char *s, unsigned len);
-extern const char *ceph_str_hash_name(int type);
-
-#endif
diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/ceph_strings.c

deleted file mode 100644 (file)

index c6179d3..0000000
--- a/fs/ceph/ceph_strings.c
+++ /dev/null
@@ -1,193 +0,0 @@
-/*
- * Ceph string constants
- */
-#include "types.h"
-
-const char *ceph_entity_type_name(int type)
-{
-       switch (type) {
-       case CEPH_ENTITY_TYPE_MDS: return "mds";
-       case CEPH_ENTITY_TYPE_OSD: return "osd";
-       case CEPH_ENTITY_TYPE_MON: return "mon";
-       case CEPH_ENTITY_TYPE_CLIENT: return "client";
-       case CEPH_ENTITY_TYPE_AUTH: return "auth";
-       default: return "unknown";
-       }
-}
-
-const char *ceph_osd_op_name(int op)
-{
-       switch (op) {
-       case CEPH_OSD_OP_READ: return "read";
-       case CEPH_OSD_OP_STAT: return "stat";
-
-       case CEPH_OSD_OP_MASKTRUNC: return "masktrunc";
-
-       case CEPH_OSD_OP_WRITE: return "write";
-       case CEPH_OSD_OP_DELETE: return "delete";
-       case CEPH_OSD_OP_TRUNCATE: return "truncate";
-       case CEPH_OSD_OP_ZERO: return "zero";
-       case CEPH_OSD_OP_WRITEFULL: return "writefull";
-       case CEPH_OSD_OP_ROLLBACK: return "rollback";
-
-       case CEPH_OSD_OP_APPEND: return "append";
-       case CEPH_OSD_OP_STARTSYNC: return "startsync";
-       case CEPH_OSD_OP_SETTRUNC: return "settrunc";
-       case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc";
-
-       case CEPH_OSD_OP_TMAPUP: return "tmapup";
-       case CEPH_OSD_OP_TMAPGET: return "tmapget";
-       case CEPH_OSD_OP_TMAPPUT: return "tmapput";
-
-       case CEPH_OSD_OP_GETXATTR: return "getxattr";
-       case CEPH_OSD_OP_GETXATTRS: return "getxattrs";
-       case CEPH_OSD_OP_SETXATTR: return "setxattr";
-       case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
-       case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
-       case CEPH_OSD_OP_RMXATTR: return "rmxattr";
-       case CEPH_OSD_OP_CMPXATTR: return "cmpxattr";
-
-       case CEPH_OSD_OP_PULL: return "pull";
-       case CEPH_OSD_OP_PUSH: return "push";
-       case CEPH_OSD_OP_BALANCEREADS: return "balance-reads";
-       case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads";
-       case CEPH_OSD_OP_SCRUB: return "scrub";
-
-       case CEPH_OSD_OP_WRLOCK: return "wrlock";
-       case CEPH_OSD_OP_WRUNLOCK: return "wrunlock";
-       case CEPH_OSD_OP_RDLOCK: return "rdlock";
-       case CEPH_OSD_OP_RDUNLOCK: return "rdunlock";
-       case CEPH_OSD_OP_UPLOCK: return "uplock";
-       case CEPH_OSD_OP_DNLOCK: return "dnlock";
-
-       case CEPH_OSD_OP_CALL: return "call";
-
-       case CEPH_OSD_OP_PGLS: return "pgls";
-       }
-       return "???";
-}
-
-const char *ceph_mds_state_name(int s)
-{
-       switch (s) {
-               /* down and out */
-       case CEPH_MDS_STATE_DNE:        return "down:dne";
-       case CEPH_MDS_STATE_STOPPED:    return "down:stopped";
-               /* up and out */
-       case CEPH_MDS_STATE_BOOT:       return "up:boot";
-       case CEPH_MDS_STATE_STANDBY:    return "up:standby";
-       case CEPH_MDS_STATE_STANDBY_REPLAY:    return "up:standby-replay";
-       case CEPH_MDS_STATE_CREATING:   return "up:creating";
-       case CEPH_MDS_STATE_STARTING:   return "up:starting";
-               /* up and in */
-       case CEPH_MDS_STATE_REPLAY:     return "up:replay";
-       case CEPH_MDS_STATE_RESOLVE:    return "up:resolve";
-       case CEPH_MDS_STATE_RECONNECT:  return "up:reconnect";
-       case CEPH_MDS_STATE_REJOIN:     return "up:rejoin";
-       case CEPH_MDS_STATE_CLIENTREPLAY: return "up:clientreplay";
-       case CEPH_MDS_STATE_ACTIVE:     return "up:active";
-       case CEPH_MDS_STATE_STOPPING:   return "up:stopping";
-       }
-       return "???";
-}
-
-const char *ceph_session_op_name(int op)
-{
-       switch (op) {
-       case CEPH_SESSION_REQUEST_OPEN: return "request_open";
-       case CEPH_SESSION_OPEN: return "open";
-       case CEPH_SESSION_REQUEST_CLOSE: return "request_close";
-       case CEPH_SESSION_CLOSE: return "close";
-       case CEPH_SESSION_REQUEST_RENEWCAPS: return "request_renewcaps";
-       case CEPH_SESSION_RENEWCAPS: return "renewcaps";
-       case CEPH_SESSION_STALE: return "stale";
-       case CEPH_SESSION_RECALL_STATE: return "recall_state";
-       }
-       return "???";
-}
-
-const char *ceph_mds_op_name(int op)
-{
-       switch (op) {
-       case CEPH_MDS_OP_LOOKUP:  return "lookup";
-       case CEPH_MDS_OP_LOOKUPHASH:  return "lookuphash";
-       case CEPH_MDS_OP_LOOKUPPARENT:  return "lookupparent";
-       case CEPH_MDS_OP_GETATTR:  return "getattr";
-       case CEPH_MDS_OP_SETXATTR: return "setxattr";
-       case CEPH_MDS_OP_SETATTR: return "setattr";
-       case CEPH_MDS_OP_RMXATTR: return "rmxattr";
-       case CEPH_MDS_OP_READDIR: return "readdir";
-       case CEPH_MDS_OP_MKNOD: return "mknod";
-       case CEPH_MDS_OP_LINK: return "link";
-       case CEPH_MDS_OP_UNLINK: return "unlink";
-       case CEPH_MDS_OP_RENAME: return "rename";
-       case CEPH_MDS_OP_MKDIR: return "mkdir";
-       case CEPH_MDS_OP_RMDIR: return "rmdir";
-       case CEPH_MDS_OP_SYMLINK: return "symlink";
-       case CEPH_MDS_OP_CREATE: return "create";
-       case CEPH_MDS_OP_OPEN: return "open";
-       case CEPH_MDS_OP_LOOKUPSNAP: return "lookupsnap";
-       case CEPH_MDS_OP_LSSNAP: return "lssnap";
-       case CEPH_MDS_OP_MKSNAP: return "mksnap";
-       case CEPH_MDS_OP_RMSNAP: return "rmsnap";
-       case CEPH_MDS_OP_SETFILELOCK: return "setfilelock";
-       case CEPH_MDS_OP_GETFILELOCK: return "getfilelock";
-       }
-       return "???";
-}
-
-const char *ceph_cap_op_name(int op)
-{
-       switch (op) {
-       case CEPH_CAP_OP_GRANT: return "grant";
-       case CEPH_CAP_OP_REVOKE: return "revoke";
-       case CEPH_CAP_OP_TRUNC: return "trunc";
-       case CEPH_CAP_OP_EXPORT: return "export";
-       case CEPH_CAP_OP_IMPORT: return "import";
-       case CEPH_CAP_OP_UPDATE: return "update";
-       case CEPH_CAP_OP_DROP: return "drop";
-       case CEPH_CAP_OP_FLUSH: return "flush";
-       case CEPH_CAP_OP_FLUSH_ACK: return "flush_ack";
-       case CEPH_CAP_OP_FLUSHSNAP: return "flushsnap";
-       case CEPH_CAP_OP_FLUSHSNAP_ACK: return "flushsnap_ack";
-       case CEPH_CAP_OP_RELEASE: return "release";
-       case CEPH_CAP_OP_RENEW: return "renew";
-       }
-       return "???";
-}
-
-const char *ceph_lease_op_name(int o)
-{
-       switch (o) {
-       case CEPH_MDS_LEASE_REVOKE: return "revoke";
-       case CEPH_MDS_LEASE_RELEASE: return "release";
-       case CEPH_MDS_LEASE_RENEW: return "renew";
-       case CEPH_MDS_LEASE_REVOKE_ACK: return "revoke_ack";
-       }
-       return "???";
-}
-
-const char *ceph_snap_op_name(int o)
-{
-       switch (o) {
-       case CEPH_SNAP_OP_UPDATE: return "update";
-       case CEPH_SNAP_OP_CREATE: return "create";
-       case CEPH_SNAP_OP_DESTROY: return "destroy";
-       case CEPH_SNAP_OP_SPLIT: return "split";
-       }
-       return "???";
-}
-
-const char *ceph_pool_op_name(int op)
-{
-       switch (op) {
-       case POOL_OP_CREATE: return "create";
-       case POOL_OP_DELETE: return "delete";
-       case POOL_OP_AUID_CHANGE: return "auid change";
-       case POOL_OP_CREATE_SNAP: return "create snap";
-       case POOL_OP_DELETE_SNAP: return "delete snap";
-       case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap";
-       case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap";
-       }
-       return "???";
-}
diff --git a/fs/ceph/crush/crush.c b/fs/ceph/crush/crush.c

deleted file mode 100644 (file)

index fabd302..0000000
--- a/fs/ceph/crush/crush.c
+++ /dev/null
@@ -1,151 +0,0 @@
-
-#ifdef __KERNEL__
-# include <linux/slab.h>
-#else
-# include <stdlib.h>
-# include <assert.h>
-# define kfree(x) do { if (x) free(x); } while (0)
-# define BUG_ON(x) assert(!(x))
-#endif
-
-#include "crush.h"
-
-const char *crush_bucket_alg_name(int alg)
-{
-       switch (alg) {
-       case CRUSH_BUCKET_UNIFORM: return "uniform";
-       case CRUSH_BUCKET_LIST: return "list";
-       case CRUSH_BUCKET_TREE: return "tree";
-       case CRUSH_BUCKET_STRAW: return "straw";
-       default: return "unknown";
-       }
-}
-
-/**
- * crush_get_bucket_item_weight - Get weight of an item in given bucket
- * @b: bucket pointer
- * @p: item index in bucket
- */
-int crush_get_bucket_item_weight(struct crush_bucket *b, int p)
-{
-       if (p >= b->size)
-               return 0;
-
-       switch (b->alg) {
-       case CRUSH_BUCKET_UNIFORM:
-               return ((struct crush_bucket_uniform *)b)->item_weight;
-       case CRUSH_BUCKET_LIST:
-               return ((struct crush_bucket_list *)b)->item_weights[p];
-       case CRUSH_BUCKET_TREE:
-               if (p & 1)
-                       return ((struct crush_bucket_tree *)b)->node_weights[p];
-               return 0;
-       case CRUSH_BUCKET_STRAW:
-               return ((struct crush_bucket_straw *)b)->item_weights[p];
-       }
-       return 0;
-}
-
-/**
- * crush_calc_parents - Calculate parent vectors for the given crush map.
- * @map: crush_map pointer
- */
-void crush_calc_parents(struct crush_map *map)
-{
-       int i, b, c;
-
-       for (b = 0; b < map->max_buckets; b++) {
-               if (map->buckets[b] == NULL)
-                       continue;
-               for (i = 0; i < map->buckets[b]->size; i++) {
-                       c = map->buckets[b]->items[i];
-                       BUG_ON(c >= map->max_devices ||
-                              c < -map->max_buckets);
-                       if (c >= 0)
-                               map->device_parents[c] = map->buckets[b]->id;
-                       else
-                               map->bucket_parents[-1-c] = map->buckets[b]->id;
-               }
-       }
-}
-
-void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
-{
-       kfree(b->h.perm);
-       kfree(b->h.items);
-       kfree(b);
-}
-
-void crush_destroy_bucket_list(struct crush_bucket_list *b)
-{
-       kfree(b->item_weights);
-       kfree(b->sum_weights);
-       kfree(b->h.perm);
-       kfree(b->h.items);
-       kfree(b);
-}
-
-void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
-{
-       kfree(b->node_weights);
-       kfree(b);
-}
-
-void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
-{
-       kfree(b->straws);
-       kfree(b->item_weights);
-       kfree(b->h.perm);
-       kfree(b->h.items);
-       kfree(b);
-}
-
-void crush_destroy_bucket(struct crush_bucket *b)
-{
-       switch (b->alg) {
-       case CRUSH_BUCKET_UNIFORM:
-               crush_destroy_bucket_uniform((struct crush_bucket_uniform *)b);
-               break;
-       case CRUSH_BUCKET_LIST:
-               crush_destroy_bucket_list((struct crush_bucket_list *)b);
-               break;
-       case CRUSH_BUCKET_TREE:
-               crush_destroy_bucket_tree((struct crush_bucket_tree *)b);
-               break;
-       case CRUSH_BUCKET_STRAW:
-               crush_destroy_bucket_straw((struct crush_bucket_straw *)b);
-               break;
-       }
-}
-
-/**
- * crush_destroy - Destroy a crush_map
- * @map: crush_map pointer
- */
-void crush_destroy(struct crush_map *map)
-{
-       int b;
-
-       /* buckets */
-       if (map->buckets) {
-               for (b = 0; b < map->max_buckets; b++) {
-                       if (map->buckets[b] == NULL)
-                               continue;
-                       crush_destroy_bucket(map->buckets[b]);
-               }
-               kfree(map->buckets);
-       }
-
-       /* rules */
-       if (map->rules) {
-               for (b = 0; b < map->max_rules; b++)
-                       kfree(map->rules[b]);
-               kfree(map->rules);
-       }
-
-       kfree(map->bucket_parents);
-       kfree(map->device_parents);
-       kfree(map);
-}
-
-
diff --git a/fs/ceph/crush/crush.h b/fs/ceph/crush/crush.h

deleted file mode 100644 (file)

index 97e435b..0000000
--- a/fs/ceph/crush/crush.h
+++ /dev/null
@@ -1,180 +0,0 @@
-#ifndef CEPH_CRUSH_CRUSH_H
-#define CEPH_CRUSH_CRUSH_H
-
-#include <linux/types.h>
-
-/*
- * CRUSH is a pseudo-random data distribution algorithm that
- * efficiently distributes input values (typically, data objects)
- * across a heterogeneous, structured storage cluster.
- *
- * The algorithm was originally described in detail in this paper
- * (although the algorithm has evolved somewhat since then):
- *
- *     http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf
- *
- * LGPL2
- */
-
-
-#define CRUSH_MAGIC 0x00010000ul   /* for detecting algorithm revisions */
-
-
-#define CRUSH_MAX_DEPTH 10  /* max crush hierarchy depth */
-#define CRUSH_MAX_SET   10  /* max size of a mapping result */
-
-
-/*
- * CRUSH uses user-defined "rules" to describe how inputs should be
- * mapped to devices.  A rule consists of sequence of steps to perform
- * to generate the set of output devices.
- */
-struct crush_rule_step {
-       __u32 op;
-       __s32 arg1;
-       __s32 arg2;
-};
-
-/* step op codes */
-enum {
-       CRUSH_RULE_NOOP = 0,
-       CRUSH_RULE_TAKE = 1,          /* arg1 = value to start with */
-       CRUSH_RULE_CHOOSE_FIRSTN = 2, /* arg1 = num items to pick */
-                                     /* arg2 = type */
-       CRUSH_RULE_CHOOSE_INDEP = 3,  /* same */
-       CRUSH_RULE_EMIT = 4,          /* no args */
-       CRUSH_RULE_CHOOSE_LEAF_FIRSTN = 6,
-       CRUSH_RULE_CHOOSE_LEAF_INDEP = 7,
-};
-
-/*
- * for specifying choose num (arg1) relative to the max parameter
- * passed to do_rule
- */
-#define CRUSH_CHOOSE_N            0
-#define CRUSH_CHOOSE_N_MINUS(x)   (-(x))
-
-/*
- * The rule mask is used to describe what the rule is intended for.
- * Given a ruleset and size of output set, we search through the
- * rule list for a matching rule_mask.
- */
-struct crush_rule_mask {
-       __u8 ruleset;
-       __u8 type;
-       __u8 min_size;
-       __u8 max_size;
-};
-
-struct crush_rule {
-       __u32 len;
-       struct crush_rule_mask mask;
-       struct crush_rule_step steps[0];
-};
-
-#define crush_rule_size(len) (sizeof(struct crush_rule) + \
-                             (len)*sizeof(struct crush_rule_step))
-
-
-
-/*
- * A bucket is a named container of other items (either devices or
- * other buckets).  Items within a bucket are chosen using one of a
- * few different algorithms.  The table summarizes how the speed of
- * each option measures up against mapping stability when items are
- * added or removed.
- *
- *  Bucket Alg     Speed       Additions    Removals
- *  ------------------------------------------------
- *  uniform         O(1)       poor         poor
- *  list            O(n)       optimal      poor
- *  tree            O(log n)   good         good
- *  straw           O(n)       optimal      optimal
- */
-enum {
-       CRUSH_BUCKET_UNIFORM = 1,
-       CRUSH_BUCKET_LIST = 2,
-       CRUSH_BUCKET_TREE = 3,
-       CRUSH_BUCKET_STRAW = 4
-};
-extern const char *crush_bucket_alg_name(int alg);
-
-struct crush_bucket {
-       __s32 id;        /* this'll be negative */
-       __u16 type;      /* non-zero; type=0 is reserved for devices */
-       __u8 alg;        /* one of CRUSH_BUCKET_* */
-       __u8 hash;       /* which hash function to use, CRUSH_HASH_* */
-       __u32 weight;    /* 16-bit fixed point */
-       __u32 size;      /* num items */
-       __s32 *items;
-
-       /*
-        * cached random permutation: used for uniform bucket and for
-        * the linear search fallback for the other bucket types.
-        */
-       __u32 perm_x;  /* @x for which *perm is defined */
-       __u32 perm_n;  /* num elements of *perm that are permuted/defined */
-       __u32 *perm;
-};
-
-struct crush_bucket_uniform {
-       struct crush_bucket h;
-       __u32 item_weight;  /* 16-bit fixed point; all items equally weighted */
-};
-
-struct crush_bucket_list {
-       struct crush_bucket h;
-       __u32 *item_weights;  /* 16-bit fixed point */
-       __u32 *sum_weights;   /* 16-bit fixed point.  element i is sum
-                                of weights 0..i, inclusive */
-};
-
-struct crush_bucket_tree {
-       struct crush_bucket h;  /* note: h.size is _tree_ size, not number of
-                                  actual items */
-       __u8 num_nodes;
-       __u32 *node_weights;
-};
-
-struct crush_bucket_straw {
-       struct crush_bucket h;
-       __u32 *item_weights;   /* 16-bit fixed point */
-       __u32 *straws;         /* 16-bit fixed point */
-};
-
-
-
-/*
- * CRUSH map includes all buckets, rules, etc.
- */
-struct crush_map {
-       struct crush_bucket **buckets;
-       struct crush_rule **rules;
-
-       /*
-        * Parent pointers to identify the parent bucket a device or
-        * bucket in the hierarchy.  If an item appears more than
-        * once, this is the _last_ time it appeared (where buckets
-        * are processed in bucket id order, from -1 on down to
-        * -max_buckets.
-        */
-       __u32 *bucket_parents;
-       __u32 *device_parents;
-
-       __s32 max_buckets;
-       __u32 max_rules;
-       __s32 max_devices;
-};
-
-
-/* crush.c */
-extern int crush_get_bucket_item_weight(struct crush_bucket *b, int pos);
-extern void crush_calc_parents(struct crush_map *map);
-extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b);
-extern void crush_destroy_bucket_list(struct crush_bucket_list *b);
-extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b);
-extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b);
-extern void crush_destroy_bucket(struct crush_bucket *b);
-extern void crush_destroy(struct crush_map *map);
-
-#endif
diff --git a/fs/ceph/crush/hash.c b/fs/ceph/crush/hash.c

deleted file mode 100644 (file)

index 5873aed..0000000
--- a/fs/ceph/crush/hash.c
+++ /dev/null
@@ -1,149 +0,0 @@
-
-#include <linux/types.h>
-#include "hash.h"
-
-/*
- * Robert Jenkins' function for mixing 32-bit values
- * http://burtleburtle.net/bob/hash/evahash.html
- * a, b = random bits, c = input and output
- */
-#define crush_hashmix(a, b, c) do {                    \
-               a = a-b;  a = a-c;  a = a^(c>>13);      \
-               b = b-c;  b = b-a;  b = b^(a<<8);       \
-               c = c-a;  c = c-b;  c = c^(b>>13);      \
-               a = a-b;  a = a-c;  a = a^(c>>12);      \
-               b = b-c;  b = b-a;  b = b^(a<<16);      \
-               c = c-a;  c = c-b;  c = c^(b>>5);       \
-               a = a-b;  a = a-c;  a = a^(c>>3);       \
-               b = b-c;  b = b-a;  b = b^(a<<10);      \
-               c = c-a;  c = c-b;  c = c^(b>>15);      \
-       } while (0)
-
-#define crush_hash_seed 1315423911
-
-static __u32 crush_hash32_rjenkins1(__u32 a)
-{
-       __u32 hash = crush_hash_seed ^ a;
-       __u32 b = a;
-       __u32 x = 231232;
-       __u32 y = 1232;
-       crush_hashmix(b, x, hash);
-       crush_hashmix(y, a, hash);
-       return hash;
-}
-
-static __u32 crush_hash32_rjenkins1_2(__u32 a, __u32 b)
-{
-       __u32 hash = crush_hash_seed ^ a ^ b;
-       __u32 x = 231232;
-       __u32 y = 1232;
-       crush_hashmix(a, b, hash);
-       crush_hashmix(x, a, hash);
-       crush_hashmix(b, y, hash);
-       return hash;
-}
-
-static __u32 crush_hash32_rjenkins1_3(__u32 a, __u32 b, __u32 c)
-{
-       __u32 hash = crush_hash_seed ^ a ^ b ^ c;
-       __u32 x = 231232;
-       __u32 y = 1232;
-       crush_hashmix(a, b, hash);
-       crush_hashmix(c, x, hash);
-       crush_hashmix(y, a, hash);
-       crush_hashmix(b, x, hash);
-       crush_hashmix(y, c, hash);
-       return hash;
-}
-
-static __u32 crush_hash32_rjenkins1_4(__u32 a, __u32 b, __u32 c, __u32 d)
-{
-       __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d;
-       __u32 x = 231232;
-       __u32 y = 1232;
-       crush_hashmix(a, b, hash);
-       crush_hashmix(c, d, hash);
-       crush_hashmix(a, x, hash);
-       crush_hashmix(y, b, hash);
-       crush_hashmix(c, x, hash);
-       crush_hashmix(y, d, hash);
-       return hash;
-}
-
-static __u32 crush_hash32_rjenkins1_5(__u32 a, __u32 b, __u32 c, __u32 d,
-                                     __u32 e)
-{
-       __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d ^ e;
-       __u32 x = 231232;
-       __u32 y = 1232;
-       crush_hashmix(a, b, hash);
-       crush_hashmix(c, d, hash);
-       crush_hashmix(e, x, hash);
-       crush_hashmix(y, a, hash);
-       crush_hashmix(b, x, hash);
-       crush_hashmix(y, c, hash);
-       crush_hashmix(d, x, hash);
-       crush_hashmix(y, e, hash);
-       return hash;
-}
-
-
-__u32 crush_hash32(int type, __u32 a)
-{
-       switch (type) {
-       case CRUSH_HASH_RJENKINS1:
-               return crush_hash32_rjenkins1(a);
-       default:
-               return 0;
-       }
-}
-
-__u32 crush_hash32_2(int type, __u32 a, __u32 b)
-{
-       switch (type) {
-       case CRUSH_HASH_RJENKINS1:
-               return crush_hash32_rjenkins1_2(a, b);
-       default:
-               return 0;
-       }
-}
-
-__u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c)
-{
-       switch (type) {
-       case CRUSH_HASH_RJENKINS1:
-               return crush_hash32_rjenkins1_3(a, b, c);
-       default:
-               return 0;
-       }
-}
-
-__u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d)
-{
-       switch (type) {
-       case CRUSH_HASH_RJENKINS1:
-               return crush_hash32_rjenkins1_4(a, b, c, d);
-       default:
-               return 0;
-       }
-}
-
-__u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d, __u32 e)
-{
-       switch (type) {
-       case CRUSH_HASH_RJENKINS1:
-               return crush_hash32_rjenkins1_5(a, b, c, d, e);
-       default:
-               return 0;
-       }
-}
-
-const char *crush_hash_name(int type)
-{
-       switch (type) {
-       case CRUSH_HASH_RJENKINS1:
-               return "rjenkins1";
-       default:
-               return "unknown";
-       }
-}
diff --git a/fs/ceph/crush/hash.h b/fs/ceph/crush/hash.h

deleted file mode 100644 (file)

index 91e8842..0000000
--- a/fs/ceph/crush/hash.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#ifndef CEPH_CRUSH_HASH_H
-#define CEPH_CRUSH_HASH_H
-
-#define CRUSH_HASH_RJENKINS1   0
-
-#define CRUSH_HASH_DEFAULT CRUSH_HASH_RJENKINS1
-
-extern const char *crush_hash_name(int type);
-
-extern __u32 crush_hash32(int type, __u32 a);
-extern __u32 crush_hash32_2(int type, __u32 a, __u32 b);
-extern __u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c);
-extern __u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d);
-extern __u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d,
-                           __u32 e);
-
-#endif
diff --git a/fs/ceph/crush/mapper.c b/fs/ceph/crush/mapper.c

deleted file mode 100644 (file)

index a4eec13..0000000
--- a/fs/ceph/crush/mapper.c
+++ /dev/null
@@ -1,609 +0,0 @@
-
-#ifdef __KERNEL__
-# include <linux/string.h>
-# include <linux/slab.h>
-# include <linux/bug.h>
-# include <linux/kernel.h>
-# ifndef dprintk
-#  define dprintk(args...)
-# endif
-#else
-# include <string.h>
-# include <stdio.h>
-# include <stdlib.h>
-# include <assert.h>
-# define BUG_ON(x) assert(!(x))
-# define dprintk(args...) /* printf(args) */
-# define kmalloc(x, f) malloc(x)
-# define kfree(x) free(x)
-#endif
-
-#include "crush.h"
-#include "hash.h"
-
-/*
- * Implement the core CRUSH mapping algorithm.
- */
-
-/**
- * crush_find_rule - find a crush_rule id for a given ruleset, type, and size.
- * @map: the crush_map
- * @ruleset: the storage ruleset id (user defined)
- * @type: storage ruleset type (user defined)
- * @size: output set size
- */
-int crush_find_rule(struct crush_map *map, int ruleset, int type, int size)
-{
-       int i;
-
-       for (i = 0; i < map->max_rules; i++) {
-               if (map->rules[i] &&
-                   map->rules[i]->mask.ruleset == ruleset &&
-                   map->rules[i]->mask.type == type &&
-                   map->rules[i]->mask.min_size <= size &&
-                   map->rules[i]->mask.max_size >= size)
-                       return i;
-       }
-       return -1;
-}
-
-
-/*
- * bucket choose methods
- *
- * For each bucket algorithm, we have a "choose" method that, given a
- * crush input @x and replica position (usually, position in output set) @r,
- * will produce an item in the bucket.
- */
-
-/*
- * Choose based on a random permutation of the bucket.
- *
- * We used to use some prime number arithmetic to do this, but it
- * wasn't very random, and had some other bad behaviors.  Instead, we
- * calculate an actual random permutation of the bucket members.
- * Since this is expensive, we optimize for the r=0 case, which
- * captures the vast majority of calls.
- */
-static int bucket_perm_choose(struct crush_bucket *bucket,
-                             int x, int r)
-{
-       unsigned pr = r % bucket->size;
-       unsigned i, s;
-
-       /* start a new permutation if @x has changed */
-       if (bucket->perm_x != x || bucket->perm_n == 0) {
-               dprintk("bucket %d new x=%d\n", bucket->id, x);
-               bucket->perm_x = x;
-
-               /* optimize common r=0 case */
-               if (pr == 0) {
-                       s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
-                               bucket->size;
-                       bucket->perm[0] = s;
-                       bucket->perm_n = 0xffff;   /* magic value, see below */
-                       goto out;
-               }
-
-               for (i = 0; i < bucket->size; i++)
-                       bucket->perm[i] = i;
-               bucket->perm_n = 0;
-       } else if (bucket->perm_n == 0xffff) {
-               /* clean up after the r=0 case above */
-               for (i = 1; i < bucket->size; i++)
-                       bucket->perm[i] = i;
-               bucket->perm[bucket->perm[0]] = 0;
-               bucket->perm_n = 1;
-       }
-
-       /* calculate permutation up to pr */
-       for (i = 0; i < bucket->perm_n; i++)
-               dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);
-       while (bucket->perm_n <= pr) {
-               unsigned p = bucket->perm_n;
-               /* no point in swapping the final entry */
-               if (p < bucket->size - 1) {
-                       i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
-                               (bucket->size - p);
-                       if (i) {
-                               unsigned t = bucket->perm[p + i];
-                               bucket->perm[p + i] = bucket->perm[p];
-                               bucket->perm[p] = t;
-                       }
-                       dprintk(" perm_choose swap %d with %d\n", p, p+i);
-               }
-               bucket->perm_n++;
-       }
-       for (i = 0; i < bucket->size; i++)
-               dprintk(" perm_choose  %d: %d\n", i, bucket->perm[i]);
-
-       s = bucket->perm[pr];
-out:
-       dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
-               bucket->size, x, r, pr, s);
-       return bucket->items[s];
-}
-
-/* uniform */
-static int bucket_uniform_choose(struct crush_bucket_uniform *bucket,
-                                int x, int r)
-{
-       return bucket_perm_choose(&bucket->h, x, r);
-}
-
-/* list */
-static int bucket_list_choose(struct crush_bucket_list *bucket,
-                             int x, int r)
-{
-       int i;
-
-       for (i = bucket->h.size-1; i >= 0; i--) {
-               __u64 w = crush_hash32_4(bucket->h.hash,x, bucket->h.items[i],
-                                        r, bucket->h.id);
-               w &= 0xffff;
-               dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
-                       "sw %x rand %llx",
-                       i, x, r, bucket->h.items[i], bucket->item_weights[i],
-                       bucket->sum_weights[i], w);
-               w *= bucket->sum_weights[i];
-               w = w >> 16;
-               /*dprintk(" scaled %llx\n", w);*/
-               if (w < bucket->item_weights[i])
-                       return bucket->h.items[i];
-       }
-
-       BUG_ON(1);
-       return 0;
-}
-
-
-/* (binary) tree */
-static int height(int n)
-{
-       int h = 0;
-       while ((n & 1) == 0) {
-               h++;
-               n = n >> 1;
-       }
-       return h;
-}
-
-static int left(int x)
-{
-       int h = height(x);
-       return x - (1 << (h-1));
-}
-
-static int right(int x)
-{
-       int h = height(x);
-       return x + (1 << (h-1));
-}
-
-static int terminal(int x)
-{
-       return x & 1;
-}
-
-static int bucket_tree_choose(struct crush_bucket_tree *bucket,
-                             int x, int r)
-{
-       int n, l;
-       __u32 w;
-       __u64 t;
-
-       /* start at root */
-       n = bucket->num_nodes >> 1;
-
-       while (!terminal(n)) {
-               /* pick point in [0, w) */
-               w = bucket->node_weights[n];
-               t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r,
-                                         bucket->h.id) * (__u64)w;
-               t = t >> 32;
-
-               /* descend to the left or right? */
-               l = left(n);
-               if (t < bucket->node_weights[l])
-                       n = l;
-               else
-                       n = right(n);
-       }
-
-       return bucket->h.items[n >> 1];
-}
-
-
-/* straw */
-
-static int bucket_straw_choose(struct crush_bucket_straw *bucket,
-                              int x, int r)
-{
-       int i;
-       int high = 0;
-       __u64 high_draw = 0;
-       __u64 draw;
-
-       for (i = 0; i < bucket->h.size; i++) {
-               draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r);
-               draw &= 0xffff;
-               draw *= bucket->straws[i];
-               if (i == 0 || draw > high_draw) {
-                       high = i;
-                       high_draw = draw;
-               }
-       }
-       return bucket->h.items[high];
-}
-
-static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
-{
-       dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
-       switch (in->alg) {
-       case CRUSH_BUCKET_UNIFORM:
-               return bucket_uniform_choose((struct crush_bucket_uniform *)in,
-                                         x, r);
-       case CRUSH_BUCKET_LIST:
-               return bucket_list_choose((struct crush_bucket_list *)in,
-                                         x, r);
-       case CRUSH_BUCKET_TREE:
-               return bucket_tree_choose((struct crush_bucket_tree *)in,
-                                         x, r);
-       case CRUSH_BUCKET_STRAW:
-               return bucket_straw_choose((struct crush_bucket_straw *)in,
-                                          x, r);
-       default:
-               BUG_ON(1);
-               return in->items[0];
-       }
-}
-
-/*
- * true if device is marked "out" (failed, fully offloaded)
- * of the cluster
- */
-static int is_out(struct crush_map *map, __u32 *weight, int item, int x)
-{
-       if (weight[item] >= 0x10000)
-               return 0;
-       if (weight[item] == 0)
-               return 1;
-       if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff)
-           < weight[item])
-               return 0;
-       return 1;
-}
-
-/**
- * crush_choose - choose numrep distinct items of given type
- * @map: the crush_map
- * @bucket: the bucket we are choose an item from
- * @x: crush input value
- * @numrep: the number of items to choose
- * @type: the type of item to choose
- * @out: pointer to output vector
- * @outpos: our position in that vector
- * @firstn: true if choosing "first n" items, false if choosing "indep"
- * @recurse_to_leaf: true if we want one device under each item of given type
- * @out2: second output vector for leaf items (if @recurse_to_leaf)
- */
-static int crush_choose(struct crush_map *map,
-                       struct crush_bucket *bucket,
-                       __u32 *weight,
-                       int x, int numrep, int type,
-                       int *out, int outpos,
-                       int firstn, int recurse_to_leaf,
-                       int *out2)
-{
-       int rep;
-       int ftotal, flocal;
-       int retry_descent, retry_bucket, skip_rep;
-       struct crush_bucket *in = bucket;
-       int r;
-       int i;
-       int item = 0;
-       int itemtype;
-       int collide, reject;
-       const int orig_tries = 5; /* attempts before we fall back to search */
-
-       dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
-               bucket->id, x, outpos, numrep);
-
-       for (rep = outpos; rep < numrep; rep++) {
-               /* keep trying until we get a non-out, non-colliding item */
-               ftotal = 0;
-               skip_rep = 0;
-               do {
-                       retry_descent = 0;
-                       in = bucket;               /* initial bucket */
-
-                       /* choose through intervening buckets */
-                       flocal = 0;
-                       do {
-                               collide = 0;
-                               retry_bucket = 0;
-                               r = rep;
-                               if (in->alg == CRUSH_BUCKET_UNIFORM) {
-                                       /* be careful */
-                                       if (firstn || numrep >= in->size)
-                                               /* r' = r + f_total */
-                                               r += ftotal;
-                                       else if (in->size % numrep == 0)
-                                               /* r'=r+(n+1)*f_local */
-                                               r += (numrep+1) *
-                                                       (flocal+ftotal);
-                                       else
-                                               /* r' = r + n*f_local */
-                                               r += numrep * (flocal+ftotal);
-                               } else {
-                                       if (firstn)
-                                               /* r' = r + f_total */
-                                               r += ftotal;
-                                       else
-                                               /* r' = r + n*f_local */
-                                               r += numrep * (flocal+ftotal);
-                               }
-
-                               /* bucket choose */
-                               if (in->size == 0) {
-                                       reject = 1;
-                                       goto reject;
-                               }
-                               if (flocal >= (in->size>>1) &&
-                                   flocal > orig_tries)
-                                       item = bucket_perm_choose(in, x, r);
-                               else
-                                       item = crush_bucket_choose(in, x, r);
-                               BUG_ON(item >= map->max_devices);
-
-                               /* desired type? */
-                               if (item < 0)
-                                       itemtype = map->buckets[-1-item]->type;
-                               else
-                                       itemtype = 0;
-                               dprintk("  item %d type %d\n", item, itemtype);
-
-                               /* keep going? */
-                               if (itemtype != type) {
-                                       BUG_ON(item >= 0 ||
-                                              (-1-item) >= map->max_buckets);
-                                       in = map->buckets[-1-item];
-                                       retry_bucket = 1;
-                                       continue;
-                               }
-
-                               /* collision? */
-                               for (i = 0; i < outpos; i++) {
-                                       if (out[i] == item) {
-                                               collide = 1;
-                                               break;
-                                       }
-                               }
-
-                               reject = 0;
-                               if (recurse_to_leaf) {
-                                       if (item < 0) {
-                                               if (crush_choose(map,
-                                                        map->buckets[-1-item],
-                                                        weight,
-                                                        x, outpos+1, 0,
-                                                        out2, outpos,
-                                                        firstn, 0,
-                                                        NULL) <= outpos)
-                                                       /* didn't get leaf */
-                                                       reject = 1;
-                                       } else {
-                                               /* we already have a leaf! */
-                                               out2[outpos] = item;
-                                       }
-                               }
-
-                               if (!reject) {
-                                       /* out? */
-                                       if (itemtype == 0)
-                                               reject = is_out(map, weight,
-                                                               item, x);
-                                       else
-                                               reject = 0;
-                               }
-
-reject:
-                               if (reject || collide) {
-                                       ftotal++;
-                                       flocal++;
-
-                                       if (collide && flocal < 3)
-                                               /* retry locally a few times */
-                                               retry_bucket = 1;
-                                       else if (flocal < in->size + orig_tries)
-                                               /* exhaustive bucket search */
-                                               retry_bucket = 1;
-                                       else if (ftotal < 20)
-                                               /* then retry descent */
-                                               retry_descent = 1;
-                                       else
-                                               /* else give up */
-                                               skip_rep = 1;
-                                       dprintk("  reject %d  collide %d  "
-                                               "ftotal %d  flocal %d\n",
-                                               reject, collide, ftotal,
-                                               flocal);
-                               }
-                       } while (retry_bucket);
-               } while (retry_descent);
-
-               if (skip_rep) {
-                       dprintk("skip rep\n");
-                       continue;
-               }
-
-               dprintk("CHOOSE got %d\n", item);
-               out[outpos] = item;
-               outpos++;
-       }
-
-       dprintk("CHOOSE returns %d\n", outpos);
-       return outpos;
-}
-
-
-/**
- * crush_do_rule - calculate a mapping with the given input and rule
- * @map: the crush_map
- * @ruleno: the rule id
- * @x: hash input
- * @result: pointer to result vector
- * @result_max: maximum result size
- * @force: force initial replica choice; -1 for none
- */
-int crush_do_rule(struct crush_map *map,
-                 int ruleno, int x, int *result, int result_max,
-                 int force, __u32 *weight)
-{
-       int result_len;
-       int force_context[CRUSH_MAX_DEPTH];
-       int force_pos = -1;
-       int a[CRUSH_MAX_SET];
-       int b[CRUSH_MAX_SET];
-       int c[CRUSH_MAX_SET];
-       int recurse_to_leaf;
-       int *w;
-       int wsize = 0;
-       int *o;
-       int osize;
-       int *tmp;
-       struct crush_rule *rule;
-       int step;
-       int i, j;
-       int numrep;
-       int firstn;
-       int rc = -1;
-
-       BUG_ON(ruleno >= map->max_rules);
-
-       rule = map->rules[ruleno];
-       result_len = 0;
-       w = a;
-       o = b;
-
-       /*
-        * determine hierarchical context of force, if any.  note
-        * that this may or may not correspond to the specific types
-        * referenced by the crush rule.
-        */
-       if (force >= 0) {
-               if (force >= map->max_devices ||
-                   map->device_parents[force] == 0) {
-                       /*dprintk("CRUSH: forcefed device dne\n");*/
-                       rc = -1;  /* force fed device dne */
-                       goto out;
-               }
-               if (!is_out(map, weight, force, x)) {
-                       while (1) {
-                               force_context[++force_pos] = force;
-                               if (force >= 0)
-                                       force = map->device_parents[force];
-                               else
-                                       force = map->bucket_parents[-1-force];
-                               if (force == 0)
-                                       break;
-                       }
-               }
-       }
-
-       for (step = 0; step < rule->len; step++) {
-               firstn = 0;
-               switch (rule->steps[step].op) {
-               case CRUSH_RULE_TAKE:
-                       w[0] = rule->steps[step].arg1;
-                       if (force_pos >= 0) {
-                               BUG_ON(force_context[force_pos] != w[0]);
-                               force_pos--;
-                       }
-                       wsize = 1;
-                       break;
-
-               case CRUSH_RULE_CHOOSE_LEAF_FIRSTN:
-               case CRUSH_RULE_CHOOSE_FIRSTN:
-                       firstn = 1;
-               case CRUSH_RULE_CHOOSE_LEAF_INDEP:
-               case CRUSH_RULE_CHOOSE_INDEP:
-                       BUG_ON(wsize == 0);
-
-                       recurse_to_leaf =
-                               rule->steps[step].op ==
-                                CRUSH_RULE_CHOOSE_LEAF_FIRSTN ||
-                               rule->steps[step].op ==
-                               CRUSH_RULE_CHOOSE_LEAF_INDEP;
-
-                       /* reset output */
-                       osize = 0;
-
-                       for (i = 0; i < wsize; i++) {
-                               /*
-                                * see CRUSH_N, CRUSH_N_MINUS macros.
-                                * basically, numrep <= 0 means relative to
-                                * the provided result_max
-                                */
-                               numrep = rule->steps[step].arg1;
-                               if (numrep <= 0) {
-                                       numrep += result_max;
-                                       if (numrep <= 0)
-                                               continue;
-                               }
-                               j = 0;
-                               if (osize == 0 && force_pos >= 0) {
-                                       /* skip any intermediate types */
-                                       while (force_pos &&
-                                              force_context[force_pos] < 0 &&
-                                              rule->steps[step].arg2 !=
-                                              map->buckets[-1 -
-                                              force_context[force_pos]]->type)
-                                               force_pos--;
-                                       o[osize] = force_context[force_pos];
-                                       if (recurse_to_leaf)
-                                               c[osize] = force_context[0];
-                                       j++;
-                                       force_pos--;
-                               }
-                               osize += crush_choose(map,
-                                                     map->buckets[-1-w[i]],
-                                                     weight,
-                                                     x, numrep,
-                                                     rule->steps[step].arg2,
-                                                     o+osize, j,
-                                                     firstn,
-                                                     recurse_to_leaf, c+osize);
-                       }
-
-                       if (recurse_to_leaf)
-                               /* copy final _leaf_ values to output set */
-                               memcpy(o, c, osize*sizeof(*o));
-
-                       /* swap t and w arrays */
-                       tmp = o;
-                       o = w;
-                       w = tmp;
-                       wsize = osize;
-                       break;
-
-
-               case CRUSH_RULE_EMIT:
-                       for (i = 0; i < wsize && result_len < result_max; i++) {
-                               result[result_len] = w[i];
-                               result_len++;
-                       }
-                       wsize = 0;
-                       break;
-
-               default:
-                       BUG_ON(1);
-               }
-       }
-       rc = result_len;
-
-out:
-       return rc;
-}
-
-
diff --git a/fs/ceph/crush/mapper.h b/fs/ceph/crush/mapper.h

deleted file mode 100644 (file)

index c46b99c..0000000
--- a/fs/ceph/crush/mapper.h
+++ /dev/null
@@ -1,20 +0,0 @@
-#ifndef CEPH_CRUSH_MAPPER_H
-#define CEPH_CRUSH_MAPPER_H
-
-/*
- * CRUSH functions for find rules and then mapping an input to an
- * output set.
- *
- * LGPL2
- */
-
-#include "crush.h"
-
-extern int crush_find_rule(struct crush_map *map, int pool, int type, int size);
-extern int crush_do_rule(struct crush_map *map,
-                        int ruleno,
-                        int x, int *result, int result_max,
-                        int forcefeed,    /* -1 for none */
-                        __u32 *weights);
-
-#endif
diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c

deleted file mode 100644 (file)

index a3e627f..0000000
--- a/fs/ceph/crypto.c
+++ /dev/null
@@ -1,412 +0,0 @@
-
-#include "ceph_debug.h"
-
-#include <linux/err.h>
-#include <linux/scatterlist.h>
-#include <linux/slab.h>
-#include <crypto/hash.h>
-
-#include "crypto.h"
-#include "decode.h"
-
-int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end)
-{
-       if (*p + sizeof(u16) + sizeof(key->created) +
-           sizeof(u16) + key->len > end)
-               return -ERANGE;
-       ceph_encode_16(p, key->type);
-       ceph_encode_copy(p, &key->created, sizeof(key->created));
-       ceph_encode_16(p, key->len);
-       ceph_encode_copy(p, key->key, key->len);
-       return 0;
-}
-
-int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end)
-{
-       ceph_decode_need(p, end, 2*sizeof(u16) + sizeof(key->created), bad);
-       key->type = ceph_decode_16(p);
-       ceph_decode_copy(p, &key->created, sizeof(key->created));
-       key->len = ceph_decode_16(p);
-       ceph_decode_need(p, end, key->len, bad);
-       key->key = kmalloc(key->len, GFP_NOFS);
-       if (!key->key)
-               return -ENOMEM;
-       ceph_decode_copy(p, key->key, key->len);
-       return 0;
-
-bad:
-       dout("failed to decode crypto key\n");
-       return -EINVAL;
-}
-
-int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey)
-{
-       int inlen = strlen(inkey);
-       int blen = inlen * 3 / 4;
-       void *buf, *p;
-       int ret;
-
-       dout("crypto_key_unarmor %s\n", inkey);
-       buf = kmalloc(blen, GFP_NOFS);
-       if (!buf)
-               return -ENOMEM;
-       blen = ceph_unarmor(buf, inkey, inkey+inlen);
-       if (blen < 0) {
-               kfree(buf);
-               return blen;
-       }
-
-       p = buf;
-       ret = ceph_crypto_key_decode(key, &p, p + blen);
-       kfree(buf);
-       if (ret)
-               return ret;
-       dout("crypto_key_unarmor key %p type %d len %d\n", key,
-            key->type, key->len);
-       return 0;
-}
-
-
-
-#define AES_KEY_SIZE 16
-
-static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void)
-{
-       return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
-}
-
-static const u8 *aes_iv = (u8 *)CEPH_AES_IV;
-
-static int ceph_aes_encrypt(const void *key, int key_len,
-                           void *dst, size_t *dst_len,
-                           const void *src, size_t src_len)
-{
-       struct scatterlist sg_in[2], sg_out[1];
-       struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
-       struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
-       int ret;
-       void *iv;
-       int ivsize;
-       size_t zero_padding = (0x10 - (src_len & 0x0f));
-       char pad[16];
-
-       if (IS_ERR(tfm))
-               return PTR_ERR(tfm);
-
-       memset(pad, zero_padding, zero_padding);
-
-       *dst_len = src_len + zero_padding;
-
-       crypto_blkcipher_setkey((void *)tfm, key, key_len);
-       sg_init_table(sg_in, 2);
-       sg_set_buf(&sg_in[0], src, src_len);
-       sg_set_buf(&sg_in[1], pad, zero_padding);
-       sg_init_table(sg_out, 1);
-       sg_set_buf(sg_out, dst, *dst_len);
-       iv = crypto_blkcipher_crt(tfm)->iv;
-       ivsize = crypto_blkcipher_ivsize(tfm);
-
-       memcpy(iv, aes_iv, ivsize);
-       /*
-       print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
-                      key, key_len, 1);
-       print_hex_dump(KERN_ERR, "enc src: ", DUMP_PREFIX_NONE, 16, 1,
-                       src, src_len, 1);
-       print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
-                       pad, zero_padding, 1);
-       */
-       ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
-                                    src_len + zero_padding);
-       crypto_free_blkcipher(tfm);
-       if (ret < 0)
-               pr_err("ceph_aes_crypt failed %d\n", ret);
-       /*
-       print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
-                      dst, *dst_len, 1);
-       */
-       return 0;
-}
-
-static int ceph_aes_encrypt2(const void *key, int key_len, void *dst,
-                            size_t *dst_len,
-                            const void *src1, size_t src1_len,
-                            const void *src2, size_t src2_len)
-{
-       struct scatterlist sg_in[3], sg_out[1];
-       struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
-       struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
-       int ret;
-       void *iv;
-       int ivsize;
-       size_t zero_padding = (0x10 - ((src1_len + src2_len) & 0x0f));
-       char pad[16];
-
-       if (IS_ERR(tfm))
-               return PTR_ERR(tfm);
-
-       memset(pad, zero_padding, zero_padding);
-
-       *dst_len = src1_len + src2_len + zero_padding;
-
-       crypto_blkcipher_setkey((void *)tfm, key, key_len);
-       sg_init_table(sg_in, 3);
-       sg_set_buf(&sg_in[0], src1, src1_len);
-       sg_set_buf(&sg_in[1], src2, src2_len);
-       sg_set_buf(&sg_in[2], pad, zero_padding);
-       sg_init_table(sg_out, 1);
-       sg_set_buf(sg_out, dst, *dst_len);
-       iv = crypto_blkcipher_crt(tfm)->iv;
-       ivsize = crypto_blkcipher_ivsize(tfm);
-
-       memcpy(iv, aes_iv, ivsize);
-       /*
-       print_hex_dump(KERN_ERR, "enc  key: ", DUMP_PREFIX_NONE, 16, 1,
-                      key, key_len, 1);
-       print_hex_dump(KERN_ERR, "enc src1: ", DUMP_PREFIX_NONE, 16, 1,
-                       src1, src1_len, 1);
-       print_hex_dump(KERN_ERR, "enc src2: ", DUMP_PREFIX_NONE, 16, 1,
-                       src2, src2_len, 1);
-       print_hex_dump(KERN_ERR, "enc  pad: ", DUMP_PREFIX_NONE, 16, 1,
-                       pad, zero_padding, 1);
-       */
-       ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
-                                    src1_len + src2_len + zero_padding);
-       crypto_free_blkcipher(tfm);
-       if (ret < 0)
-               pr_err("ceph_aes_crypt2 failed %d\n", ret);
-       /*
-       print_hex_dump(KERN_ERR, "enc  out: ", DUMP_PREFIX_NONE, 16, 1,
-                      dst, *dst_len, 1);
-       */
-       return 0;
-}
-
-static int ceph_aes_decrypt(const void *key, int key_len,
-                           void *dst, size_t *dst_len,
-                           const void *src, size_t src_len)
-{
-       struct scatterlist sg_in[1], sg_out[2];
-       struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
-       struct blkcipher_desc desc = { .tfm = tfm };
-       char pad[16];
-       void *iv;
-       int ivsize;
-       int ret;
-       int last_byte;
-
-       if (IS_ERR(tfm))
-               return PTR_ERR(tfm);
-
-       crypto_blkcipher_setkey((void *)tfm, key, key_len);
-       sg_init_table(sg_in, 1);
-       sg_init_table(sg_out, 2);
-       sg_set_buf(sg_in, src, src_len);
-       sg_set_buf(&sg_out[0], dst, *dst_len);
-       sg_set_buf(&sg_out[1], pad, sizeof(pad));
-
-       iv = crypto_blkcipher_crt(tfm)->iv;
-       ivsize = crypto_blkcipher_ivsize(tfm);
-
-       memcpy(iv, aes_iv, ivsize);
-
-       /*
-       print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
-                      key, key_len, 1);
-       print_hex_dump(KERN_ERR, "dec  in: ", DUMP_PREFIX_NONE, 16, 1,
-                      src, src_len, 1);
-       */
-
-       ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
-       crypto_free_blkcipher(tfm);
-       if (ret < 0) {
-               pr_err("ceph_aes_decrypt failed %d\n", ret);
-               return ret;
-       }
-
-       if (src_len <= *dst_len)
-               last_byte = ((char *)dst)[src_len - 1];
-       else
-               last_byte = pad[src_len - *dst_len - 1];
-       if (last_byte <= 16 && src_len >= last_byte) {
-               *dst_len = src_len - last_byte;
-       } else {
-               pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
-                      last_byte, (int)src_len);
-               return -EPERM;  /* bad padding */
-       }
-       /*
-       print_hex_dump(KERN_ERR, "dec out: ", DUMP_PREFIX_NONE, 16, 1,
-                      dst, *dst_len, 1);
-       */
-       return 0;
-}
-
-static int ceph_aes_decrypt2(const void *key, int key_len,
-                            void *dst1, size_t *dst1_len,
-                            void *dst2, size_t *dst2_len,
-                            const void *src, size_t src_len)
-{
-       struct scatterlist sg_in[1], sg_out[3];
-       struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
-       struct blkcipher_desc desc = { .tfm = tfm };
-       char pad[16];
-       void *iv;
-       int ivsize;
-       int ret;
-       int last_byte;
-
-       if (IS_ERR(tfm))
-               return PTR_ERR(tfm);
-
-       sg_init_table(sg_in, 1);
-       sg_set_buf(sg_in, src, src_len);
-       sg_init_table(sg_out, 3);
-       sg_set_buf(&sg_out[0], dst1, *dst1_len);
-       sg_set_buf(&sg_out[1], dst2, *dst2_len);
-       sg_set_buf(&sg_out[2], pad, sizeof(pad));
-
-       crypto_blkcipher_setkey((void *)tfm, key, key_len);
-       iv = crypto_blkcipher_crt(tfm)->iv;
-       ivsize = crypto_blkcipher_ivsize(tfm);
-
-       memcpy(iv, aes_iv, ivsize);
-
-       /*
-       print_hex_dump(KERN_ERR, "dec  key: ", DUMP_PREFIX_NONE, 16, 1,
-                      key, key_len, 1);
-       print_hex_dump(KERN_ERR, "dec   in: ", DUMP_PREFIX_NONE, 16, 1,
-                      src, src_len, 1);
-       */
-
-       ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
-       crypto_free_blkcipher(tfm);
-       if (ret < 0) {
-               pr_err("ceph_aes_decrypt failed %d\n", ret);
-               return ret;
-       }
-
-       if (src_len <= *dst1_len)
-               last_byte = ((char *)dst1)[src_len - 1];
-       else if (src_len <= *dst1_len + *dst2_len)
-               last_byte = ((char *)dst2)[src_len - *dst1_len - 1];
-       else
-               last_byte = pad[src_len - *dst1_len - *dst2_len - 1];
-       if (last_byte <= 16 && src_len >= last_byte) {
-               src_len -= last_byte;
-       } else {
-               pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
-                      last_byte, (int)src_len);
-               return -EPERM;  /* bad padding */
-       }
-
-       if (src_len < *dst1_len) {
-               *dst1_len = src_len;
-               *dst2_len = 0;
-       } else {
-               *dst2_len = src_len - *dst1_len;
-       }
-       /*
-       print_hex_dump(KERN_ERR, "dec  out1: ", DUMP_PREFIX_NONE, 16, 1,
-                      dst1, *dst1_len, 1);
-       print_hex_dump(KERN_ERR, "dec  out2: ", DUMP_PREFIX_NONE, 16, 1,
-                      dst2, *dst2_len, 1);
-       */
-
-       return 0;
-}
-
-
-int ceph_decrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
-                const void *src, size_t src_len)
-{
-       switch (secret->type) {
-       case CEPH_CRYPTO_NONE:
-               if (*dst_len < src_len)
-                       return -ERANGE;
-               memcpy(dst, src, src_len);
-               *dst_len = src_len;
-               return 0;
-
-       case CEPH_CRYPTO_AES:
-               return ceph_aes_decrypt(secret->key, secret->len, dst,
-                                       dst_len, src, src_len);
-
-       default:
-               return -EINVAL;
-       }
-}
-
-int ceph_decrypt2(struct ceph_crypto_key *secret,
-                       void *dst1, size_t *dst1_len,
-                       void *dst2, size_t *dst2_len,
-                       const void *src, size_t src_len)
-{
-       size_t t;
-
-       switch (secret->type) {
-       case CEPH_CRYPTO_NONE:
-               if (*dst1_len + *dst2_len < src_len)
-                       return -ERANGE;
-               t = min(*dst1_len, src_len);
-               memcpy(dst1, src, t);
-               *dst1_len = t;
-               src += t;
-               src_len -= t;
-               if (src_len) {
-                       t = min(*dst2_len, src_len);
-                       memcpy(dst2, src, t);
-                       *dst2_len = t;
-               }
-               return 0;
-
-       case CEPH_CRYPTO_AES:
-               return ceph_aes_decrypt2(secret->key, secret->len,
-                                        dst1, dst1_len, dst2, dst2_len,
-                                        src, src_len);
-
-       default:
-               return -EINVAL;
-       }
-}
-
-int ceph_encrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
-                const void *src, size_t src_len)
-{
-       switch (secret->type) {
-       case CEPH_CRYPTO_NONE:
-               if (*dst_len < src_len)
-                       return -ERANGE;
-               memcpy(dst, src, src_len);
-               *dst_len = src_len;
-               return 0;
-
-       case CEPH_CRYPTO_AES:
-               return ceph_aes_encrypt(secret->key, secret->len, dst,
-                                       dst_len, src, src_len);
-
-       default:
-               return -EINVAL;
-       }
-}
-
-int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
-                 const void *src1, size_t src1_len,
-                 const void *src2, size_t src2_len)
-{
-       switch (secret->type) {
-       case CEPH_CRYPTO_NONE:
-               if (*dst_len < src1_len + src2_len)
-                       return -ERANGE;
-               memcpy(dst, src1, src1_len);
-               memcpy(dst + src1_len, src2, src2_len);
-               *dst_len = src1_len + src2_len;
-               return 0;
-
-       case CEPH_CRYPTO_AES:
-               return ceph_aes_encrypt2(secret->key, secret->len, dst, dst_len,
-                                        src1, src1_len, src2, src2_len);
-
-       default:
-               return -EINVAL;
-       }
-}
diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h

deleted file mode 100644 (file)

index bdf3860..0000000
--- a/fs/ceph/crypto.h
+++ /dev/null
@@ -1,48 +0,0 @@
-#ifndef _FS_CEPH_CRYPTO_H
-#define _FS_CEPH_CRYPTO_H
-
-#include "types.h"
-#include "buffer.h"
-
-/*
- * cryptographic secret
- */
-struct ceph_crypto_key {
-       int type;
-       struct ceph_timespec created;
-       int len;
-       void *key;
-};
-
-static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key)
-{
-       kfree(key->key);
-}
-
-extern int ceph_crypto_key_encode(struct ceph_crypto_key *key,
-                                 void **p, void *end);
-extern int ceph_crypto_key_decode(struct ceph_crypto_key *key,
-                                 void **p, void *end);
-extern int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *in);
-
-/* crypto.c */
-extern int ceph_decrypt(struct ceph_crypto_key *secret,
-                       void *dst, size_t *dst_len,
-                       const void *src, size_t src_len);
-extern int ceph_encrypt(struct ceph_crypto_key *secret,
-                       void *dst, size_t *dst_len,
-                       const void *src, size_t src_len);
-extern int ceph_decrypt2(struct ceph_crypto_key *secret,
-                       void *dst1, size_t *dst1_len,
-                       void *dst2, size_t *dst2_len,
-                       const void *src, size_t src_len);
-extern int ceph_encrypt2(struct ceph_crypto_key *secret,
-                        void *dst, size_t *dst_len,
-                        const void *src1, size_t src1_len,
-                        const void *src2, size_t src2_len);
-
-/* armor.c */
-extern int ceph_armor(char *dst, const char *src, const char *end);
-extern int ceph_unarmor(char *dst, const char *src, const char *end);
-
-#endif
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c

index 6fd8b20..7ae1b3d 100644 (file)
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
  
  #include <linux/device.h>
  #include <linux/slab.h>
@@ -7,143 +7,49 @@
  #include <linux/debugfs.h>
  #include <linux/seq_file.h>
  
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/debugfs.h>
+
  #include "super.h"
-#include "mds_client.h"
-#include "mon_client.h"
-#include "auth.h"
  
  #ifdef CONFIG_DEBUG_FS
  
-/*
- * Implement /sys/kernel/debug/ceph fun
- *
- * /sys/kernel/debug/ceph/client*  - an instance of the ceph client
- *      .../osdmap      - current osdmap
- *      .../mdsmap      - current mdsmap
- *      .../monmap      - current monmap
- *      .../osdc        - active osd requests
- *      .../mdsc        - active mds requests
- *      .../monc        - mon client state
- *      .../dentry_lru  - dump contents of dentry lru
- *      .../caps        - expose cap (reservation) stats
- *      .../bdi         - symlink to ../../bdi/something
- */
-
-static struct dentry *ceph_debugfs_dir;
-
-static int monmap_show(struct seq_file *s, void *p)
-{
-       int i;
-       struct ceph_client *client = s->private;
-
-       if (client->monc.monmap == NULL)
-               return 0;
-
-       seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
-       for (i = 0; i < client->monc.monmap->num_mon; i++) {
-               struct ceph_entity_inst *inst =
-                       &client->monc.monmap->mon_inst[i];
-
-               seq_printf(s, "\t%s%lld\t%s\n",
-                          ENTITY_NAME(inst->name),
-                          pr_addr(&inst->addr.in_addr));
-       }
-       return 0;
-}
+#include "mds_client.h"
  
  static int mdsmap_show(struct seq_file *s, void *p)
  {
         int i;
-       struct ceph_client *client = s->private;
+       struct ceph_fs_client *fsc = s->private;
  
-       if (client->mdsc.mdsmap == NULL)
+       if (fsc->mdsc == NULL || fsc->mdsc->mdsmap == NULL)
                 return 0;
-       seq_printf(s, "epoch %d\n", client->mdsc.mdsmap->m_epoch);
-       seq_printf(s, "root %d\n", client->mdsc.mdsmap->m_root);
+       seq_printf(s, "epoch %d\n", fsc->mdsc->mdsmap->m_epoch);
+       seq_printf(s, "root %d\n", fsc->mdsc->mdsmap->m_root);
         seq_printf(s, "session_timeout %d\n",
-                      client->mdsc.mdsmap->m_session_timeout);
+                      fsc->mdsc->mdsmap->m_session_timeout);
         seq_printf(s, "session_autoclose %d\n",
-                      client->mdsc.mdsmap->m_session_autoclose);
-       for (i = 0; i < client->mdsc.mdsmap->m_max_mds; i++) {
+                      fsc->mdsc->mdsmap->m_session_autoclose);
+       for (i = 0; i < fsc->mdsc->mdsmap->m_max_mds; i++) {
                 struct ceph_entity_addr *addr =
-                       &client->mdsc.mdsmap->m_info[i].addr;
-               int state = client->mdsc.mdsmap->m_info[i].state;
+                       &fsc->mdsc->mdsmap->m_info[i].addr;
+               int state = fsc->mdsc->mdsmap->m_info[i].state;
  
-               seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, pr_addr(&addr->in_addr),
+               seq_printf(s, "\tmds%d\t%s\t(%s)\n", i,
+                              ceph_pr_addr(&addr->in_addr),
                                ceph_mds_state_name(state));
         }
         return 0;
  }
  
-static int osdmap_show(struct seq_file *s, void *p)
-{
-       int i;
-       struct ceph_client *client = s->private;
-       struct rb_node *n;
-
-       if (client->osdc.osdmap == NULL)
-               return 0;
-       seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch);
-       seq_printf(s, "flags%s%s\n",
-                  (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ?
-                  " NEARFULL" : "",
-                  (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ?
-                  " FULL" : "");
-       for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
-               struct ceph_pg_pool_info *pool =
-                       rb_entry(n, struct ceph_pg_pool_info, node);
-               seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n",
-                          pool->id, pool->v.pg_num, pool->pg_num_mask,
-                          pool->v.lpg_num, pool->lpg_num_mask);
-       }
-       for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
-               struct ceph_entity_addr *addr =
-                       &client->osdc.osdmap->osd_addr[i];
-               int state = client->osdc.osdmap->osd_state[i];
-               char sb[64];
-
-               seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n",
-                          i, pr_addr(&addr->in_addr),
-                          ((client->osdc.osdmap->osd_weight[i]*100) >> 16),
-                          ceph_osdmap_state_str(sb, sizeof(sb), state));
-       }
-       return 0;
-}
-
-static int monc_show(struct seq_file *s, void *p)
-{
-       struct ceph_client *client = s->private;
-       struct ceph_mon_generic_request *req;
-       struct ceph_mon_client *monc = &client->monc;
-       struct rb_node *rp;
-
-       mutex_lock(&monc->mutex);
-
-       if (monc->have_mdsmap)
-               seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap);
-       if (monc->have_osdmap)
-               seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap);
-       if (monc->want_next_osdmap)
-               seq_printf(s, "want next osdmap\n");
-
-       for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
-               __u16 op;
-               req = rb_entry(rp, struct ceph_mon_generic_request, node);
-               op = le16_to_cpu(req->request->hdr.type);
-               if (op == CEPH_MSG_STATFS)
-                       seq_printf(s, "%lld statfs\n", req->tid);
-               else
-                       seq_printf(s, "%lld unknown\n", req->tid);
-       }
-
-       mutex_unlock(&monc->mutex);
-       return 0;
-}
-
+/*
+ * mdsc debugfs
+ */
  static int mdsc_show(struct seq_file *s, void *p)
  {
-       struct ceph_client *client = s->private;
-       struct ceph_mds_client *mdsc = &client->mdsc;
+       struct ceph_fs_client *fsc = s->private;
+       struct ceph_mds_client *mdsc = fsc->mdsc;
         struct ceph_mds_request *req;
         struct rb_node *rp;
         int pathlen;
@@ -214,61 +120,12 @@ static int mdsc_show(struct seq_file *s, void *p)
         return 0;
  }
  
-static int osdc_show(struct seq_file *s, void *pp)
-{
-       struct ceph_client *client = s->private;
-       struct ceph_osd_client *osdc = &client->osdc;
-       struct rb_node *p;
-
-       mutex_lock(&osdc->request_mutex);
-       for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
-               struct ceph_osd_request *req;
-               struct ceph_osd_request_head *head;
-               struct ceph_osd_op *op;
-               int num_ops;
-               int opcode, olen;
-               int i;
-
-               req = rb_entry(p, struct ceph_osd_request, r_node);
-
-               seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid,
-                          req->r_osd ? req->r_osd->o_osd : -1,
-                          le32_to_cpu(req->r_pgid.pool),
-                          le16_to_cpu(req->r_pgid.ps));
-
-               head = req->r_request->front.iov_base;
-               op = (void *)(head + 1);
-
-               num_ops = le16_to_cpu(head->num_ops);
-               olen = le32_to_cpu(head->object_len);
-               seq_printf(s, "%.*s", olen,
-                          (const char *)(head->ops + num_ops));
-
-               if (req->r_reassert_version.epoch)
-                       seq_printf(s, "\t%u'%llu",
-                          (unsigned)le32_to_cpu(req->r_reassert_version.epoch),
-                          le64_to_cpu(req->r_reassert_version.version));
-               else
-                       seq_printf(s, "\t");
-
-               for (i = 0; i < num_ops; i++) {
-                       opcode = le16_to_cpu(op->op);
-                       seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
-                       op++;
-               }
-
-               seq_printf(s, "\n");
-       }
-       mutex_unlock(&osdc->request_mutex);
-       return 0;
-}
-
  static int caps_show(struct seq_file *s, void *p)
  {
-       struct ceph_client *client = s->private;
+       struct ceph_fs_client *fsc = s->private;
         int total, avail, used, reserved, min;
  
-       ceph_reservation_status(client, &total, &avail, &used, &reserved, &min);
+       ceph_reservation_status(fsc, &total, &avail, &used, &reserved, &min);
         seq_printf(s, "total\t\t%d\n"
                    "avail\t\t%d\n"
                    "used\t\t%d\n"
@@ -280,8 +137,8 @@ static int caps_show(struct seq_file *s, void *p)
  
  static int dentry_lru_show(struct seq_file *s, void *ptr)
  {
-       struct ceph_client *client = s->private;
-       struct ceph_mds_client *mdsc = &client->mdsc;
+       struct ceph_fs_client *fsc = s->private;
+       struct ceph_mds_client *mdsc = fsc->mdsc;
         struct ceph_dentry_info *di;
  
         spin_lock(&mdsc->dentry_lru_lock);
@@ -295,199 +152,124 @@ static int dentry_lru_show(struct seq_file *s, void *ptr)
         return 0;
  }
  
-#define DEFINE_SHOW_FUNC(name)                                         \
-static int name##_open(struct inode *inode, struct file *file)         \
-{                                                                      \
-       struct seq_file *sf;                                            \
-       int ret;                                                        \
-                                                                       \
-       ret = single_open(file, name, NULL);                            \
-       sf = file->private_data;                                        \
-       sf->private = inode->i_private;                                 \
-       return ret;                                                     \
-}                                                                      \
-                                                                       \
-static const struct file_operations name##_fops = {                    \
-       .open           = name##_open,                                  \
-       .read           = seq_read,                                     \
-       .llseek         = seq_lseek,                                    \
-       .release        = single_release,                               \
-};
-
-DEFINE_SHOW_FUNC(monmap_show)
-DEFINE_SHOW_FUNC(mdsmap_show)
-DEFINE_SHOW_FUNC(osdmap_show)
-DEFINE_SHOW_FUNC(monc_show)
-DEFINE_SHOW_FUNC(mdsc_show)
-DEFINE_SHOW_FUNC(osdc_show)
-DEFINE_SHOW_FUNC(dentry_lru_show)
-DEFINE_SHOW_FUNC(caps_show)
+CEPH_DEFINE_SHOW_FUNC(mdsmap_show)
+CEPH_DEFINE_SHOW_FUNC(mdsc_show)
+CEPH_DEFINE_SHOW_FUNC(caps_show)
+CEPH_DEFINE_SHOW_FUNC(dentry_lru_show)
+
  
+/*
+ * debugfs
+ */
  static int congestion_kb_set(void *data, u64 val)
  {
-       struct ceph_client *client = (struct ceph_client *)data;
-
-       if (client)
-               client->mount_args->congestion_kb = (int)val;
+       struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
  
+       fsc->mount_options->congestion_kb = (int)val;
         return 0;
  }
  
  static int congestion_kb_get(void *data, u64 *val)
  {
-       struct ceph_client *client = (struct ceph_client *)data;
-
-       if (client)
-               *val = (u64)client->mount_args->congestion_kb;
+       struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
  
+       *val = (u64)fsc->mount_options->congestion_kb;
         return 0;
  }
  
-
  DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get,
                         congestion_kb_set, "%llu\n");
  
-int __init ceph_debugfs_init(void)
-{
-       ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
-       if (!ceph_debugfs_dir)
-               return -ENOMEM;
-       return 0;
-}
  
-void ceph_debugfs_cleanup(void)
+void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
  {
-       debugfs_remove(ceph_debugfs_dir);
+       dout("ceph_fs_debugfs_cleanup\n");
+       debugfs_remove(fsc->debugfs_bdi);
+       debugfs_remove(fsc->debugfs_congestion_kb);
+       debugfs_remove(fsc->debugfs_mdsmap);
+       debugfs_remove(fsc->debugfs_caps);
+       debugfs_remove(fsc->debugfs_mdsc);
+       debugfs_remove(fsc->debugfs_dentry_lru);
  }
  
-int ceph_debugfs_client_init(struct ceph_client *client)
+int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
  {
-       int ret = 0;
-       char name[80];
-
-       snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid,
-                client->monc.auth->global_id);
+       char name[100];
+       int err = -ENOMEM;
  
-       client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
-       if (!client->debugfs_dir)
-               goto out;
-
-       client->monc.debugfs_file = debugfs_create_file("monc",
-                                                     0600,
-                                                     client->debugfs_dir,
-                                                     client,
-                                                     &monc_show_fops);
-       if (!client->monc.debugfs_file)
+       dout("ceph_fs_debugfs_init\n");
+       fsc->debugfs_congestion_kb =
+               debugfs_create_file("writeback_congestion_kb",
+                                   0600,
+                                   fsc->client->debugfs_dir,
+                                   fsc,
+                                   &congestion_kb_fops);
+       if (!fsc->debugfs_congestion_kb)
                 goto out;
  
-       client->mdsc.debugfs_file = debugfs_create_file("mdsc",
-                                                     0600,
-                                                     client->debugfs_dir,
-                                                     client,
-                                                     &mdsc_show_fops);
-       if (!client->mdsc.debugfs_file)
-               goto out;
+       dout("a\n");
  
-       client->osdc.debugfs_file = debugfs_create_file("osdc",
-                                                     0600,
-                                                     client->debugfs_dir,
-                                                     client,
-                                                     &osdc_show_fops);
-       if (!client->osdc.debugfs_file)
+       snprintf(name, sizeof(name), "../../bdi/%s",
+                dev_name(fsc->backing_dev_info.dev));
+       fsc->debugfs_bdi =
+               debugfs_create_symlink("bdi",
+                                      fsc->client->debugfs_dir,
+                                      name);
+       if (!fsc->debugfs_bdi)
                 goto out;
  
-       client->debugfs_monmap = debugfs_create_file("monmap",
+       dout("b\n");
+       fsc->debugfs_mdsmap = debugfs_create_file("mdsmap",
                                         0600,
-                                       client->debugfs_dir,
-                                       client,
-                                       &monmap_show_fops);
-       if (!client->debugfs_monmap)
-               goto out;
-
-       client->debugfs_mdsmap = debugfs_create_file("mdsmap",
-                                       0600,
-                                       client->debugfs_dir,
-                                       client,
+                                       fsc->client->debugfs_dir,
+                                       fsc,
                                         &mdsmap_show_fops);
-       if (!client->debugfs_mdsmap)
-               goto out;
-
-       client->debugfs_osdmap = debugfs_create_file("osdmap",
-                                       0600,
-                                       client->debugfs_dir,
-                                       client,
-                                       &osdmap_show_fops);
-       if (!client->debugfs_osdmap)
+       if (!fsc->debugfs_mdsmap)
                 goto out;
  
-       client->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
-                                       0600,
-                                       client->debugfs_dir,
-                                       client,
-                                       &dentry_lru_show_fops);
-       if (!client->debugfs_dentry_lru)
+       dout("ca\n");
+       fsc->debugfs_mdsc = debugfs_create_file("mdsc",
+                                               0600,
+                                               fsc->client->debugfs_dir,
+                                               fsc,
+                                               &mdsc_show_fops);
+       if (!fsc->debugfs_mdsc)
                 goto out;
  
-       client->debugfs_caps = debugfs_create_file("caps",
+       dout("da\n");
+       fsc->debugfs_caps = debugfs_create_file("caps",
                                                    0400,
-                                                  client->debugfs_dir,
-                                                  client,
+                                                  fsc->client->debugfs_dir,
+                                                  fsc,
                                                    &caps_show_fops);
-       if (!client->debugfs_caps)
+       if (!fsc->debugfs_caps)
                 goto out;
  
-       client->debugfs_congestion_kb =
-               debugfs_create_file("writeback_congestion_kb",
-                                   0600,
-                                   client->debugfs_dir,
-                                   client,
-                                   &congestion_kb_fops);
-       if (!client->debugfs_congestion_kb)
+       dout("ea\n");
+       fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
+                                       0600,
+                                       fsc->client->debugfs_dir,
+                                       fsc,
+                                       &dentry_lru_show_fops);
+       if (!fsc->debugfs_dentry_lru)
                 goto out;
  
-       sprintf(name, "../../bdi/%s", dev_name(client->sb->s_bdi->dev));
-       client->debugfs_bdi = debugfs_create_symlink("bdi", client->debugfs_dir,
-                                                    name);
-
         return 0;
  
  out:
-       ceph_debugfs_client_cleanup(client);
-       return ret;
+       ceph_fs_debugfs_cleanup(fsc);
+       return err;
  }
  
-void ceph_debugfs_client_cleanup(struct ceph_client *client)
-{
-       debugfs_remove(client->debugfs_bdi);
-       debugfs_remove(client->debugfs_caps);
-       debugfs_remove(client->debugfs_dentry_lru);
-       debugfs_remove(client->debugfs_osdmap);
-       debugfs_remove(client->debugfs_mdsmap);
-       debugfs_remove(client->debugfs_monmap);
-       debugfs_remove(client->osdc.debugfs_file);
-       debugfs_remove(client->mdsc.debugfs_file);
-       debugfs_remove(client->monc.debugfs_file);
-       debugfs_remove(client->debugfs_congestion_kb);
-       debugfs_remove(client->debugfs_dir);
-}
  
  #else  /* CONFIG_DEBUG_FS */
  
-int __init ceph_debugfs_init(void)
-{
-       return 0;
-}
-
-void ceph_debugfs_cleanup(void)
-{
-}
-
-int ceph_debugfs_client_init(struct ceph_client *client)
+int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
  {
         return 0;
  }
  
-void ceph_debugfs_client_cleanup(struct ceph_client *client)
+void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
  {
  }
  
diff --git a/fs/ceph/decode.h b/fs/ceph/decode.h

deleted file mode 100644 (file)

index 3d25415..0000000
--- a/fs/ceph/decode.h
+++ /dev/null
@@ -1,196 +0,0 @@
-#ifndef __CEPH_DECODE_H
-#define __CEPH_DECODE_H
-
-#include <asm/unaligned.h>
-#include <linux/time.h>
-
-#include "types.h"
-
-/*
- * in all cases,
- *   void **p     pointer to position pointer
- *   void *end    pointer to end of buffer (last byte + 1)
- */
-
-static inline u64 ceph_decode_64(void **p)
-{
-       u64 v = get_unaligned_le64(*p);
-       *p += sizeof(u64);
-       return v;
-}
-static inline u32 ceph_decode_32(void **p)
-{
-       u32 v = get_unaligned_le32(*p);
-       *p += sizeof(u32);
-       return v;
-}
-static inline u16 ceph_decode_16(void **p)
-{
-       u16 v = get_unaligned_le16(*p);
-       *p += sizeof(u16);
-       return v;
-}
-static inline u8 ceph_decode_8(void **p)
-{
-       u8 v = *(u8 *)*p;
-       (*p)++;
-       return v;
-}
-static inline void ceph_decode_copy(void **p, void *pv, size_t n)
-{
-       memcpy(pv, *p, n);
-       *p += n;
-}
-
-/*
- * bounds check input.
- */
-#define ceph_decode_need(p, end, n, bad)               \
-       do {                                            \
-               if (unlikely(*(p) + (n) > (end)))       \
-                       goto bad;                       \
-       } while (0)
-
-#define ceph_decode_64_safe(p, end, v, bad)                    \
-       do {                                                    \
-               ceph_decode_need(p, end, sizeof(u64), bad);     \
-               v = ceph_decode_64(p);                          \
-       } while (0)
-#define ceph_decode_32_safe(p, end, v, bad)                    \
-       do {                                                    \
-               ceph_decode_need(p, end, sizeof(u32), bad);     \
-               v = ceph_decode_32(p);                          \
-       } while (0)
-#define ceph_decode_16_safe(p, end, v, bad)                    \
-       do {                                                    \
-               ceph_decode_need(p, end, sizeof(u16), bad);     \
-               v = ceph_decode_16(p);                          \
-       } while (0)
-#define ceph_decode_8_safe(p, end, v, bad)                     \
-       do {                                                    \
-               ceph_decode_need(p, end, sizeof(u8), bad);      \
-               v = ceph_decode_8(p);                           \
-       } while (0)
-
-#define ceph_decode_copy_safe(p, end, pv, n, bad)              \
-       do {                                                    \
-               ceph_decode_need(p, end, n, bad);               \
-               ceph_decode_copy(p, pv, n);                     \
-       } while (0)
-
-/*
- * struct ceph_timespec <-> struct timespec
- */
-static inline void ceph_decode_timespec(struct timespec *ts,
-                                       const struct ceph_timespec *tv)
-{
-       ts->tv_sec = le32_to_cpu(tv->tv_sec);
-       ts->tv_nsec = le32_to_cpu(tv->tv_nsec);
-}
-static inline void ceph_encode_timespec(struct ceph_timespec *tv,
-                                       const struct timespec *ts)
-{
-       tv->tv_sec = cpu_to_le32(ts->tv_sec);
-       tv->tv_nsec = cpu_to_le32(ts->tv_nsec);
-}
-
-/*
- * sockaddr_storage <-> ceph_sockaddr
- */
-static inline void ceph_encode_addr(struct ceph_entity_addr *a)
-{
-       __be16 ss_family = htons(a->in_addr.ss_family);
-       a->in_addr.ss_family = *(__u16 *)&ss_family;
-}
-static inline void ceph_decode_addr(struct ceph_entity_addr *a)
-{
-       __be16 ss_family = *(__be16 *)&a->in_addr.ss_family;
-       a->in_addr.ss_family = ntohs(ss_family);
-       WARN_ON(a->in_addr.ss_family == 512);
-}
-
-/*
- * encoders
- */
-static inline void ceph_encode_64(void **p, u64 v)
-{
-       put_unaligned_le64(v, (__le64 *)*p);
-       *p += sizeof(u64);
-}
-static inline void ceph_encode_32(void **p, u32 v)
-{
-       put_unaligned_le32(v, (__le32 *)*p);
-       *p += sizeof(u32);
-}
-static inline void ceph_encode_16(void **p, u16 v)
-{
-       put_unaligned_le16(v, (__le16 *)*p);
-       *p += sizeof(u16);
-}
-static inline void ceph_encode_8(void **p, u8 v)
-{
-       *(u8 *)*p = v;
-       (*p)++;
-}
-static inline void ceph_encode_copy(void **p, const void *s, int len)
-{
-       memcpy(*p, s, len);
-       *p += len;
-}
-
-/*
- * filepath, string encoders
- */
-static inline void ceph_encode_filepath(void **p, void *end,
-                                       u64 ino, const char *path)
-{
-       u32 len = path ? strlen(path) : 0;
-       BUG_ON(*p + sizeof(ino) + sizeof(len) + len > end);
-       ceph_encode_8(p, 1);
-       ceph_encode_64(p, ino);
-       ceph_encode_32(p, len);
-       if (len)
-               memcpy(*p, path, len);
-       *p += len;
-}
-
-static inline void ceph_encode_string(void **p, void *end,
-                                     const char *s, u32 len)
-{
-       BUG_ON(*p + sizeof(len) + len > end);
-       ceph_encode_32(p, len);
-       if (len)
-               memcpy(*p, s, len);
-       *p += len;
-}
-
-#define ceph_encode_need(p, end, n, bad)               \
-       do {                                            \
-               if (unlikely(*(p) + (n) > (end)))       \
-                       goto bad;                       \
-       } while (0)
-
-#define ceph_encode_64_safe(p, end, v, bad)                    \
-       do {                                                    \
-               ceph_encode_need(p, end, sizeof(u64), bad);     \
-               ceph_encode_64(p, v);                           \
-       } while (0)
-#define ceph_encode_32_safe(p, end, v, bad)                    \
-       do {                                                    \
-               ceph_encode_need(p, end, sizeof(u32), bad);     \
-               ceph_encode_32(p, v);                   \
-       } while (0)
-#define ceph_encode_16_safe(p, end, v, bad)                    \
-       do {                                                    \
-               ceph_encode_need(p, end, sizeof(u16), bad);     \
-               ceph_encode_16(p, v);                   \
-       } while (0)
-
-#define ceph_encode_copy_safe(p, end, pv, n, bad)              \
-       do {                                                    \
-               ceph_encode_need(p, end, n, bad);               \
-               ceph_encode_copy(p, pv, n);                     \
-       } while (0)
-
-
-#endif
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c

index a1986eb..e0a2dc6 100644 (file)
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
  
  #include <linux/spinlock.h>
  #include <linux/fs_struct.h>
@@ -7,6 +7,7 @@
  #include <linux/sched.h>
  
  #include "super.h"
+#include "mds_client.h"
  
  /*
   * Directory operations: readdir, lookup, create, link, unlink,
@@ -94,10 +95,7 @@ static unsigned fpos_off(loff_t p)
   */
  static int __dcache_readdir(struct file *filp,
                             void *dirent, filldir_t filldir)
-               __releases(inode->i_lock)
-               __acquires(inode->i_lock)
  {
-       struct inode *inode = filp->f_dentry->d_inode;
         struct ceph_file_info *fi = filp->private_data;
         struct dentry *parent = filp->f_dentry;
         struct inode *dir = parent->d_inode;
@@ -153,7 +151,6 @@ more:
  
         atomic_inc(&dentry->d_count);
         spin_unlock(&dcache_lock);
-       spin_unlock(&inode->i_lock);
  
         dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
              dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
@@ -171,35 +168,30 @@ more:
                 } else {
                         dput(last);
                 }
-               last = NULL;
         }
-
-       spin_lock(&inode->i_lock);
-       spin_lock(&dcache_lock);
-
         last = dentry;
  
         if (err < 0)
-               goto out_unlock;
+               goto out;
  
-       p = p->prev;
         filp->f_pos++;
  
         /* make sure a dentry wasn't dropped while we didn't have dcache_lock */
-       if ((ceph_inode(dir)->i_ceph_flags & CEPH_I_COMPLETE))
-               goto more;
-       dout(" lost I_COMPLETE on %p; falling back to mds\n", dir);
-       err = -EAGAIN;
+       if (!ceph_i_test(dir, CEPH_I_COMPLETE)) {
+               dout(" lost I_COMPLETE on %p; falling back to mds\n", dir);
+               err = -EAGAIN;
+               goto out;
+       }
+
+       spin_lock(&dcache_lock);
+       p = p->prev;    /* advance to next dentry */
+       goto more;
  
  out_unlock:
         spin_unlock(&dcache_lock);
-
-       if (last) {
-               spin_unlock(&inode->i_lock);
+out:
+       if (last)
                 dput(last);
-               spin_lock(&inode->i_lock);
-       }
-
         return err;
  }
  
@@ -227,15 +219,15 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
         struct ceph_file_info *fi = filp->private_data;
         struct inode *inode = filp->f_dentry->d_inode;
         struct ceph_inode_info *ci = ceph_inode(inode);
-       struct ceph_client *client = ceph_inode_to_client(inode);
-       struct ceph_mds_client *mdsc = &client->mdsc;
+       struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+       struct ceph_mds_client *mdsc = fsc->mdsc;
         unsigned frag = fpos_frag(filp->f_pos);
         int off = fpos_off(filp->f_pos);
         int err;
         u32 ftype;
         struct ceph_mds_reply_info_parsed *rinfo;
-       const int max_entries = client->mount_args->max_readdir;
-       const int max_bytes = client->mount_args->max_readdir_bytes;
+       const int max_entries = fsc->mount_options->max_readdir;
+       const int max_bytes = fsc->mount_options->max_readdir_bytes;
  
         dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
         if (fi->at_end)
@@ -267,17 +259,17 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
         /* can we use the dcache? */
         spin_lock(&inode->i_lock);
         if ((filp->f_pos == 2 || fi->dentry) &&
-           !ceph_test_opt(client, NOASYNCREADDIR) &&
+           !ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
             ceph_snap(inode) != CEPH_SNAPDIR &&
             (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
             __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
+               spin_unlock(&inode->i_lock);
                 err = __dcache_readdir(filp, dirent, filldir);
-               if (err != -EAGAIN) {
-                       spin_unlock(&inode->i_lock);
+               if (err != -EAGAIN)
                         return err;
-               }
+       } else {
+               spin_unlock(&inode->i_lock);
         }
-       spin_unlock(&inode->i_lock);
         if (fi->dentry) {
                 err = note_last_dentry(fi, fi->dentry->d_name.name,
                                        fi->dentry->d_name.len);
@@ -487,14 +479,13 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
  struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
                                   struct dentry *dentry, int err)
  {
-       struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
+       struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
         struct inode *parent = dentry->d_parent->d_inode;
  
         /* .snap dir? */
         if (err == -ENOENT &&
-           ceph_vino(parent).ino != CEPH_INO_ROOT && /* no .snap in root dir */
             strcmp(dentry->d_name.name,
-                  client->mount_args->snapdir_name) == 0) {
+                  fsc->mount_options->snapdir_name) == 0) {
                 struct inode *inode = ceph_get_snapdir(parent);
                 dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n",
                      dentry, dentry->d_name.len, dentry->d_name.name, inode);
@@ -539,8 +530,8 @@ static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
  static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
                                   struct nameidata *nd)
  {
-       struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-       struct ceph_mds_client *mdsc = &client->mdsc;
+       struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+       struct ceph_mds_client *mdsc = fsc->mdsc;
         struct ceph_mds_request *req;
         int op;
         int err;
@@ -572,7 +563,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
                 spin_lock(&dir->i_lock);
                 dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
                 if (strncmp(dentry->d_name.name,
-                           client->mount_args->snapdir_name,
+                           fsc->mount_options->snapdir_name,
                             dentry->d_name.len) &&
                     !is_root_ceph_dentry(dir, dentry) &&
                     (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
@@ -629,8 +620,8 @@ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
  static int ceph_mknod(struct inode *dir, struct dentry *dentry,
                       int mode, dev_t rdev)
  {
-       struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-       struct ceph_mds_client *mdsc = &client->mdsc;
+       struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+       struct ceph_mds_client *mdsc = fsc->mdsc;
         struct ceph_mds_request *req;
         int err;
  
@@ -685,8 +676,8 @@ static int ceph_create(struct inode *dir, struct dentry *dentry, int mode,
  static int ceph_symlink(struct inode *dir, struct dentry *dentry,
                             const char *dest)
  {
-       struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-       struct ceph_mds_client *mdsc = &client->mdsc;
+       struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+       struct ceph_mds_client *mdsc = fsc->mdsc;
         struct ceph_mds_request *req;
         int err;
  
@@ -716,8 +707,8 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
  
  static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode)
  {
-       struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-       struct ceph_mds_client *mdsc = &client->mdsc;
+       struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+       struct ceph_mds_client *mdsc = fsc->mdsc;
         struct ceph_mds_request *req;
         int err = -EROFS;
         int op;
@@ -758,8 +749,8 @@ out:
  static int ceph_link(struct dentry *old_dentry, struct inode *dir,
                      struct dentry *dentry)
  {
-       struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-       struct ceph_mds_client *mdsc = &client->mdsc;
+       struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+       struct ceph_mds_client *mdsc = fsc->mdsc;
         struct ceph_mds_request *req;
         int err;
  
@@ -813,8 +804,8 @@ static int drop_caps_for_unlink(struct inode *inode)
   */
  static int ceph_unlink(struct inode *dir, struct dentry *dentry)
  {
-       struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-       struct ceph_mds_client *mdsc = &client->mdsc;
+       struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+       struct ceph_mds_client *mdsc = fsc->mdsc;
         struct inode *inode = dentry->d_inode;
         struct ceph_mds_request *req;
         int err = -EROFS;
@@ -854,8 +845,8 @@ out:
  static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
                        struct inode *new_dir, struct dentry *new_dentry)
  {
-       struct ceph_client *client = ceph_sb_to_client(old_dir->i_sb);
-       struct ceph_mds_client *mdsc = &client->mdsc;
+       struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb);
+       struct ceph_mds_client *mdsc = fsc->mdsc;
         struct ceph_mds_request *req;
         int err;
  
@@ -1076,7 +1067,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
         struct ceph_inode_info *ci = ceph_inode(inode);
         int left;
  
-       if (!ceph_test_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
+       if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
                 return -EISDIR;
  
         if (!cf->dir_info) {
@@ -1177,7 +1168,7 @@ void ceph_dentry_lru_add(struct dentry *dn)
         dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
              dn->d_name.len, dn->d_name.name);
         if (di) {
-               mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
+               mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
                 spin_lock(&mdsc->dentry_lru_lock);
                 list_add_tail(&di->lru, &mdsc->dentry_lru);
                 mdsc->num_dentry++;
@@ -1193,7 +1184,7 @@ void ceph_dentry_lru_touch(struct dentry *dn)
         dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn,
              dn->d_name.len, dn->d_name.name, di->offset);
         if (di) {
-               mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
+               mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
                 spin_lock(&mdsc->dentry_lru_lock);
                 list_move_tail(&di->lru, &mdsc->dentry_lru);
                 spin_unlock(&mdsc->dentry_lru_lock);
@@ -1208,7 +1199,7 @@ void ceph_dentry_lru_del(struct dentry *dn)
         dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
              dn->d_name.len, dn->d_name.name);
         if (di) {
-               mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
+               mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
                 spin_lock(&mdsc->dentry_lru_lock);
                 list_del_init(&di->lru);
                 mdsc->num_dentry--;
diff --git a/fs/ceph/export.c b/fs/ceph/export.c

index e38423e..2297d94 100644 (file)
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -1,10 +1,11 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
  
  #include <linux/exportfs.h>
  #include <linux/slab.h>
  #include <asm/unaligned.h>
  
  #include "super.h"
+#include "mds_client.h"
  
  /*
   * NFS export support
@@ -120,7 +121,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
  static struct dentry *__cfh_to_dentry(struct super_block *sb,
                                       struct ceph_nfs_confh *cfh)
  {
-       struct ceph_mds_client *mdsc = &ceph_sb_to_client(sb)->mdsc;
+       struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
         struct inode *inode;
         struct dentry *dentry;
         struct ceph_vino vino;
diff --git a/fs/ceph/file.c b/fs/ceph/file.c

index 66e4da6..e77c28c 100644 (file)
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1,5 +1,6 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
  
+#include <linux/module.h>
  #include <linux/sched.h>
  #include <linux/slab.h>
  #include <linux/file.h>
@@ -38,8 +39,8 @@
  static struct ceph_mds_request *
  prepare_open_request(struct super_block *sb, int flags, int create_mode)
  {
-       struct ceph_client *client = ceph_sb_to_client(sb);
-       struct ceph_mds_client *mdsc = &client->mdsc;
+       struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
+       struct ceph_mds_client *mdsc = fsc->mdsc;
         struct ceph_mds_request *req;
         int want_auth = USE_ANY_MDS;
         int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;
@@ -117,8 +118,8 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
  int ceph_open(struct inode *inode, struct file *file)
  {
         struct ceph_inode_info *ci = ceph_inode(inode);
-       struct ceph_client *client = ceph_sb_to_client(inode->i_sb);
-       struct ceph_mds_client *mdsc = &client->mdsc;
+       struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
+       struct ceph_mds_client *mdsc = fsc->mdsc;
         struct ceph_mds_request *req;
         struct ceph_file_info *cf = file->private_data;
         struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
@@ -216,8 +217,8 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
                                 struct nameidata *nd, int mode,
                                 int locked_dir)
  {
-       struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
-       struct ceph_mds_client *mdsc = &client->mdsc;
+       struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+       struct ceph_mds_client *mdsc = fsc->mdsc;
         struct file *file = nd->intent.open.file;
         struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry);
         struct ceph_mds_request *req;
@@ -269,163 +270,6 @@ int ceph_release(struct inode *inode, struct file *file)
         return 0;
  }
  
-/*
- * build a vector of user pages
- */
-static struct page **get_direct_page_vector(const char __user *data,
-                                           int num_pages,
-                                           loff_t off, size_t len)
-{
-       struct page **pages;
-       int rc;
-
-       pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
-       if (!pages)
-               return ERR_PTR(-ENOMEM);
-
-       down_read(&current->mm->mmap_sem);
-       rc = get_user_pages(current, current->mm, (unsigned long)data,
-                           num_pages, 0, 0, pages, NULL);
-       up_read(&current->mm->mmap_sem);
-       if (rc < 0)
-               goto fail;
-       return pages;
-
-fail:
-       kfree(pages);
-       return ERR_PTR(rc);
-}
-
-static void put_page_vector(struct page **pages, int num_pages)
-{
-       int i;
-
-       for (i = 0; i < num_pages; i++)
-               put_page(pages[i]);
-       kfree(pages);
-}
-
-void ceph_release_page_vector(struct page **pages, int num_pages)
-{
-       int i;
-
-       for (i = 0; i < num_pages; i++)
-               __free_pages(pages[i], 0);
-       kfree(pages);
-}
-
-/*
- * allocate a vector new pages
- */
-static struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags)
-{
-       struct page **pages;
-       int i;
-
-       pages = kmalloc(sizeof(*pages) * num_pages, flags);
-       if (!pages)
-               return ERR_PTR(-ENOMEM);
-       for (i = 0; i < num_pages; i++) {
-               pages[i] = __page_cache_alloc(flags);
-               if (pages[i] == NULL) {
-                       ceph_release_page_vector(pages, i);
-                       return ERR_PTR(-ENOMEM);
-               }
-       }
-       return pages;
-}
-
-/*
- * copy user data into a page vector
- */
-static int copy_user_to_page_vector(struct page **pages,
-                                   const char __user *data,
-                                   loff_t off, size_t len)
-{
-       int i = 0;
-       int po = off & ~PAGE_CACHE_MASK;
-       int left = len;
-       int l, bad;
-
-       while (left > 0) {
-               l = min_t(int, PAGE_CACHE_SIZE-po, left);
-               bad = copy_from_user(page_address(pages[i]) + po, data, l);
-               if (bad == l)
-                       return -EFAULT;
-               data += l - bad;
-               left -= l - bad;
-               po += l - bad;
-               if (po == PAGE_CACHE_SIZE) {
-                       po = 0;
-                       i++;
-               }
-       }
-       return len;
-}
-
-/*
- * copy user data from a page vector into a user pointer
- */
-static int copy_page_vector_to_user(struct page **pages, char __user *data,
-                                   loff_t off, size_t len)
-{
-       int i = 0;
-       int po = off & ~PAGE_CACHE_MASK;
-       int left = len;
-       int l, bad;
-
-       while (left > 0) {
-               l = min_t(int, left, PAGE_CACHE_SIZE-po);
-               bad = copy_to_user(data, page_address(pages[i]) + po, l);
-               if (bad == l)
-                       return -EFAULT;
-               data += l - bad;
-               left -= l - bad;
-               if (po) {
-                       po += l - bad;
-                       if (po == PAGE_CACHE_SIZE)
-                               po = 0;
-               }
-               i++;
-       }
-       return len;
-}
-
-/*
- * Zero an extent within a page vector.  Offset is relative to the
- * start of the first page.
- */
-static void zero_page_vector_range(int off, int len, struct page **pages)
-{
-       int i = off >> PAGE_CACHE_SHIFT;
-
-       off &= ~PAGE_CACHE_MASK;
-
-       dout("zero_page_vector_page %u~%u\n", off, len);
-
-       /* leading partial page? */
-       if (off) {
-               int end = min((int)PAGE_CACHE_SIZE, off + len);
-               dout("zeroing %d %p head from %d\n", i, pages[i],
-                    (int)off);
-               zero_user_segment(pages[i], off, end);
-               len -= (end - off);
-               i++;
-       }
-       while (len >= PAGE_CACHE_SIZE) {
-               dout("zeroing %d %p len=%d\n", i, pages[i], len);
-               zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
-               len -= PAGE_CACHE_SIZE;
-               i++;
-       }
-       /* trailing partial page? */
-       if (len) {
-               dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len);
-               zero_user_segment(pages[i], 0, len);
-       }
-}
-
-
  /*
   * Read a range of bytes striped over one or more objects.  Iterate over
   * objects we stripe over.  (That's not atomic, but good enough for now.)
@@ -438,7 +282,7 @@ static int striped_read(struct inode *inode,
                         struct page **pages, int num_pages,
                         int *checkeof)
  {
-       struct ceph_client *client = ceph_inode_to_client(inode);
+       struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
         struct ceph_inode_info *ci = ceph_inode(inode);
         u64 pos, this_len;
         int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */
@@ -459,7 +303,7 @@ static int striped_read(struct inode *inode,
  
  more:
         this_len = left;
-       ret = ceph_osdc_readpages(&client->osdc, ceph_vino(inode),
+       ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
                                   &ci->i_layout, pos, &this_len,
                                   ci->i_truncate_seq,
                                   ci->i_truncate_size,
@@ -477,8 +321,8 @@ more:
  
                 if (read < pos - off) {
                         dout(" zero gap %llu to %llu\n", off + read, pos);
-                       zero_page_vector_range(page_off + read,
-                                              pos - off - read, pages);
+                       ceph_zero_page_vector_range(page_off + read,
+                                                   pos - off - read, pages);
                 }
                 pos += ret;
                 read = pos - off;
@@ -495,8 +339,8 @@ more:
                 /* was original extent fully inside i_size? */
                 if (pos + left <= inode->i_size) {
                         dout("zero tail\n");
-                       zero_page_vector_range(page_off + read, len - read,
-                                              pages);
+                       ceph_zero_page_vector_range(page_off + read, len - read,
+                                                   pages);
                         read = len;
                         goto out;
                 }
@@ -531,7 +375,7 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
              (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
  
         if (file->f_flags & O_DIRECT) {
-               pages = get_direct_page_vector(data, num_pages, off, len);
+               pages = ceph_get_direct_page_vector(data, num_pages, off, len);
  
                 /*
                  * flush any page cache pages in this range.  this
@@ -552,13 +396,13 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
         ret = striped_read(inode, off, len, pages, num_pages, checkeof);
  
         if (ret >= 0 && (file->f_flags & O_DIRECT) == 0)
-               ret = copy_page_vector_to_user(pages, data, off, ret);
+               ret = ceph_copy_page_vector_to_user(pages, data, off, ret);
         if (ret >= 0)
                 *poff = off + ret;
  
  done:
         if (file->f_flags & O_DIRECT)
-               put_page_vector(pages, num_pages);
+               ceph_put_page_vector(pages, num_pages);
         else
                 ceph_release_page_vector(pages, num_pages);
         dout("sync_read result %d\n", ret);
@@ -594,7 +438,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
  {
         struct inode *inode = file->f_dentry->d_inode;
         struct ceph_inode_info *ci = ceph_inode(inode);
-       struct ceph_client *client = ceph_inode_to_client(inode);
+       struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
         struct ceph_osd_request *req;
         struct page **pages;
         int num_pages;
@@ -642,7 +486,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
          */
  more:
         len = left;
-       req = ceph_osdc_new_request(&client->osdc, &ci->i_layout,
+       req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
                                     ceph_vino(inode), pos, &len,
                                     CEPH_OSD_OP_WRITE, flags,
                                     ci->i_snap_realm->cached_context,
@@ -655,7 +499,7 @@ more:
         num_pages = calc_pages_for(pos, len);
  
         if (file->f_flags & O_DIRECT) {
-               pages = get_direct_page_vector(data, num_pages, pos, len);
+               pages = ceph_get_direct_page_vector(data, num_pages, pos, len);
                 if (IS_ERR(pages)) {
                         ret = PTR_ERR(pages);
                         goto out;
@@ -673,7 +517,7 @@ more:
                         ret = PTR_ERR(pages);
                         goto out;
                 }
-               ret = copy_user_to_page_vector(pages, data, pos, len);
+               ret = ceph_copy_user_to_page_vector(pages, data, pos, len);
                 if (ret < 0) {
                         ceph_release_page_vector(pages, num_pages);
                         goto out;
@@ -689,7 +533,7 @@ more:
         req->r_num_pages = num_pages;
         req->r_inode = inode;
  
-       ret = ceph_osdc_start_request(&client->osdc, req, false);
+       ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
         if (!ret) {
                 if (req->r_safe_callback) {
                         /*
@@ -701,11 +545,11 @@ more:
                         spin_unlock(&ci->i_unsafe_lock);
                         ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
                 }
-               ret = ceph_osdc_wait_request(&client->osdc, req);
+               ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
         }
  
         if (file->f_flags & O_DIRECT)
-               put_page_vector(pages, num_pages);
+               ceph_put_page_vector(pages, num_pages);
         else if (file->f_flags & O_SYNC)
                 ceph_release_page_vector(pages, num_pages);
  
@@ -814,7 +658,8 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
         struct ceph_file_info *fi = file->private_data;
         struct inode *inode = file->f_dentry->d_inode;
         struct ceph_inode_info *ci = ceph_inode(inode);
-       struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc;
+       struct ceph_osd_client *osdc =
+               &ceph_sb_to_client(inode->i_sb)->client->osdc;
         loff_t endoff = pos + iov->iov_len;
         int want, got = 0;
         int ret, err;
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c

index 62377ec..1d6a45b 100644 (file)
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
  
  #include <linux/module.h>
  #include <linux/fs.h>
@@ -13,7 +13,8 @@
  #include <linux/pagevec.h>
  
  #include "super.h"
-#include "decode.h"
+#include "mds_client.h"
+#include <linux/ceph/decode.h>
  
  /*
   * Ceph inode operations
@@ -384,7 +385,7 @@ void ceph_destroy_inode(struct inode *inode)
          */
         if (ci->i_snap_realm) {
                 struct ceph_mds_client *mdsc =
-                       &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
+                       ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
                 struct ceph_snap_realm *realm = ci->i_snap_realm;
  
                 dout(" dropping residual ref to snap realm %p\n", realm);
@@ -685,7 +686,7 @@ static int fill_inode(struct inode *inode,
                 }
  
                 /* it may be better to set st_size in getattr instead? */
-               if (ceph_test_opt(ceph_sb_to_client(inode->i_sb), RBYTES))
+               if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), RBYTES))
                         inode->i_size = ci->i_rbytes;
                 break;
         default:
@@ -901,7 +902,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
         struct inode *in = NULL;
         struct ceph_mds_reply_inode *ininfo;
         struct ceph_vino vino;
-       struct ceph_client *client = ceph_sb_to_client(sb);
+       struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
         int i = 0;
         int err = 0;
  
@@ -965,7 +966,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
          */
         if (rinfo->head->is_dentry && !req->r_aborted &&
             (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name,
-                                              client->mount_args->snapdir_name,
+                                              fsc->mount_options->snapdir_name,
                                                req->r_dentry->d_name.len))) {
                 /*
                  * lookup link rename   : null -> possibly existing inode
@@ -1533,7 +1534,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
         struct inode *parent_inode = dentry->d_parent->d_inode;
         const unsigned int ia_valid = attr->ia_valid;
         struct ceph_mds_request *req;
-       struct ceph_mds_client *mdsc = &ceph_sb_to_client(dentry->d_sb)->mdsc;
+       struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
         int issued;
         int release = 0, dirtied = 0;
         int mask = 0;
@@ -1728,8 +1729,8 @@ out:
   */
  int ceph_do_getattr(struct inode *inode, int mask)
  {
-       struct ceph_client *client = ceph_sb_to_client(inode->i_sb);
-       struct ceph_mds_client *mdsc = &client->mdsc;
+       struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
+       struct ceph_mds_client *mdsc = fsc->mdsc;
         struct ceph_mds_request *req;
         int err;
  
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c

index 76e307d..8888c9b 100644 (file)
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -1,8 +1,10 @@
  #include <linux/in.h>
  
-#include "ioctl.h"
  #include "super.h"
-#include "ceph_debug.h"
+#include "mds_client.h"
+#include <linux/ceph/ceph_debug.h>
+
+#include "ioctl.h"
  
  
  /*
@@ -37,7 +39,7 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
  {
         struct inode *inode = file->f_dentry->d_inode;
         struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
-       struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+       struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
         struct ceph_mds_request *req;
         struct ceph_ioctl_layout l;
         int err, i;
@@ -89,6 +91,68 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
         return err;
  }
  
+/*
+ * Set a layout policy on a directory inode. All items in the tree
+ * rooted at this inode will inherit this layout on creation,
+ * (It doesn't apply retroactively )
+ * unless a subdirectory has its own layout policy.
+ */
+static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg)
+{
+       struct inode *inode = file->f_dentry->d_inode;
+       struct ceph_mds_request *req;
+       struct ceph_ioctl_layout l;
+       int err, i;
+       struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+
+       /* copy and validate */
+       if (copy_from_user(&l, arg, sizeof(l)))
+               return -EFAULT;
+
+       if ((l.object_size & ~PAGE_MASK) ||
+           (l.stripe_unit & ~PAGE_MASK) ||
+           !l.stripe_unit ||
+           (l.object_size &&
+               (unsigned)l.object_size % (unsigned)l.stripe_unit))
+               return -EINVAL;
+
+       /* make sure it's a valid data pool */
+       if (l.data_pool > 0) {
+               mutex_lock(&mdsc->mutex);
+               err = -EINVAL;
+               for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++)
+                       if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) {
+                               err = 0;
+                               break;
+                       }
+               mutex_unlock(&mdsc->mutex);
+               if (err)
+                       return err;
+       }
+
+       req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETDIRLAYOUT,
+                                      USE_AUTH_MDS);
+
+       if (IS_ERR(req))
+               return PTR_ERR(req);
+       req->r_inode = igrab(inode);
+
+       req->r_args.setlayout.layout.fl_stripe_unit =
+                       cpu_to_le32(l.stripe_unit);
+       req->r_args.setlayout.layout.fl_stripe_count =
+                       cpu_to_le32(l.stripe_count);
+       req->r_args.setlayout.layout.fl_object_size =
+                       cpu_to_le32(l.object_size);
+       req->r_args.setlayout.layout.fl_pg_pool =
+                       cpu_to_le32(l.data_pool);
+       req->r_args.setlayout.layout.fl_pg_preferred =
+                       cpu_to_le32(l.preferred_osd);
+
+       err = ceph_mdsc_do_request(mdsc, inode, req);
+       ceph_mdsc_put_request(req);
+       return err;
+}
+
  /*
   * Return object name, size/offset information, and location (OSD
   * number, network address) for a given file offset.
@@ -98,7 +162,8 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
         struct ceph_ioctl_dataloc dl;
         struct inode *inode = file->f_dentry->d_inode;
         struct ceph_inode_info *ci = ceph_inode(inode);
-       struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc;
+       struct ceph_osd_client *osdc =
+               &ceph_sb_to_client(inode->i_sb)->client->osdc;
         u64 len = 1, olen;
         u64 tmp;
         struct ceph_object_layout ol;
@@ -174,11 +239,15 @@ long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
         case CEPH_IOC_SET_LAYOUT:
                 return ceph_ioctl_set_layout(file, (void __user *)arg);
  
+       case CEPH_IOC_SET_LAYOUT_POLICY:
+               return ceph_ioctl_set_layout_policy(file, (void __user *)arg);
+
         case CEPH_IOC_GET_DATALOC:
                 return ceph_ioctl_get_dataloc(file, (void __user *)arg);
  
         case CEPH_IOC_LAZYIO:
                 return ceph_ioctl_lazyio(file);
         }
+
         return -ENOTTY;
  }
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h

index 88451a3..a6ce54e 100644 (file)
--- a/fs/ceph/ioctl.h
+++ b/fs/ceph/ioctl.h
@@ -4,7 +4,7 @@
  #include <linux/ioctl.h>
  #include <linux/types.h>
  
-#define CEPH_IOCTL_MAGIC 0x97
+#define CEPH_IOCTL_MAGIC 0x98
  
  /* just use u64 to align sanely on all archs */
  struct ceph_ioctl_layout {
@@ -17,6 +17,8 @@ struct ceph_ioctl_layout {
                                    struct ceph_ioctl_layout)
  #define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2,          \
                                    struct ceph_ioctl_layout)
+#define CEPH_IOC_SET_LAYOUT_POLICY _IOW(CEPH_IOCTL_MAGIC, 5,   \
+                                  struct ceph_ioctl_layout)
  
  /*
   * Extract identity, address of the OSD and object storing a given
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c

index ff4e753..40abde9 100644 (file)
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -1,11 +1,11 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
  
  #include <linux/file.h>
  #include <linux/namei.h>
  
  #include "super.h"
  #include "mds_client.h"
-#include "pagelist.h"
+#include <linux/ceph/pagelist.h>
  
  /**
   * Implement fcntl and flock locking functions.
@@ -16,7 +16,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
  {
         struct inode *inode = file->f_dentry->d_inode;
         struct ceph_mds_client *mdsc =
-               &ceph_sb_to_client(inode->i_sb)->mdsc;
+               ceph_sb_to_client(inode->i_sb)->mdsc;
         struct ceph_mds_request *req;
         int err;
  
@@ -181,8 +181,9 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
   * Encode the flock and fcntl locks for the given inode into the pagelist.
   * Format is: #fcntl locks, sequential fcntl locks, #flock locks,
   * sequential flock locks.
- * Must be called with BLK already held, and the lock numbers should have
- * been gathered under the same lock holding window.
+ * Must be called with lock_flocks() already held.
+ * If we encounter more of a specific lock type than expected,
+ * we return -ENOSPC so the caller can recount the locks and retry.
   */
  int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
                       int num_fcntl_locks, int num_flock_locks)
@@ -190,6 +191,8 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
         struct file_lock *lock;
         struct ceph_filelock cephlock;
         int err = 0;
+       int seen_fcntl = 0;
+       int seen_flock = 0;
  
         dout("encoding %d flock and %d fcntl locks", num_flock_locks,
              num_fcntl_locks);
@@ -198,6 +201,11 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
                 goto fail;
         for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
                 if (lock->fl_flags & FL_POSIX) {
+                       ++seen_fcntl;
+                       if (seen_fcntl > num_fcntl_locks) {
+                               err = -ENOSPC;
+                               goto fail;
+                       }
                         err = lock_to_ceph_filelock(lock, &cephlock);
                         if (err)
                                 goto fail;
@@ -213,6 +221,11 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
                 goto fail;
         for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
                 if (lock->fl_flags & FL_FLOCK) {
+                       ++seen_flock;
+                       if (seen_flock > num_flock_locks) {
+                               err = -ENOSPC;
+                               goto fail;
+                       }
                         err = lock_to_ceph_filelock(lock, &cephlock);
                         if (err)
                                 goto fail;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c

index fad95f8..3142b15 100644 (file)
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1,17 +1,21 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
  
+#include <linux/fs.h>
  #include <linux/wait.h>
  #include <linux/slab.h>
  #include <linux/sched.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
  #include <linux/smp_lock.h>
  
-#include "mds_client.h"
-#include "mon_client.h"
  #include "super.h"
-#include "messenger.h"
-#include "decode.h"
-#include "auth.h"
-#include "pagelist.h"
+#include "mds_client.h"
+
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/pagelist.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/debugfs.h>
  
  /*
   * A cluster of MDS (metadata server) daemons is responsible for
@@ -286,8 +290,9 @@ void ceph_put_mds_session(struct ceph_mds_session *s)
              atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
         if (atomic_dec_and_test(&s->s_ref)) {
                 if (s->s_authorizer)
-                       s->s_mdsc->client->monc.auth->ops->destroy_authorizer(
-                               s->s_mdsc->client->monc.auth, s->s_authorizer);
+                    s->s_mdsc->fsc->client->monc.auth->ops->destroy_authorizer(
+                            s->s_mdsc->fsc->client->monc.auth,
+                            s->s_authorizer);
                 kfree(s);
         }
  }
@@ -344,7 +349,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
         s->s_seq = 0;
         mutex_init(&s->s_mutex);
  
-       ceph_con_init(mdsc->client->msgr, &s->s_con);
+       ceph_con_init(mdsc->fsc->client->msgr, &s->s_con);
         s->s_con.private = s;
         s->s_con.ops = &mds_con_ops;
         s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS;
@@ -599,7 +604,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
         } else if (req->r_dentry) {
                 struct inode *dir = req->r_dentry->d_parent->d_inode;
  
-               if (dir->i_sb != mdsc->client->sb) {
+               if (dir->i_sb != mdsc->fsc->sb) {
                         /* not this fs! */
                         inode = req->r_dentry->d_inode;
                 } else if (ceph_snap(dir) != CEPH_NOSNAP) {
@@ -884,7 +889,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
         __ceph_remove_cap(cap);
         if (!__ceph_is_any_real_caps(ci)) {
                 struct ceph_mds_client *mdsc =
-                       &ceph_sb_to_client(inode->i_sb)->mdsc;
+                       ceph_sb_to_client(inode->i_sb)->mdsc;
  
                 spin_lock(&mdsc->cap_dirty_lock);
                 if (!list_empty(&ci->i_dirty_item)) {
@@ -1146,7 +1151,7 @@ int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
         struct ceph_msg *msg, *partial = NULL;
         struct ceph_mds_cap_release *head;
         int err = -ENOMEM;
-       int extra = mdsc->client->mount_args->cap_release_safety;
+       int extra = mdsc->fsc->mount_options->cap_release_safety;
         int num;
  
         dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds,
@@ -2085,7 +2090,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
  
         /* insert trace into our cache */
         mutex_lock(&req->r_fill_mutex);
-       err = ceph_fill_trace(mdsc->client->sb, req, req->r_session);
+       err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
         if (err == 0) {
                 if (result == 0 && rinfo->dir_nr)
                         ceph_readdir_prepopulate(req, req->r_session);
@@ -2361,19 +2366,35 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
  
         if (recon_state->flock) {
                 int num_fcntl_locks, num_flock_locks;
-
-               lock_kernel();
-               ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
-               rec.v2.flock_len = (2*sizeof(u32) +
-                                   (num_fcntl_locks+num_flock_locks) *
-                                   sizeof(struct ceph_filelock));
-
-               err = ceph_pagelist_append(pagelist, &rec, reclen);
-               if (!err)
-                       err = ceph_encode_locks(inode, pagelist,
-                                               num_fcntl_locks,
-                                               num_flock_locks);
-               unlock_kernel();
+               struct ceph_pagelist_cursor trunc_point;
+
+               ceph_pagelist_set_cursor(pagelist, &trunc_point);
+               do {
+                       lock_flocks();
+                       ceph_count_locks(inode, &num_fcntl_locks,
+                                        &num_flock_locks);
+                       rec.v2.flock_len = (2*sizeof(u32) +
+                                           (num_fcntl_locks+num_flock_locks) *
+                                           sizeof(struct ceph_filelock));
+                       unlock_flocks();
+
+                       /* pre-alloc pagelist */
+                       ceph_pagelist_truncate(pagelist, &trunc_point);
+                       err = ceph_pagelist_append(pagelist, &rec, reclen);
+                       if (!err)
+                               err = ceph_pagelist_reserve(pagelist,
+                                                           rec.v2.flock_len);
+
+                       /* encode locks */
+                       if (!err) {
+                               lock_flocks();
+                               err = ceph_encode_locks(inode,
+                                                       pagelist,
+                                                       num_fcntl_locks,
+                                                       num_flock_locks);
+                               unlock_flocks();
+                       }
+               } while (err == -ENOSPC);
         } else {
                 err = ceph_pagelist_append(pagelist, &rec, reclen);
         }
@@ -2613,7 +2634,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
                          struct ceph_mds_session *session,
                          struct ceph_msg *msg)
  {
-       struct super_block *sb = mdsc->client->sb;
+       struct super_block *sb = mdsc->fsc->sb;
         struct inode *inode;
         struct ceph_inode_info *ci;
         struct dentry *parent, *dentry;
@@ -2891,10 +2912,16 @@ static void delayed_work(struct work_struct *work)
         schedule_delayed(mdsc);
  }
  
+int ceph_mdsc_init(struct ceph_fs_client *fsc)
  
-int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
  {
-       mdsc->client = client;
+       struct ceph_mds_client *mdsc;
+
+       mdsc = kzalloc(sizeof(struct ceph_mds_client), GFP_NOFS);
+       if (!mdsc)
+               return -ENOMEM;
+       mdsc->fsc = fsc;
+       fsc->mdsc = mdsc;
         mutex_init(&mdsc->mutex);
         mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
         if (mdsc->mdsmap == NULL)
@@ -2927,7 +2954,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
         INIT_LIST_HEAD(&mdsc->dentry_lru);
  
         ceph_caps_init(mdsc);
-       ceph_adjust_min_caps(mdsc, client->min_caps);
+       ceph_adjust_min_caps(mdsc, fsc->min_caps);
  
         return 0;
  }
@@ -2939,7 +2966,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
  static void wait_requests(struct ceph_mds_client *mdsc)
  {
         struct ceph_mds_request *req;
-       struct ceph_client *client = mdsc->client;
+       struct ceph_fs_client *fsc = mdsc->fsc;
  
         mutex_lock(&mdsc->mutex);
         if (__get_oldest_req(mdsc)) {
@@ -2947,7 +2974,7 @@ static void wait_requests(struct ceph_mds_client *mdsc)
  
                 dout("wait_requests waiting for requests\n");
                 wait_for_completion_timeout(&mdsc->safe_umount_waiters,
-                                   client->mount_args->mount_timeout * HZ);
+                                   fsc->client->options->mount_timeout * HZ);
  
                 /* tear down remaining requests */
                 mutex_lock(&mdsc->mutex);
@@ -3030,7 +3057,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
  {
         u64 want_tid, want_flush;
  
-       if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN)
+       if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
                 return;
  
         dout("sync\n");
@@ -3053,7 +3080,7 @@ bool done_closing_sessions(struct ceph_mds_client *mdsc)
  {
         int i, n = 0;
  
-       if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN)
+       if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
                 return true;
  
         mutex_lock(&mdsc->mutex);
@@ -3071,8 +3098,8 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
  {
         struct ceph_mds_session *session;
         int i;
-       struct ceph_client *client = mdsc->client;
-       unsigned long timeout = client->mount_args->mount_timeout * HZ;
+       struct ceph_fs_client *fsc = mdsc->fsc;
+       unsigned long timeout = fsc->client->options->mount_timeout * HZ;
  
         dout("close_sessions\n");
  
@@ -3119,7 +3146,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
         dout("stopped\n");
  }
  
-void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
+static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
  {
         dout("stop\n");
         cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
@@ -3129,6 +3156,15 @@ void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
         ceph_caps_finalize(mdsc);
  }
  
+void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
+{
+       struct ceph_mds_client *mdsc = fsc->mdsc;
+
+       ceph_mdsc_stop(mdsc);
+       fsc->mdsc = NULL;
+       kfree(mdsc);
+}
+
  
  /*
   * handle mds map update.
@@ -3145,14 +3181,14 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
  
         ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad);
         ceph_decode_copy(&p, &fsid, sizeof(fsid));
-       if (ceph_check_fsid(mdsc->client, &fsid) < 0)
+       if (ceph_check_fsid(mdsc->fsc->client, &fsid) < 0)
                 return;
         epoch = ceph_decode_32(&p);
         maplen = ceph_decode_32(&p);
         dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
  
         /* do we need it? */
-       ceph_monc_got_mdsmap(&mdsc->client->monc, epoch);
+       ceph_monc_got_mdsmap(&mdsc->fsc->client->monc, epoch);
         mutex_lock(&mdsc->mutex);
         if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
                 dout("handle_map epoch %u <= our %u\n",
@@ -3176,7 +3212,7 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
         } else {
                 mdsc->mdsmap = newmap;  /* first mds map */
         }
-       mdsc->client->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;
+       mdsc->fsc->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;
  
         __wake_requests(mdsc, &mdsc->waiting_for_map);
  
@@ -3277,7 +3313,7 @@ static int get_authorizer(struct ceph_connection *con,
  {
         struct ceph_mds_session *s = con->private;
         struct ceph_mds_client *mdsc = s->s_mdsc;
-       struct ceph_auth_client *ac = mdsc->client->monc.auth;
+       struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
         int ret = 0;
  
         if (force_new && s->s_authorizer) {
@@ -3311,7 +3347,7 @@ static int verify_authorizer_reply(struct ceph_connection *con, int len)
  {
         struct ceph_mds_session *s = con->private;
         struct ceph_mds_client *mdsc = s->s_mdsc;
-       struct ceph_auth_client *ac = mdsc->client->monc.auth;
+       struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
  
         return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len);
  }
@@ -3320,12 +3356,12 @@ static int invalidate_authorizer(struct ceph_connection *con)
  {
         struct ceph_mds_session *s = con->private;
         struct ceph_mds_client *mdsc = s->s_mdsc;
-       struct ceph_auth_client *ac = mdsc->client->monc.auth;
+       struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
  
         if (ac->ops->invalidate_authorizer)
                 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS);
  
-       return ceph_monc_validate_auth(&mdsc->client->monc);
+       return ceph_monc_validate_auth(&mdsc->fsc->client->monc);
  }
  
  static const struct ceph_connection_operations mds_con_ops = {
@@ -3338,7 +3374,4 @@ static const struct ceph_connection_operations mds_con_ops = {
         .peer_reset = peer_reset,
  };
  
-
-
-
  /* eof */
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h

index c98267c..d66d63c 100644 (file)
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -8,9 +8,9 @@
  #include <linux/rbtree.h>
  #include <linux/spinlock.h>
  
-#include "types.h"
-#include "messenger.h"
-#include "mdsmap.h"
+#include <linux/ceph/types.h>
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/mdsmap.h>
  
  /*
   * Some lock dependencies:
@@ -26,7 +26,7 @@
   *
   */
  
-struct ceph_client;
+struct ceph_fs_client;
  struct ceph_cap;
  
  /*
@@ -230,7 +230,7 @@ struct ceph_mds_request {
   * mds client state
   */
  struct ceph_mds_client {
-       struct ceph_client      *client;
+       struct ceph_fs_client  *fsc;
         struct mutex            mutex;         /* all nested structures */
  
         struct ceph_mdsmap      *mdsmap;
@@ -289,11 +289,6 @@ struct ceph_mds_client {
         int             caps_avail_count;    /* unused, unreserved */
         int             caps_min_count;      /* keep at least this many
                                                 (unreserved) */
-
-#ifdef CONFIG_DEBUG_FS
-       struct dentry     *debugfs_file;
-#endif
-
         spinlock_t        dentry_lru_lock;
         struct list_head  dentry_lru;
         int               num_dentry;
@@ -316,10 +311,9 @@ extern void ceph_put_mds_session(struct ceph_mds_session *s);
  extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
                              struct ceph_msg *msg, int mds);
  
-extern int ceph_mdsc_init(struct ceph_mds_client *mdsc,
-                          struct ceph_client *client);
+extern int ceph_mdsc_init(struct ceph_fs_client *fsc);
  extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
-extern void ceph_mdsc_stop(struct ceph_mds_client *mdsc);
+extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc);
  
  extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
  
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c

index 040be6d..73b7d44 100644 (file)
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
  
  #include <linux/bug.h>
  #include <linux/err.h>
@@ -6,9 +6,9 @@
  #include <linux/slab.h>
  #include <linux/types.h>
  
-#include "mdsmap.h"
-#include "messenger.h"
-#include "decode.h"
+#include <linux/ceph/mdsmap.h>
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/decode.h>
  
  #include "super.h"
  
@@ -117,7 +117,8 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
                 }
  
                 dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
-                    i+1, n, global_id, mds, inc, pr_addr(&addr.in_addr),
+                    i+1, n, global_id, mds, inc,
+                    ceph_pr_addr(&addr.in_addr),
                      ceph_mds_state_name(state));
                 if (mds >= 0 && mds < m->m_max_mds && state > 0) {
                         m->m_info[mds].global_id = global_id;
diff --git a/fs/ceph/mdsmap.h b/fs/ceph/mdsmap.h

deleted file mode 100644 (file)

index 4c5cb08..0000000
--- a/fs/ceph/mdsmap.h
+++ /dev/null
@@ -1,62 +0,0 @@
-#ifndef _FS_CEPH_MDSMAP_H
-#define _FS_CEPH_MDSMAP_H
-
-#include "types.h"
-
-/*
- * mds map - describe servers in the mds cluster.
- *
- * we limit fields to those the client actually xcares about
- */
-struct ceph_mds_info {
-       u64 global_id;
-       struct ceph_entity_addr addr;
-       s32 state;
-       int num_export_targets;
-       bool laggy;
-       u32 *export_targets;
-};
-
-struct ceph_mdsmap {
-       u32 m_epoch, m_client_epoch, m_last_failure;
-       u32 m_root;
-       u32 m_session_timeout;          /* seconds */
-       u32 m_session_autoclose;        /* seconds */
-       u64 m_max_file_size;
-       u32 m_max_mds;                  /* size of m_addr, m_state arrays */
-       struct ceph_mds_info *m_info;
-
-       /* which object pools file data can be stored in */
-       int m_num_data_pg_pools;
-       u32 *m_data_pg_pools;
-       u32 m_cas_pg_pool;
-};
-
-static inline struct ceph_entity_addr *
-ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w)
-{
-       if (w >= m->m_max_mds)
-               return NULL;
-       return &m->m_info[w].addr;
-}
-
-static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w)
-{
-       BUG_ON(w < 0);
-       if (w >= m->m_max_mds)
-               return CEPH_MDS_STATE_DNE;
-       return m->m_info[w].state;
-}
-
-static inline bool ceph_mdsmap_is_laggy(struct ceph_mdsmap *m, int w)
-{
-       if (w >= 0 && w < m->m_max_mds)
-               return m->m_info[w].laggy;
-       return false;
-}
-
-extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m);
-extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end);
-extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m);
-
-#endif
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c

deleted file mode 100644 (file)

index 2502d76..0000000
--- a/fs/ceph/messenger.c
+++ /dev/null
@@ -1,2277 +0,0 @@
-#include "ceph_debug.h"
-
-#include <linux/crc32c.h>
-#include <linux/ctype.h>
-#include <linux/highmem.h>
-#include <linux/inet.h>
-#include <linux/kthread.h>
-#include <linux/net.h>
-#include <linux/slab.h>
-#include <linux/socket.h>
-#include <linux/string.h>
-#include <net/tcp.h>
-
-#include "super.h"
-#include "messenger.h"
-#include "decode.h"
-#include "pagelist.h"
-
-/*
- * Ceph uses the messenger to exchange ceph_msg messages with other
- * hosts in the system.  The messenger provides ordered and reliable
- * delivery.  We tolerate TCP disconnects by reconnecting (with
- * exponential backoff) in the case of a fault (disconnection, bad
- * crc, protocol error).  Acks allow sent messages to be discarded by
- * the sender.
- */
-
-/* static tag bytes (protocol control messages) */
-static char tag_msg = CEPH_MSGR_TAG_MSG;
-static char tag_ack = CEPH_MSGR_TAG_ACK;
-static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
-
-#ifdef CONFIG_LOCKDEP
-static struct lock_class_key socket_class;
-#endif
-
-
-static void queue_con(struct ceph_connection *con);
-static void con_work(struct work_struct *);
-static void ceph_fault(struct ceph_connection *con);
-
-/*
- * nicely render a sockaddr as a string.
- */
-#define MAX_ADDR_STR 20
-#define MAX_ADDR_STR_LEN 60
-static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN];
-static DEFINE_SPINLOCK(addr_str_lock);
-static int last_addr_str;
-
-const char *pr_addr(const struct sockaddr_storage *ss)
-{
-       int i;
-       char *s;
-       struct sockaddr_in *in4 = (void *)ss;
-       struct sockaddr_in6 *in6 = (void *)ss;
-
-       spin_lock(&addr_str_lock);
-       i = last_addr_str++;
-       if (last_addr_str == MAX_ADDR_STR)
-               last_addr_str = 0;
-       spin_unlock(&addr_str_lock);
-       s = addr_str[i];
-
-       switch (ss->ss_family) {
-       case AF_INET:
-               snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%u", &in4->sin_addr,
-                        (unsigned int)ntohs(in4->sin_port));
-               break;
-
-       case AF_INET6:
-               snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%u", &in6->sin6_addr,
-                        (unsigned int)ntohs(in6->sin6_port));
-               break;
-
-       default:
-               sprintf(s, "(unknown sockaddr family %d)", (int)ss->ss_family);
-       }
-
-       return s;
-}
-
-static void encode_my_addr(struct ceph_messenger *msgr)
-{
-       memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr));
-       ceph_encode_addr(&msgr->my_enc_addr);
-}
-
-/*
- * work queue for all reading and writing to/from the socket.
- */
-struct workqueue_struct *ceph_msgr_wq;
-
-int __init ceph_msgr_init(void)
-{
-       ceph_msgr_wq = create_workqueue("ceph-msgr");
-       if (IS_ERR(ceph_msgr_wq)) {
-               int ret = PTR_ERR(ceph_msgr_wq);
-               pr_err("msgr_init failed to create workqueue: %d\n", ret);
-               ceph_msgr_wq = NULL;
-               return ret;
-       }
-       return 0;
-}
-
-void ceph_msgr_exit(void)
-{
-       destroy_workqueue(ceph_msgr_wq);
-}
-
-void ceph_msgr_flush(void)
-{
-       flush_workqueue(ceph_msgr_wq);
-}
-
-
-/*
- * socket callback functions
- */
-
-/* data available on socket, or listen socket received a connect */
-static void ceph_data_ready(struct sock *sk, int count_unused)
-{
-       struct ceph_connection *con =
-               (struct ceph_connection *)sk->sk_user_data;
-       if (sk->sk_state != TCP_CLOSE_WAIT) {
-               dout("ceph_data_ready on %p state = %lu, queueing work\n",
-                    con, con->state);
-               queue_con(con);
-       }
-}
-
-/* socket has buffer space for writing */
-static void ceph_write_space(struct sock *sk)
-{
-       struct ceph_connection *con =
-               (struct ceph_connection *)sk->sk_user_data;
-
-       /* only queue to workqueue if there is data we want to write. */
-       if (test_bit(WRITE_PENDING, &con->state)) {
-               dout("ceph_write_space %p queueing write work\n", con);
-               queue_con(con);
-       } else {
-               dout("ceph_write_space %p nothing to write\n", con);
-       }
-
-       /* since we have our own write_space, clear the SOCK_NOSPACE flag */
-       clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-}
-
-/* socket's state has changed */
-static void ceph_state_change(struct sock *sk)
-{
-       struct ceph_connection *con =
-               (struct ceph_connection *)sk->sk_user_data;
-
-       dout("ceph_state_change %p state = %lu sk_state = %u\n",
-            con, con->state, sk->sk_state);
-
-       if (test_bit(CLOSED, &con->state))
-               return;
-
-       switch (sk->sk_state) {
-       case TCP_CLOSE:
-               dout("ceph_state_change TCP_CLOSE\n");
-       case TCP_CLOSE_WAIT:
-               dout("ceph_state_change TCP_CLOSE_WAIT\n");
-               if (test_and_set_bit(SOCK_CLOSED, &con->state) == 0) {
-                       if (test_bit(CONNECTING, &con->state))
-                               con->error_msg = "connection failed";
-                       else
-                               con->error_msg = "socket closed";
-                       queue_con(con);
-               }
-               break;
-       case TCP_ESTABLISHED:
-               dout("ceph_state_change TCP_ESTABLISHED\n");
-               queue_con(con);
-               break;
-       }
-}
-
-/*
- * set up socket callbacks
- */
-static void set_sock_callbacks(struct socket *sock,
-                              struct ceph_connection *con)
-{
-       struct sock *sk = sock->sk;
-       sk->sk_user_data = (void *)con;
-       sk->sk_data_ready = ceph_data_ready;
-       sk->sk_write_space = ceph_write_space;
-       sk->sk_state_change = ceph_state_change;
-}
-
-
-/*
- * socket helpers
- */
-
-/*
- * initiate connection to a remote socket.
- */
-static struct socket *ceph_tcp_connect(struct ceph_connection *con)
-{
-       struct sockaddr_storage *paddr = &con->peer_addr.in_addr;
-       struct socket *sock;
-       int ret;
-
-       BUG_ON(con->sock);
-       ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM,
-                              IPPROTO_TCP, &sock);
-       if (ret)
-               return ERR_PTR(ret);
-       con->sock = sock;
-       sock->sk->sk_allocation = GFP_NOFS;
-
-#ifdef CONFIG_LOCKDEP
-       lockdep_set_class(&sock->sk->sk_lock, &socket_class);
-#endif
-
-       set_sock_callbacks(sock, con);
-
-       dout("connect %s\n", pr_addr(&con->peer_addr.in_addr));
-
-       ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr),
-                                O_NONBLOCK);
-       if (ret == -EINPROGRESS) {
-               dout("connect %s EINPROGRESS sk_state = %u\n",
-                    pr_addr(&con->peer_addr.in_addr),
-                    sock->sk->sk_state);
-               ret = 0;
-       }
-       if (ret < 0) {
-               pr_err("connect %s error %d\n",
-                      pr_addr(&con->peer_addr.in_addr), ret);
-               sock_release(sock);
-               con->sock = NULL;
-               con->error_msg = "connect error";
-       }
-
-       if (ret < 0)
-               return ERR_PTR(ret);
-       return sock;
-}
-
-static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
-{
-       struct kvec iov = {buf, len};
-       struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
-
-       return kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags);
-}
-
-/*
- * write something.  @more is true if caller will be sending more data
- * shortly.
- */
-static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
-                    size_t kvlen, size_t len, int more)
-{
-       struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
-
-       if (more)
-               msg.msg_flags |= MSG_MORE;
-       else
-               msg.msg_flags |= MSG_EOR;  /* superfluous, but what the hell */
-
-       return kernel_sendmsg(sock, &msg, iov, kvlen, len);
-}
-
-
-/*
- * Shutdown/close the socket for the given connection.
- */
-static int con_close_socket(struct ceph_connection *con)
-{
-       int rc;
-
-       dout("con_close_socket on %p sock %p\n", con, con->sock);
-       if (!con->sock)
-               return 0;
-       set_bit(SOCK_CLOSED, &con->state);
-       rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR);
-       sock_release(con->sock);
-       con->sock = NULL;
-       clear_bit(SOCK_CLOSED, &con->state);
-       return rc;
-}
-
-/*
- * Reset a connection.  Discard all incoming and outgoing messages
- * and clear *_seq state.
- */
-static void ceph_msg_remove(struct ceph_msg *msg)
-{
-       list_del_init(&msg->list_head);
-       ceph_msg_put(msg);
-}
-static void ceph_msg_remove_list(struct list_head *head)
-{
-       while (!list_empty(head)) {
-               struct ceph_msg *msg = list_first_entry(head, struct ceph_msg,
-                                                       list_head);
-               ceph_msg_remove(msg);
-       }
-}
-
-static void reset_connection(struct ceph_connection *con)
-{
-       /* reset connection, out_queue, msg_ and connect_seq */
-       /* discard existing out_queue and msg_seq */
-       ceph_msg_remove_list(&con->out_queue);
-       ceph_msg_remove_list(&con->out_sent);
-
-       if (con->in_msg) {
-               ceph_msg_put(con->in_msg);
-               con->in_msg = NULL;
-       }
-
-       con->connect_seq = 0;
-       con->out_seq = 0;
-       if (con->out_msg) {
-               ceph_msg_put(con->out_msg);
-               con->out_msg = NULL;
-       }
-       con->out_keepalive_pending = false;
-       con->in_seq = 0;
-       con->in_seq_acked = 0;
-}
-
-/*
- * mark a peer down.  drop any open connections.
- */
-void ceph_con_close(struct ceph_connection *con)
-{
-       dout("con_close %p peer %s\n", con, pr_addr(&con->peer_addr.in_addr));
-       set_bit(CLOSED, &con->state);  /* in case there's queued work */
-       clear_bit(STANDBY, &con->state);  /* avoid connect_seq bump */
-       clear_bit(LOSSYTX, &con->state);  /* so we retry next connect */
-       clear_bit(KEEPALIVE_PENDING, &con->state);
-       clear_bit(WRITE_PENDING, &con->state);
-       mutex_lock(&con->mutex);
-       reset_connection(con);
-       con->peer_global_seq = 0;
-       cancel_delayed_work(&con->work);
-       mutex_unlock(&con->mutex);
-       queue_con(con);
-}
-
-/*
- * Reopen a closed connection, with a new peer address.
- */
-void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr)
-{
-       dout("con_open %p %s\n", con, pr_addr(&addr->in_addr));
-       set_bit(OPENING, &con->state);
-       clear_bit(CLOSED, &con->state);
-       memcpy(&con->peer_addr, addr, sizeof(*addr));
-       con->delay = 0;      /* reset backoff memory */
-       queue_con(con);
-}
-
-/*
- * return true if this connection ever successfully opened
- */
-bool ceph_con_opened(struct ceph_connection *con)
-{
-       return con->connect_seq > 0;
-}
-
-/*
- * generic get/put
- */
-struct ceph_connection *ceph_con_get(struct ceph_connection *con)
-{
-       dout("con_get %p nref = %d -> %d\n", con,
-            atomic_read(&con->nref), atomic_read(&con->nref) + 1);
-       if (atomic_inc_not_zero(&con->nref))
-               return con;
-       return NULL;
-}
-
-void ceph_con_put(struct ceph_connection *con)
-{
-       dout("con_put %p nref = %d -> %d\n", con,
-            atomic_read(&con->nref), atomic_read(&con->nref) - 1);
-       BUG_ON(atomic_read(&con->nref) == 0);
-       if (atomic_dec_and_test(&con->nref)) {
-               BUG_ON(con->sock);
-               kfree(con);
-       }
-}
-
-/*
- * initialize a new connection.
- */
-void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con)
-{
-       dout("con_init %p\n", con);
-       memset(con, 0, sizeof(*con));
-       atomic_set(&con->nref, 1);
-       con->msgr = msgr;
-       mutex_init(&con->mutex);
-       INIT_LIST_HEAD(&con->out_queue);
-       INIT_LIST_HEAD(&con->out_sent);
-       INIT_DELAYED_WORK(&con->work, con_work);
-}
-
-
-/*
- * We maintain a global counter to order connection attempts.  Get
- * a unique seq greater than @gt.
- */
-static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
-{
-       u32 ret;
-
-       spin_lock(&msgr->global_seq_lock);
-       if (msgr->global_seq < gt)
-               msgr->global_seq = gt;
-       ret = ++msgr->global_seq;
-       spin_unlock(&msgr->global_seq_lock);
-       return ret;
-}
-
-
-/*
- * Prepare footer for currently outgoing message, and finish things
- * off.  Assumes out_kvec* are already valid.. we just add on to the end.
- */
-static void prepare_write_message_footer(struct ceph_connection *con, int v)
-{
-       struct ceph_msg *m = con->out_msg;
-
-       dout("prepare_write_message_footer %p\n", con);
-       con->out_kvec_is_msg = true;
-       con->out_kvec[v].iov_base = &m->footer;
-       con->out_kvec[v].iov_len = sizeof(m->footer);
-       con->out_kvec_bytes += sizeof(m->footer);
-       con->out_kvec_left++;
-       con->out_more = m->more_to_follow;
-       con->out_msg_done = true;
-}
-
-/*
- * Prepare headers for the next outgoing message.
- */
-static void prepare_write_message(struct ceph_connection *con)
-{
-       struct ceph_msg *m;
-       int v = 0;
-
-       con->out_kvec_bytes = 0;
-       con->out_kvec_is_msg = true;
-       con->out_msg_done = false;
-
-       /* Sneak an ack in there first?  If we can get it into the same
-        * TCP packet that's a good thing. */
-       if (con->in_seq > con->in_seq_acked) {
-               con->in_seq_acked = con->in_seq;
-               con->out_kvec[v].iov_base = &tag_ack;
-               con->out_kvec[v++].iov_len = 1;
-               con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
-               con->out_kvec[v].iov_base = &con->out_temp_ack;
-               con->out_kvec[v++].iov_len = sizeof(con->out_temp_ack);
-               con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
-       }
-
-       m = list_first_entry(&con->out_queue,
-                      struct ceph_msg, list_head);
-       con->out_msg = m;
-       if (test_bit(LOSSYTX, &con->state)) {
-               list_del_init(&m->list_head);
-       } else {
-               /* put message on sent list */
-               ceph_msg_get(m);
-               list_move_tail(&m->list_head, &con->out_sent);
-       }
-
-       /*
-        * only assign outgoing seq # if we haven't sent this message
-        * yet.  if it is requeued, resend with it's original seq.
-        */
-       if (m->needs_out_seq) {
-               m->hdr.seq = cpu_to_le64(++con->out_seq);
-               m->needs_out_seq = false;
-       }
-
-       dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n",
-            m, con->out_seq, le16_to_cpu(m->hdr.type),
-            le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
-            le32_to_cpu(m->hdr.data_len),
-            m->nr_pages);
-       BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len);
-
-       /* tag + hdr + front + middle */
-       con->out_kvec[v].iov_base = &tag_msg;
-       con->out_kvec[v++].iov_len = 1;
-       con->out_kvec[v].iov_base = &m->hdr;
-       con->out_kvec[v++].iov_len = sizeof(m->hdr);
-       con->out_kvec[v++] = m->front;
-       if (m->middle)
-               con->out_kvec[v++] = m->middle->vec;
-       con->out_kvec_left = v;
-       con->out_kvec_bytes += 1 + sizeof(m->hdr) + m->front.iov_len +
-               (m->middle ? m->middle->vec.iov_len : 0);
-       con->out_kvec_cur = con->out_kvec;
-
-       /* fill in crc (except data pages), footer */
-       con->out_msg->hdr.crc =
-               cpu_to_le32(crc32c(0, (void *)&m->hdr,
-                                     sizeof(m->hdr) - sizeof(m->hdr.crc)));
-       con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE;
-       con->out_msg->footer.front_crc =
-               cpu_to_le32(crc32c(0, m->front.iov_base, m->front.iov_len));
-       if (m->middle)
-               con->out_msg->footer.middle_crc =
-                       cpu_to_le32(crc32c(0, m->middle->vec.iov_base,
-                                          m->middle->vec.iov_len));
-       else
-               con->out_msg->footer.middle_crc = 0;
-       con->out_msg->footer.data_crc = 0;
-       dout("prepare_write_message front_crc %u data_crc %u\n",
-            le32_to_cpu(con->out_msg->footer.front_crc),
-            le32_to_cpu(con->out_msg->footer.middle_crc));
-
-       /* is there a data payload? */
-       if (le32_to_cpu(m->hdr.data_len) > 0) {
-               /* initialize page iterator */
-               con->out_msg_pos.page = 0;
-               con->out_msg_pos.page_pos =
-                       le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK;
-               con->out_msg_pos.data_pos = 0;
-               con->out_msg_pos.did_page_crc = 0;
-               con->out_more = 1;  /* data + footer will follow */
-       } else {
-               /* no, queue up footer too and be done */
-               prepare_write_message_footer(con, v);
-       }
-
-       set_bit(WRITE_PENDING, &con->state);
-}
-
-/*
- * Prepare an ack.
- */
-static void prepare_write_ack(struct ceph_connection *con)
-{
-       dout("prepare_write_ack %p %llu -> %llu\n", con,
-            con->in_seq_acked, con->in_seq);
-       con->in_seq_acked = con->in_seq;
-
-       con->out_kvec[0].iov_base = &tag_ack;
-       con->out_kvec[0].iov_len = 1;
-       con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
-       con->out_kvec[1].iov_base = &con->out_temp_ack;
-       con->out_kvec[1].iov_len = sizeof(con->out_temp_ack);
-       con->out_kvec_left = 2;
-       con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
-       con->out_kvec_cur = con->out_kvec;
-       con->out_more = 1;  /* more will follow.. eventually.. */
-       set_bit(WRITE_PENDING, &con->state);
-}
-
-/*
- * Prepare to write keepalive byte.
- */
-static void prepare_write_keepalive(struct ceph_connection *con)
-{
-       dout("prepare_write_keepalive %p\n", con);
-       con->out_kvec[0].iov_base = &tag_keepalive;
-       con->out_kvec[0].iov_len = 1;
-       con->out_kvec_left = 1;
-       con->out_kvec_bytes = 1;
-       con->out_kvec_cur = con->out_kvec;
-       set_bit(WRITE_PENDING, &con->state);
-}
-
-/*
- * Connection negotiation.
- */
-
-static void prepare_connect_authorizer(struct ceph_connection *con)
-{
-       void *auth_buf;
-       int auth_len = 0;
-       int auth_protocol = 0;
-
-       mutex_unlock(&con->mutex);
-       if (con->ops->get_authorizer)
-               con->ops->get_authorizer(con, &auth_buf, &auth_len,
-                                        &auth_protocol, &con->auth_reply_buf,
-                                        &con->auth_reply_buf_len,
-                                        con->auth_retry);
-       mutex_lock(&con->mutex);
-
-       con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol);
-       con->out_connect.authorizer_len = cpu_to_le32(auth_len);
-
-       con->out_kvec[con->out_kvec_left].iov_base = auth_buf;
-       con->out_kvec[con->out_kvec_left].iov_len = auth_len;
-       con->out_kvec_left++;
-       con->out_kvec_bytes += auth_len;
-}
-
-/*
- * We connected to a peer and are saying hello.
- */
-static void prepare_write_banner(struct ceph_messenger *msgr,
-                                struct ceph_connection *con)
-{
-       int len = strlen(CEPH_BANNER);
-
-       con->out_kvec[0].iov_base = CEPH_BANNER;
-       con->out_kvec[0].iov_len = len;
-       con->out_kvec[1].iov_base = &msgr->my_enc_addr;
-       con->out_kvec[1].iov_len = sizeof(msgr->my_enc_addr);
-       con->out_kvec_left = 2;
-       con->out_kvec_bytes = len + sizeof(msgr->my_enc_addr);
-       con->out_kvec_cur = con->out_kvec;
-       con->out_more = 0;
-       set_bit(WRITE_PENDING, &con->state);
-}
-
-static void prepare_write_connect(struct ceph_messenger *msgr,
-                                 struct ceph_connection *con,
-                                 int after_banner)
-{
-       unsigned global_seq = get_global_seq(con->msgr, 0);
-       int proto;
-
-       switch (con->peer_name.type) {
-       case CEPH_ENTITY_TYPE_MON:
-               proto = CEPH_MONC_PROTOCOL;
-               break;
-       case CEPH_ENTITY_TYPE_OSD:
-               proto = CEPH_OSDC_PROTOCOL;
-               break;
-       case CEPH_ENTITY_TYPE_MDS:
-               proto = CEPH_MDSC_PROTOCOL;
-               break;
-       default:
-               BUG();
-       }
-
-       dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
-            con->connect_seq, global_seq, proto);
-
-       con->out_connect.features = cpu_to_le64(CEPH_FEATURE_SUPPORTED);
-       con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
-       con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
-       con->out_connect.global_seq = cpu_to_le32(global_seq);
-       con->out_connect.protocol_version = cpu_to_le32(proto);
-       con->out_connect.flags = 0;
-
-       if (!after_banner) {
-               con->out_kvec_left = 0;
-               con->out_kvec_bytes = 0;
-       }
-       con->out_kvec[con->out_kvec_left].iov_base = &con->out_connect;
-       con->out_kvec[con->out_kvec_left].iov_len = sizeof(con->out_connect);
-       con->out_kvec_left++;
-       con->out_kvec_bytes += sizeof(con->out_connect);
-       con->out_kvec_cur = con->out_kvec;
-       con->out_more = 0;
-       set_bit(WRITE_PENDING, &con->state);
-
-       prepare_connect_authorizer(con);
-}
-
-
-/*
- * write as much of pending kvecs to the socket as we can.
- *  1 -> done
- *  0 -> socket full, but more to do
- * <0 -> error
- */
-static int write_partial_kvec(struct ceph_connection *con)
-{
-       int ret;
-
-       dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes);
-       while (con->out_kvec_bytes > 0) {
-               ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur,
-                                      con->out_kvec_left, con->out_kvec_bytes,
-                                      con->out_more);
-               if (ret <= 0)
-                       goto out;
-               con->out_kvec_bytes -= ret;
-               if (con->out_kvec_bytes == 0)
-                       break;            /* done */
-               while (ret > 0) {
-                       if (ret >= con->out_kvec_cur->iov_len) {
-                               ret -= con->out_kvec_cur->iov_len;
-                               con->out_kvec_cur++;
-                               con->out_kvec_left--;
-                       } else {
-                               con->out_kvec_cur->iov_len -= ret;
-                               con->out_kvec_cur->iov_base += ret;
-                               ret = 0;
-                               break;
-                       }
-               }
-       }
-       con->out_kvec_left = 0;
-       con->out_kvec_is_msg = false;
-       ret = 1;
-out:
-       dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
-            con->out_kvec_bytes, con->out_kvec_left, ret);
-       return ret;  /* done! */
-}
-
-/*
- * Write as much message data payload as we can.  If we finish, queue
- * up the footer.
- *  1 -> done, footer is now queued in out_kvec[].
- *  0 -> socket full, but more to do
- * <0 -> error
- */
-static int write_partial_msg_pages(struct ceph_connection *con)
-{
-       struct ceph_msg *msg = con->out_msg;
-       unsigned data_len = le32_to_cpu(msg->hdr.data_len);
-       size_t len;
-       int crc = con->msgr->nocrc;
-       int ret;
-
-       dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n",
-            con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages,
-            con->out_msg_pos.page_pos);
-
-       while (con->out_msg_pos.page < con->out_msg->nr_pages) {
-               struct page *page = NULL;
-               void *kaddr = NULL;
-
-               /*
-                * if we are calculating the data crc (the default), we need
-                * to map the page.  if our pages[] has been revoked, use the
-                * zero page.
-                */
-               if (msg->pages) {
-                       page = msg->pages[con->out_msg_pos.page];
-                       if (crc)
-                               kaddr = kmap(page);
-               } else if (msg->pagelist) {
-                       page = list_first_entry(&msg->pagelist->head,
-                                               struct page, lru);
-                       if (crc)
-                               kaddr = kmap(page);
-               } else {
-                       page = con->msgr->zero_page;
-                       if (crc)
-                               kaddr = page_address(con->msgr->zero_page);
-               }
-               len = min((int)(PAGE_SIZE - con->out_msg_pos.page_pos),
-                         (int)(data_len - con->out_msg_pos.data_pos));
-               if (crc && !con->out_msg_pos.did_page_crc) {
-                       void *base = kaddr + con->out_msg_pos.page_pos;
-                       u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc);
-
-                       BUG_ON(kaddr == NULL);
-                       con->out_msg->footer.data_crc =
-                               cpu_to_le32(crc32c(tmpcrc, base, len));
-                       con->out_msg_pos.did_page_crc = 1;
-               }
-
-               ret = kernel_sendpage(con->sock, page,
-                                     con->out_msg_pos.page_pos, len,
-                                     MSG_DONTWAIT | MSG_NOSIGNAL |
-                                     MSG_MORE);
-
-               if (crc && (msg->pages || msg->pagelist))
-                       kunmap(page);
-
-               if (ret <= 0)
-                       goto out;
-
-               con->out_msg_pos.data_pos += ret;
-               con->out_msg_pos.page_pos += ret;
-               if (ret == len) {
-                       con->out_msg_pos.page_pos = 0;
-                       con->out_msg_pos.page++;
-                       con->out_msg_pos.did_page_crc = 0;
-                       if (msg->pagelist)
-                               list_move_tail(&page->lru,
-                                              &msg->pagelist->head);
-               }
-       }
-
-       dout("write_partial_msg_pages %p msg %p done\n", con, msg);
-
-       /* prepare and queue up footer, too */
-       if (!crc)
-               con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
-       con->out_kvec_bytes = 0;
-       con->out_kvec_left = 0;
-       con->out_kvec_cur = con->out_kvec;
-       prepare_write_message_footer(con, 0);
-       ret = 1;
-out:
-       return ret;
-}
-
-/*
- * write some zeros
- */
-static int write_partial_skip(struct ceph_connection *con)
-{
-       int ret;
-
-       while (con->out_skip > 0) {
-               struct kvec iov = {
-                       .iov_base = page_address(con->msgr->zero_page),
-                       .iov_len = min(con->out_skip, (int)PAGE_CACHE_SIZE)
-               };
-
-               ret = ceph_tcp_sendmsg(con->sock, &iov, 1, iov.iov_len, 1);
-               if (ret <= 0)
-                       goto out;
-               con->out_skip -= ret;
-       }
-       ret = 1;
-out:
-       return ret;
-}
-
-/*
- * Prepare to read connection handshake, or an ack.
- */
-static void prepare_read_banner(struct ceph_connection *con)
-{
-       dout("prepare_read_banner %p\n", con);
-       con->in_base_pos = 0;
-}
-
-static void prepare_read_connect(struct ceph_connection *con)
-{
-       dout("prepare_read_connect %p\n", con);
-       con->in_base_pos = 0;
-}
-
-static void prepare_read_ack(struct ceph_connection *con)
-{
-       dout("prepare_read_ack %p\n", con);
-       con->in_base_pos = 0;
-}
-
-static void prepare_read_tag(struct ceph_connection *con)
-{
-       dout("prepare_read_tag %p\n", con);
-       con->in_base_pos = 0;
-       con->in_tag = CEPH_MSGR_TAG_READY;
-}
-
-/*
- * Prepare to read a message.
- */
-static int prepare_read_message(struct ceph_connection *con)
-{
-       dout("prepare_read_message %p\n", con);
-       BUG_ON(con->in_msg != NULL);
-       con->in_base_pos = 0;
-       con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0;
-       return 0;
-}
-
-
-static int read_partial(struct ceph_connection *con,
-                       int *to, int size, void *object)
-{
-       *to += size;
-       while (con->in_base_pos < *to) {
-               int left = *to - con->in_base_pos;
-               int have = size - left;
-               int ret = ceph_tcp_recvmsg(con->sock, object + have, left);
-               if (ret <= 0)
-                       return ret;
-               con->in_base_pos += ret;
-       }
-       return 1;
-}
-
-
-/*
- * Read all or part of the connect-side handshake on a new connection
- */
-static int read_partial_banner(struct ceph_connection *con)
-{
-       int ret, to = 0;
-
-       dout("read_partial_banner %p at %d\n", con, con->in_base_pos);
-
-       /* peer's banner */
-       ret = read_partial(con, &to, strlen(CEPH_BANNER), con->in_banner);
-       if (ret <= 0)
-               goto out;
-       ret = read_partial(con, &to, sizeof(con->actual_peer_addr),
-                          &con->actual_peer_addr);
-       if (ret <= 0)
-               goto out;
-       ret = read_partial(con, &to, sizeof(con->peer_addr_for_me),
-                          &con->peer_addr_for_me);
-       if (ret <= 0)
-               goto out;
-out:
-       return ret;
-}
-
-static int read_partial_connect(struct ceph_connection *con)
-{
-       int ret, to = 0;
-
-       dout("read_partial_connect %p at %d\n", con, con->in_base_pos);
-
-       ret = read_partial(con, &to, sizeof(con->in_reply), &con->in_reply);
-       if (ret <= 0)
-               goto out;
-       ret = read_partial(con, &to, le32_to_cpu(con->in_reply.authorizer_len),
-                          con->auth_reply_buf);
-       if (ret <= 0)
-               goto out;
-
-       dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n",
-            con, (int)con->in_reply.tag,
-            le32_to_cpu(con->in_reply.connect_seq),
-            le32_to_cpu(con->in_reply.global_seq));
-out:
-       return ret;
-
-}
-
-/*
- * Verify the hello banner looks okay.
- */
-static int verify_hello(struct ceph_connection *con)
-{
-       if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
-               pr_err("connect to %s got bad banner\n",
-                      pr_addr(&con->peer_addr.in_addr));
-               con->error_msg = "protocol error, bad banner";
-               return -1;
-       }
-       return 0;
-}
-
-static bool addr_is_blank(struct sockaddr_storage *ss)
-{
-       switch (ss->ss_family) {
-       case AF_INET:
-               return ((struct sockaddr_in *)ss)->sin_addr.s_addr == 0;
-       case AF_INET6:
-               return
-                    ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[0] == 0 &&
-                    ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[1] == 0 &&
-                    ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[2] == 0 &&
-                    ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[3] == 0;
-       }
-       return false;
-}
-
-static int addr_port(struct sockaddr_storage *ss)
-{
-       switch (ss->ss_family) {
-       case AF_INET:
-               return ntohs(((struct sockaddr_in *)ss)->sin_port);
-       case AF_INET6:
-               return ntohs(((struct sockaddr_in6 *)ss)->sin6_port);
-       }
-       return 0;
-}
-
-static void addr_set_port(struct sockaddr_storage *ss, int p)
-{
-       switch (ss->ss_family) {
-       case AF_INET:
-               ((struct sockaddr_in *)ss)->sin_port = htons(p);
-       case AF_INET6:
-               ((struct sockaddr_in6 *)ss)->sin6_port = htons(p);
-       }
-}
-
-/*
- * Parse an ip[:port] list into an addr array.  Use the default
- * monitor port if a port isn't specified.
- */
-int ceph_parse_ips(const char *c, const char *end,
-                  struct ceph_entity_addr *addr,
-                  int max_count, int *count)
-{
-       int i;
-       const char *p = c;
-
-       dout("parse_ips on '%.*s'\n", (int)(end-c), c);
-       for (i = 0; i < max_count; i++) {
-               const char *ipend;
-               struct sockaddr_storage *ss = &addr[i].in_addr;
-               struct sockaddr_in *in4 = (void *)ss;
-               struct sockaddr_in6 *in6 = (void *)ss;
-               int port;
-               char delim = ',';
-
-               if (*p == '[') {
-                       delim = ']';
-                       p++;
-               }
-
-               memset(ss, 0, sizeof(*ss));
-               if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr,
-                            delim, &ipend))
-                       ss->ss_family = AF_INET;
-               else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr,
-                                 delim, &ipend))
-                       ss->ss_family = AF_INET6;
-               else
-                       goto bad;
-               p = ipend;
-
-               if (delim == ']') {
-                       if (*p != ']') {
-                               dout("missing matching ']'\n");
-                               goto bad;
-                       }
-                       p++;
-               }
-
-               /* port? */
-               if (p < end && *p == ':') {
-                       port = 0;
-                       p++;
-                       while (p < end && *p >= '0' && *p <= '9') {
-                               port = (port * 10) + (*p - '0');
-                               p++;
-                       }
-                       if (port > 65535 || port == 0)
-                               goto bad;
-               } else {
-                       port = CEPH_MON_PORT;
-               }
-
-               addr_set_port(ss, port);
-
-               dout("parse_ips got %s\n", pr_addr(ss));
-
-               if (p == end)
-                       break;
-               if (*p != ',')
-                       goto bad;
-               p++;
-       }
-
-       if (p != end)
-               goto bad;
-
-       if (count)
-               *count = i + 1;
-       return 0;
-
-bad:
-       pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c);
-       return -EINVAL;
-}
-
-static int process_banner(struct ceph_connection *con)
-{
-       dout("process_banner on %p\n", con);
-
-       if (verify_hello(con) < 0)
-               return -1;
-
-       ceph_decode_addr(&con->actual_peer_addr);
-       ceph_decode_addr(&con->peer_addr_for_me);
-
-       /*
-        * Make sure the other end is who we wanted.  note that the other
-        * end may not yet know their ip address, so if it's 0.0.0.0, give
-        * them the benefit of the doubt.
-        */
-       if (memcmp(&con->peer_addr, &con->actual_peer_addr,
-                  sizeof(con->peer_addr)) != 0 &&
-           !(addr_is_blank(&con->actual_peer_addr.in_addr) &&
-             con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
-               pr_warning("wrong peer, want %s/%d, got %s/%d\n",
-                          pr_addr(&con->peer_addr.in_addr),
-                          (int)le32_to_cpu(con->peer_addr.nonce),
-                          pr_addr(&con->actual_peer_addr.in_addr),
-                          (int)le32_to_cpu(con->actual_peer_addr.nonce));
-               con->error_msg = "wrong peer at address";
-               return -1;
-       }
-
-       /*
-        * did we learn our address?
-        */
-       if (addr_is_blank(&con->msgr->inst.addr.in_addr)) {
-               int port = addr_port(&con->msgr->inst.addr.in_addr);
-
-               memcpy(&con->msgr->inst.addr.in_addr,
-                      &con->peer_addr_for_me.in_addr,
-                      sizeof(con->peer_addr_for_me.in_addr));
-               addr_set_port(&con->msgr->inst.addr.in_addr, port);
-               encode_my_addr(con->msgr);
-               dout("process_banner learned my addr is %s\n",
-                    pr_addr(&con->msgr->inst.addr.in_addr));
-       }
-
-       set_bit(NEGOTIATING, &con->state);
-       prepare_read_connect(con);
-       return 0;
-}
-
-static void fail_protocol(struct ceph_connection *con)
-{
-       reset_connection(con);
-       set_bit(CLOSED, &con->state);  /* in case there's queued work */
-
-       mutex_unlock(&con->mutex);
-       if (con->ops->bad_proto)
-               con->ops->bad_proto(con);
-       mutex_lock(&con->mutex);
-}
-
-static int process_connect(struct ceph_connection *con)
-{
-       u64 sup_feat = CEPH_FEATURE_SUPPORTED;
-       u64 req_feat = CEPH_FEATURE_REQUIRED;
-       u64 server_feat = le64_to_cpu(con->in_reply.features);
-
-       dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
-
-       switch (con->in_reply.tag) {
-       case CEPH_MSGR_TAG_FEATURES:
-               pr_err("%s%lld %s feature set mismatch,"
-                      " my %llx < server's %llx, missing %llx\n",
-                      ENTITY_NAME(con->peer_name),
-                      pr_addr(&con->peer_addr.in_addr),
-                      sup_feat, server_feat, server_feat & ~sup_feat);
-               con->error_msg = "missing required protocol features";
-               fail_protocol(con);
-               return -1;
-
-       case CEPH_MSGR_TAG_BADPROTOVER:
-               pr_err("%s%lld %s protocol version mismatch,"
-                      " my %d != server's %d\n",
-                      ENTITY_NAME(con->peer_name),
-                      pr_addr(&con->peer_addr.in_addr),
-                      le32_to_cpu(con->out_connect.protocol_version),
-                      le32_to_cpu(con->in_reply.protocol_version));
-               con->error_msg = "protocol version mismatch";
-               fail_protocol(con);
-               return -1;
-
-       case CEPH_MSGR_TAG_BADAUTHORIZER:
-               con->auth_retry++;
-               dout("process_connect %p got BADAUTHORIZER attempt %d\n", con,
-                    con->auth_retry);
-               if (con->auth_retry == 2) {
-                       con->error_msg = "connect authorization failure";
-                       reset_connection(con);
-                       set_bit(CLOSED, &con->state);
-                       return -1;
-               }
-               con->auth_retry = 1;
-               prepare_write_connect(con->msgr, con, 0);
-               prepare_read_connect(con);
-               break;
-
-       case CEPH_MSGR_TAG_RESETSESSION:
-               /*
-                * If we connected with a large connect_seq but the peer
-                * has no record of a session with us (no connection, or
-                * connect_seq == 0), they will send RESETSESION to indicate
-                * that they must have reset their session, and may have
-                * dropped messages.
-                */
-               dout("process_connect got RESET peer seq %u\n",
-                    le32_to_cpu(con->in_connect.connect_seq));
-               pr_err("%s%lld %s connection reset\n",
-                      ENTITY_NAME(con->peer_name),
-                      pr_addr(&con->peer_addr.in_addr));
-               reset_connection(con);
-               prepare_write_connect(con->msgr, con, 0);
-               prepare_read_connect(con);
-
-               /* Tell ceph about it. */
-               mutex_unlock(&con->mutex);
-               pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name));
-               if (con->ops->peer_reset)
-                       con->ops->peer_reset(con);
-               mutex_lock(&con->mutex);
-               break;
-
-       case CEPH_MSGR_TAG_RETRY_SESSION:
-               /*
-                * If we sent a smaller connect_seq than the peer has, try
-                * again with a larger value.
-                */
-               dout("process_connect got RETRY my seq = %u, peer_seq = %u\n",
-                    le32_to_cpu(con->out_connect.connect_seq),
-                    le32_to_cpu(con->in_connect.connect_seq));
-               con->connect_seq = le32_to_cpu(con->in_connect.connect_seq);
-               prepare_write_connect(con->msgr, con, 0);
-               prepare_read_connect(con);
-               break;
-
-       case CEPH_MSGR_TAG_RETRY_GLOBAL:
-               /*
-                * If we sent a smaller global_seq than the peer has, try
-                * again with a larger value.
-                */
-               dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
-                    con->peer_global_seq,
-                    le32_to_cpu(con->in_connect.global_seq));
-               get_global_seq(con->msgr,
-                              le32_to_cpu(con->in_connect.global_seq));
-               prepare_write_connect(con->msgr, con, 0);
-               prepare_read_connect(con);
-               break;
-
-       case CEPH_MSGR_TAG_READY:
-               if (req_feat & ~server_feat) {
-                       pr_err("%s%lld %s protocol feature mismatch,"
-                              " my required %llx > server's %llx, need %llx\n",
-                              ENTITY_NAME(con->peer_name),
-                              pr_addr(&con->peer_addr.in_addr),
-                              req_feat, server_feat, req_feat & ~server_feat);
-                       con->error_msg = "missing required protocol features";
-                       fail_protocol(con);
-                       return -1;
-               }
-               clear_bit(CONNECTING, &con->state);
-               con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
-               con->connect_seq++;
-               con->peer_features = server_feat;
-               dout("process_connect got READY gseq %d cseq %d (%d)\n",
-                    con->peer_global_seq,
-                    le32_to_cpu(con->in_reply.connect_seq),
-                    con->connect_seq);
-               WARN_ON(con->connect_seq !=
-                       le32_to_cpu(con->in_reply.connect_seq));
-
-               if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
-                       set_bit(LOSSYTX, &con->state);
-
-               prepare_read_tag(con);
-               break;
-
-       case CEPH_MSGR_TAG_WAIT:
-               /*
-                * If there is a connection race (we are opening
-                * connections to each other), one of us may just have
-                * to WAIT.  This shouldn't happen if we are the
-                * client.
-                */
-               pr_err("process_connect peer connecting WAIT\n");
-
-       default:
-               pr_err("connect protocol error, will retry\n");
-               con->error_msg = "protocol error, garbage tag during connect";
-               return -1;
-       }
-       return 0;
-}
-
-
-/*
- * read (part of) an ack
- */
-static int read_partial_ack(struct ceph_connection *con)
-{
-       int to = 0;
-
-       return read_partial(con, &to, sizeof(con->in_temp_ack),
-                           &con->in_temp_ack);
-}
-
-
-/*
- * We can finally discard anything that's been acked.
- */
-static void process_ack(struct ceph_connection *con)
-{
-       struct ceph_msg *m;
-       u64 ack = le64_to_cpu(con->in_temp_ack);
-       u64 seq;
-
-       while (!list_empty(&con->out_sent)) {
-               m = list_first_entry(&con->out_sent, struct ceph_msg,
-                                    list_head);
-               seq = le64_to_cpu(m->hdr.seq);
-               if (seq > ack)
-                       break;
-               dout("got ack for seq %llu type %d at %p\n", seq,
-                    le16_to_cpu(m->hdr.type), m);
-               ceph_msg_remove(m);
-       }
-       prepare_read_tag(con);
-}
-
-
-
-
-static int read_partial_message_section(struct ceph_connection *con,
-                                       struct kvec *section,
-                                       unsigned int sec_len, u32 *crc)
-{
-       int left;
-       int ret;
-
-       BUG_ON(!section);
-
-       while (section->iov_len < sec_len) {
-               BUG_ON(section->iov_base == NULL);
-               left = sec_len - section->iov_len;
-               ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base +
-                                      section->iov_len, left);
-               if (ret <= 0)
-                       return ret;
-               section->iov_len += ret;
-               if (section->iov_len == sec_len)
-                       *crc = crc32c(0, section->iov_base,
-                                     section->iov_len);
-       }
-
-       return 1;
-}
-
-static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
-                               struct ceph_msg_header *hdr,
-                               int *skip);
-/*
- * read (part of) a message.
- */
-static int read_partial_message(struct ceph_connection *con)
-{
-       struct ceph_msg *m = con->in_msg;
-       void *p;
-       int ret;
-       int to, left;
-       unsigned front_len, middle_len, data_len, data_off;
-       int datacrc = con->msgr->nocrc;
-       int skip;
-       u64 seq;
-
-       dout("read_partial_message con %p msg %p\n", con, m);
-
-       /* header */
-       while (con->in_base_pos < sizeof(con->in_hdr)) {
-               left = sizeof(con->in_hdr) - con->in_base_pos;
-               ret = ceph_tcp_recvmsg(con->sock,
-                                      (char *)&con->in_hdr + con->in_base_pos,
-                                      left);
-               if (ret <= 0)
-                       return ret;
-               con->in_base_pos += ret;
-               if (con->in_base_pos == sizeof(con->in_hdr)) {
-                       u32 crc = crc32c(0, (void *)&con->in_hdr,
-                                sizeof(con->in_hdr) - sizeof(con->in_hdr.crc));
-                       if (crc != le32_to_cpu(con->in_hdr.crc)) {
-                               pr_err("read_partial_message bad hdr "
-                                      " crc %u != expected %u\n",
-                                      crc, con->in_hdr.crc);
-                               return -EBADMSG;
-                       }
-               }
-       }
-       front_len = le32_to_cpu(con->in_hdr.front_len);
-       if (front_len > CEPH_MSG_MAX_FRONT_LEN)
-               return -EIO;
-       middle_len = le32_to_cpu(con->in_hdr.middle_len);
-       if (middle_len > CEPH_MSG_MAX_DATA_LEN)
-               return -EIO;
-       data_len = le32_to_cpu(con->in_hdr.data_len);
-       if (data_len > CEPH_MSG_MAX_DATA_LEN)
-               return -EIO;
-       data_off = le16_to_cpu(con->in_hdr.data_off);
-
-       /* verify seq# */
-       seq = le64_to_cpu(con->in_hdr.seq);
-       if ((s64)seq - (s64)con->in_seq < 1) {
-               pr_info("skipping %s%lld %s seq %lld, expected %lld\n",
-                       ENTITY_NAME(con->peer_name),
-                       pr_addr(&con->peer_addr.in_addr),
-                       seq, con->in_seq + 1);
-               con->in_base_pos = -front_len - middle_len - data_len -
-                       sizeof(m->footer);
-               con->in_tag = CEPH_MSGR_TAG_READY;
-               con->in_seq++;
-               return 0;
-       } else if ((s64)seq - (s64)con->in_seq > 1) {
-               pr_err("read_partial_message bad seq %lld expected %lld\n",
-                      seq, con->in_seq + 1);
-               con->error_msg = "bad message sequence # for incoming message";
-               return -EBADMSG;
-       }
-
-       /* allocate message? */
-       if (!con->in_msg) {
-               dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
-                    con->in_hdr.front_len, con->in_hdr.data_len);
-               skip = 0;
-               con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
-               if (skip) {
-                       /* skip this message */
-                       dout("alloc_msg said skip message\n");
-                       BUG_ON(con->in_msg);
-                       con->in_base_pos = -front_len - middle_len - data_len -
-                               sizeof(m->footer);
-                       con->in_tag = CEPH_MSGR_TAG_READY;
-                       con->in_seq++;
-                       return 0;
-               }
-               if (!con->in_msg) {
-                       con->error_msg =
-                               "error allocating memory for incoming message";
-                       return -ENOMEM;
-               }
-               m = con->in_msg;
-               m->front.iov_len = 0;    /* haven't read it yet */
-               if (m->middle)
-                       m->middle->vec.iov_len = 0;
-
-               con->in_msg_pos.page = 0;
-               con->in_msg_pos.page_pos = data_off & ~PAGE_MASK;
-               con->in_msg_pos.data_pos = 0;
-       }
-
-       /* front */
-       ret = read_partial_message_section(con, &m->front, front_len,
-                                          &con->in_front_crc);
-       if (ret <= 0)
-               return ret;
-
-       /* middle */
-       if (m->middle) {
-               ret = read_partial_message_section(con, &m->middle->vec,
-                                                  middle_len,
-                                                  &con->in_middle_crc);
-               if (ret <= 0)
-                       return ret;
-       }
-
-       /* (page) data */
-       while (con->in_msg_pos.data_pos < data_len) {
-               left = min((int)(data_len - con->in_msg_pos.data_pos),
-                          (int)(PAGE_SIZE - con->in_msg_pos.page_pos));
-               BUG_ON(m->pages == NULL);
-               p = kmap(m->pages[con->in_msg_pos.page]);
-               ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
-                                      left);
-               if (ret > 0 && datacrc)
-                       con->in_data_crc =
-                               crc32c(con->in_data_crc,
-                                         p + con->in_msg_pos.page_pos, ret);
-               kunmap(m->pages[con->in_msg_pos.page]);
-               if (ret <= 0)
-                       return ret;
-               con->in_msg_pos.data_pos += ret;
-               con->in_msg_pos.page_pos += ret;
-               if (con->in_msg_pos.page_pos == PAGE_SIZE) {
-                       con->in_msg_pos.page_pos = 0;
-                       con->in_msg_pos.page++;
-               }
-       }
-
-       /* footer */
-       to = sizeof(m->hdr) + sizeof(m->footer);
-       while (con->in_base_pos < to) {
-               left = to - con->in_base_pos;
-               ret = ceph_tcp_recvmsg(con->sock, (char *)&m->footer +
-                                      (con->in_base_pos - sizeof(m->hdr)),
-                                      left);
-               if (ret <= 0)
-                       return ret;
-               con->in_base_pos += ret;
-       }
-       dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n",
-            m, front_len, m->footer.front_crc, middle_len,
-            m->footer.middle_crc, data_len, m->footer.data_crc);
-
-       /* crc ok? */
-       if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) {
-               pr_err("read_partial_message %p front crc %u != exp. %u\n",
-                      m, con->in_front_crc, m->footer.front_crc);
-               return -EBADMSG;
-       }
-       if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) {
-               pr_err("read_partial_message %p middle crc %u != exp %u\n",
-                      m, con->in_middle_crc, m->footer.middle_crc);
-               return -EBADMSG;
-       }
-       if (datacrc &&
-           (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 &&
-           con->in_data_crc != le32_to_cpu(m->footer.data_crc)) {
-               pr_err("read_partial_message %p data crc %u != exp. %u\n", m,
-                      con->in_data_crc, le32_to_cpu(m->footer.data_crc));
-               return -EBADMSG;
-       }
-
-       return 1; /* done! */
-}
-
-/*
- * Process message.  This happens in the worker thread.  The callback should
- * be careful not to do anything that waits on other incoming messages or it
- * may deadlock.
- */
-static void process_message(struct ceph_connection *con)
-{
-       struct ceph_msg *msg;
-
-       msg = con->in_msg;
-       con->in_msg = NULL;
-
-       /* if first message, set peer_name */
-       if (con->peer_name.type == 0)
-               con->peer_name = msg->hdr.src;
-
-       con->in_seq++;
-       mutex_unlock(&con->mutex);
-
-       dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
-            msg, le64_to_cpu(msg->hdr.seq),
-            ENTITY_NAME(msg->hdr.src),
-            le16_to_cpu(msg->hdr.type),
-            ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
-            le32_to_cpu(msg->hdr.front_len),
-            le32_to_cpu(msg->hdr.data_len),
-            con->in_front_crc, con->in_middle_crc, con->in_data_crc);
-       con->ops->dispatch(con, msg);
-
-       mutex_lock(&con->mutex);
-       prepare_read_tag(con);
-}
-
-
-/*
- * Write something to the socket.  Called in a worker thread when the
- * socket appears to be writeable and we have something ready to send.
- */
-static int try_write(struct ceph_connection *con)
-{
-       struct ceph_messenger *msgr = con->msgr;
-       int ret = 1;
-
-       dout("try_write start %p state %lu nref %d\n", con, con->state,
-            atomic_read(&con->nref));
-
-more:
-       dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
-
-       /* open the socket first? */
-       if (con->sock == NULL) {
-               /*
-                * if we were STANDBY and are reconnecting _this_
-                * connection, bump connect_seq now.  Always bump
-                * global_seq.
-                */
-               if (test_and_clear_bit(STANDBY, &con->state))
-                       con->connect_seq++;
-
-               prepare_write_banner(msgr, con);
-               prepare_write_connect(msgr, con, 1);
-               prepare_read_banner(con);
-               set_bit(CONNECTING, &con->state);
-               clear_bit(NEGOTIATING, &con->state);
-
-               BUG_ON(con->in_msg);
-               con->in_tag = CEPH_MSGR_TAG_READY;
-               dout("try_write initiating connect on %p new state %lu\n",
-                    con, con->state);
-               con->sock = ceph_tcp_connect(con);
-               if (IS_ERR(con->sock)) {
-                       con->sock = NULL;
-                       con->error_msg = "connect error";
-                       ret = -1;
-                       goto out;
-               }
-       }
-
-more_kvec:
-       /* kvec data queued? */
-       if (con->out_skip) {
-               ret = write_partial_skip(con);
-               if (ret <= 0)
-                       goto done;
-               if (ret < 0) {
-                       dout("try_write write_partial_skip err %d\n", ret);
-                       goto done;
-               }
-       }
-       if (con->out_kvec_left) {
-               ret = write_partial_kvec(con);
-               if (ret <= 0)
-                       goto done;
-       }
-
-       /* msg pages? */
-       if (con->out_msg) {
-               if (con->out_msg_done) {
-                       ceph_msg_put(con->out_msg);
-                       con->out_msg = NULL;   /* we're done with this one */
-                       goto do_next;
-               }
-
-               ret = write_partial_msg_pages(con);
-               if (ret == 1)
-                       goto more_kvec;  /* we need to send the footer, too! */
-               if (ret == 0)
-                       goto done;
-               if (ret < 0) {
-                       dout("try_write write_partial_msg_pages err %d\n",
-                            ret);
-                       goto done;
-               }
-       }
-
-do_next:
-       if (!test_bit(CONNECTING, &con->state)) {
-               /* is anything else pending? */
-               if (!list_empty(&con->out_queue)) {
-                       prepare_write_message(con);
-                       goto more;
-               }
-               if (con->in_seq > con->in_seq_acked) {
-                       prepare_write_ack(con);
-                       goto more;
-               }
-               if (test_and_clear_bit(KEEPALIVE_PENDING, &con->state)) {
-                       prepare_write_keepalive(con);
-                       goto more;
-               }
-       }
-
-       /* Nothing to do! */
-       clear_bit(WRITE_PENDING, &con->state);
-       dout("try_write nothing else to write.\n");
-done:
-       ret = 0;
-out:
-       dout("try_write done on %p\n", con);
-       return ret;
-}
-
-
-
-/*
- * Read what we can from the socket.
- */
-static int try_read(struct ceph_connection *con)
-{
-       int ret = -1;
-
-       if (!con->sock)
-               return 0;
-
-       if (test_bit(STANDBY, &con->state))
-               return 0;
-
-       dout("try_read start on %p\n", con);
-
-more:
-       dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
-            con->in_base_pos);
-       if (test_bit(CONNECTING, &con->state)) {
-               if (!test_bit(NEGOTIATING, &con->state)) {
-                       dout("try_read connecting\n");
-                       ret = read_partial_banner(con);
-                       if (ret <= 0)
-                               goto done;
-                       if (process_banner(con) < 0) {
-                               ret = -1;
-                               goto out;
-                       }
-               }
-               ret = read_partial_connect(con);
-               if (ret <= 0)
-                       goto done;
-               if (process_connect(con) < 0) {
-                       ret = -1;
-                       goto out;
-               }
-               goto more;
-       }
-
-       if (con->in_base_pos < 0) {
-               /*
-                * skipping + discarding content.
-                *
-                * FIXME: there must be a better way to do this!
-                */
-               static char buf[1024];
-               int skip = min(1024, -con->in_base_pos);
-               dout("skipping %d / %d bytes\n", skip, -con->in_base_pos);
-               ret = ceph_tcp_recvmsg(con->sock, buf, skip);
-               if (ret <= 0)
-                       goto done;
-               con->in_base_pos += ret;
-               if (con->in_base_pos)
-                       goto more;
-       }
-       if (con->in_tag == CEPH_MSGR_TAG_READY) {
-               /*
-                * what's next?
-                */
-               ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1);
-               if (ret <= 0)
-                       goto done;
-               dout("try_read got tag %d\n", (int)con->in_tag);
-               switch (con->in_tag) {
-               case CEPH_MSGR_TAG_MSG:
-                       prepare_read_message(con);
-                       break;
-               case CEPH_MSGR_TAG_ACK:
-                       prepare_read_ack(con);
-                       break;
-               case CEPH_MSGR_TAG_CLOSE:
-                       set_bit(CLOSED, &con->state);   /* fixme */
-                       goto done;
-               default:
-                       goto bad_tag;
-               }
-       }
-       if (con->in_tag == CEPH_MSGR_TAG_MSG) {
-               ret = read_partial_message(con);
-               if (ret <= 0) {
-                       switch (ret) {
-                       case -EBADMSG:
-                               con->error_msg = "bad crc";
-                               ret = -EIO;
-                               goto out;
-                       case -EIO:
-                               con->error_msg = "io error";
-                               goto out;
-                       default:
-                               goto done;
-                       }
-               }
-               if (con->in_tag == CEPH_MSGR_TAG_READY)
-                       goto more;
-               process_message(con);
-               goto more;
-       }
-       if (con->in_tag == CEPH_MSGR_TAG_ACK) {
-               ret = read_partial_ack(con);
-               if (ret <= 0)
-                       goto done;
-               process_ack(con);
-               goto more;
-       }
-
-done:
-       ret = 0;
-out:
-       dout("try_read done on %p\n", con);
-       return ret;
-
-bad_tag:
-       pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag);
-       con->error_msg = "protocol error, garbage tag";
-       ret = -1;
-       goto out;
-}
-
-
-/*
- * Atomically queue work on a connection.  Bump @con reference to
- * avoid races with connection teardown.
- *
- * There is some trickery going on with QUEUED and BUSY because we
- * only want a _single_ thread operating on each connection at any
- * point in time, but we want to use all available CPUs.
- *
- * The worker thread only proceeds if it can atomically set BUSY.  It
- * clears QUEUED and does it's thing.  When it thinks it's done, it
- * clears BUSY, then rechecks QUEUED.. if it's set again, it loops
- * (tries again to set BUSY).
- *
- * To queue work, we first set QUEUED, _then_ if BUSY isn't set, we
- * try to queue work.  If that fails (work is already queued, or BUSY)
- * we give up (work also already being done or is queued) but leave QUEUED
- * set so that the worker thread will loop if necessary.
- */
-static void queue_con(struct ceph_connection *con)
-{
-       if (test_bit(DEAD, &con->state)) {
-               dout("queue_con %p ignoring: DEAD\n",
-                    con);
-               return;
-       }
-
-       if (!con->ops->get(con)) {
-               dout("queue_con %p ref count 0\n", con);
-               return;
-       }
-
-       set_bit(QUEUED, &con->state);
-       if (test_bit(BUSY, &con->state)) {
-               dout("queue_con %p - already BUSY\n", con);
-               con->ops->put(con);
-       } else if (!queue_work(ceph_msgr_wq, &con->work.work)) {
-               dout("queue_con %p - already queued\n", con);
-               con->ops->put(con);
-       } else {
-               dout("queue_con %p\n", con);
-       }
-}
-
-/*
- * Do some work on a connection.  Drop a connection ref when we're done.
- */
-static void con_work(struct work_struct *work)
-{
-       struct ceph_connection *con = container_of(work, struct ceph_connection,
-                                                  work.work);
-       int backoff = 0;
-
-more:
-       if (test_and_set_bit(BUSY, &con->state) != 0) {
-               dout("con_work %p BUSY already set\n", con);
-               goto out;
-       }
-       dout("con_work %p start, clearing QUEUED\n", con);
-       clear_bit(QUEUED, &con->state);
-
-       mutex_lock(&con->mutex);
-
-       if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
-               dout("con_work CLOSED\n");
-               con_close_socket(con);
-               goto done;
-       }
-       if (test_and_clear_bit(OPENING, &con->state)) {
-               /* reopen w/ new peer */
-               dout("con_work OPENING\n");
-               con_close_socket(con);
-       }
-
-       if (test_and_clear_bit(SOCK_CLOSED, &con->state) ||
-           try_read(con) < 0 ||
-           try_write(con) < 0) {
-               mutex_unlock(&con->mutex);
-               backoff = 1;
-               ceph_fault(con);     /* error/fault path */
-               goto done_unlocked;
-       }
-
-done:
-       mutex_unlock(&con->mutex);
-
-done_unlocked:
-       clear_bit(BUSY, &con->state);
-       dout("con->state=%lu\n", con->state);
-       if (test_bit(QUEUED, &con->state)) {
-               if (!backoff || test_bit(OPENING, &con->state)) {
-                       dout("con_work %p QUEUED reset, looping\n", con);
-                       goto more;
-               }
-               dout("con_work %p QUEUED reset, but just faulted\n", con);
-               clear_bit(QUEUED, &con->state);
-       }
-       dout("con_work %p done\n", con);
-
-out:
-       con->ops->put(con);
-}
-
-
-/*
- * Generic error/fault handler.  A retry mechanism is used with
- * exponential backoff
- */
-static void ceph_fault(struct ceph_connection *con)
-{
-       pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
-              pr_addr(&con->peer_addr.in_addr), con->error_msg);
-       dout("fault %p state %lu to peer %s\n",
-            con, con->state, pr_addr(&con->peer_addr.in_addr));
-
-       if (test_bit(LOSSYTX, &con->state)) {
-               dout("fault on LOSSYTX channel\n");
-               goto out;
-       }
-
-       mutex_lock(&con->mutex);
-       if (test_bit(CLOSED, &con->state))
-               goto out_unlock;
-
-       con_close_socket(con);
-
-       if (con->in_msg) {
-               ceph_msg_put(con->in_msg);
-               con->in_msg = NULL;
-       }
-
-       /* Requeue anything that hasn't been acked */
-       list_splice_init(&con->out_sent, &con->out_queue);
-
-       /* If there are no messages in the queue, place the connection
-        * in a STANDBY state (i.e., don't try to reconnect just yet). */
-       if (list_empty(&con->out_queue) && !con->out_keepalive_pending) {
-               dout("fault setting STANDBY\n");
-               set_bit(STANDBY, &con->state);
-       } else {
-               /* retry after a delay. */
-               if (con->delay == 0)
-                       con->delay = BASE_DELAY_INTERVAL;
-               else if (con->delay < MAX_DELAY_INTERVAL)
-                       con->delay *= 2;
-               dout("fault queueing %p delay %lu\n", con, con->delay);
-               con->ops->get(con);
-               if (queue_delayed_work(ceph_msgr_wq, &con->work,
-                                      round_jiffies_relative(con->delay)) == 0)
-                       con->ops->put(con);
-       }
-
-out_unlock:
-       mutex_unlock(&con->mutex);
-out:
-       /*
-        * in case we faulted due to authentication, invalidate our
-        * current tickets so that we can get new ones.
-        */
-       if (con->auth_retry && con->ops->invalidate_authorizer) {
-               dout("calling invalidate_authorizer()\n");
-               con->ops->invalidate_authorizer(con);
-       }
-
-       if (con->ops->fault)
-               con->ops->fault(con);
-}
-
-
-
-/*
- * create a new messenger instance
- */
-struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
-{
-       struct ceph_messenger *msgr;
-
-       msgr = kzalloc(sizeof(*msgr), GFP_KERNEL);
-       if (msgr == NULL)
-               return ERR_PTR(-ENOMEM);
-
-       spin_lock_init(&msgr->global_seq_lock);
-
-       /* the zero page is needed if a request is "canceled" while the message
-        * is being written over the socket */
-       msgr->zero_page = __page_cache_alloc(GFP_KERNEL | __GFP_ZERO);
-       if (!msgr->zero_page) {
-               kfree(msgr);
-               return ERR_PTR(-ENOMEM);
-       }
-       kmap(msgr->zero_page);
-
-       if (myaddr)
-               msgr->inst.addr = *myaddr;
-
-       /* select a random nonce */
-       msgr->inst.addr.type = 0;
-       get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce));
-       encode_my_addr(msgr);
-
-       dout("messenger_create %p\n", msgr);
-       return msgr;
-}
-
-void ceph_messenger_destroy(struct ceph_messenger *msgr)
-{
-       dout("destroy %p\n", msgr);
-       kunmap(msgr->zero_page);
-       __free_page(msgr->zero_page);
-       kfree(msgr);
-       dout("destroyed messenger %p\n", msgr);
-}
-
-/*
- * Queue up an outgoing message on the given connection.
- */
-void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
-{
-       if (test_bit(CLOSED, &con->state)) {
-               dout("con_send %p closed, dropping %p\n", con, msg);
-               ceph_msg_put(msg);
-               return;
-       }
-
-       /* set src+dst */
-       msg->hdr.src = con->msgr->inst.name;
-
-       BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
-
-       msg->needs_out_seq = true;
-
-       /* queue */
-       mutex_lock(&con->mutex);
-       BUG_ON(!list_empty(&msg->list_head));
-       list_add_tail(&msg->list_head, &con->out_queue);
-       dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg,
-            ENTITY_NAME(con->peer_name), le16_to_cpu(msg->hdr.type),
-            ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
-            le32_to_cpu(msg->hdr.front_len),
-            le32_to_cpu(msg->hdr.middle_len),
-            le32_to_cpu(msg->hdr.data_len));
-       mutex_unlock(&con->mutex);
-
-       /* if there wasn't anything waiting to send before, queue
-        * new work */
-       if (test_and_set_bit(WRITE_PENDING, &con->state) == 0)
-               queue_con(con);
-}
-
-/*
- * Revoke a message that was previously queued for send
- */
-void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
-{
-       mutex_lock(&con->mutex);
-       if (!list_empty(&msg->list_head)) {
-               dout("con_revoke %p msg %p - was on queue\n", con, msg);
-               list_del_init(&msg->list_head);
-               ceph_msg_put(msg);
-               msg->hdr.seq = 0;
-       }
-       if (con->out_msg == msg) {
-               dout("con_revoke %p msg %p - was sending\n", con, msg);
-               con->out_msg = NULL;
-               if (con->out_kvec_is_msg) {
-                       con->out_skip = con->out_kvec_bytes;
-                       con->out_kvec_is_msg = false;
-               }
-               ceph_msg_put(msg);
-               msg->hdr.seq = 0;
-       }
-       mutex_unlock(&con->mutex);
-}
-
-/*
- * Revoke a message that we may be reading data into
- */
-void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)
-{
-       mutex_lock(&con->mutex);
-       if (con->in_msg && con->in_msg == msg) {
-               unsigned front_len = le32_to_cpu(con->in_hdr.front_len);
-               unsigned middle_len = le32_to_cpu(con->in_hdr.middle_len);
-               unsigned data_len = le32_to_cpu(con->in_hdr.data_len);
-
-               /* skip rest of message */
-               dout("con_revoke_pages %p msg %p revoked\n", con, msg);
-                       con->in_base_pos = con->in_base_pos -
-                               sizeof(struct ceph_msg_header) -
-                               front_len -
-                               middle_len -
-                               data_len -
-                               sizeof(struct ceph_msg_footer);
-               ceph_msg_put(con->in_msg);
-               con->in_msg = NULL;
-               con->in_tag = CEPH_MSGR_TAG_READY;
-               con->in_seq++;
-       } else {
-               dout("con_revoke_pages %p msg %p pages %p no-op\n",
-                    con, con->in_msg, msg);
-       }
-       mutex_unlock(&con->mutex);
-}
-
-/*
- * Queue a keepalive byte to ensure the tcp connection is alive.
- */
-void ceph_con_keepalive(struct ceph_connection *con)
-{
-       if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) == 0 &&
-           test_and_set_bit(WRITE_PENDING, &con->state) == 0)
-               queue_con(con);
-}
-
-
-/*
- * construct a new message with given type, size
- * the new msg has a ref count of 1.
- */
-struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags)
-{
-       struct ceph_msg *m;
-
-       m = kmalloc(sizeof(*m), flags);
-       if (m == NULL)
-               goto out;
-       kref_init(&m->kref);
-       INIT_LIST_HEAD(&m->list_head);
-
-       m->hdr.tid = 0;
-       m->hdr.type = cpu_to_le16(type);
-       m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);
-       m->hdr.version = 0;
-       m->hdr.front_len = cpu_to_le32(front_len);
-       m->hdr.middle_len = 0;
-       m->hdr.data_len = 0;
-       m->hdr.data_off = 0;
-       m->hdr.reserved = 0;
-       m->footer.front_crc = 0;
-       m->footer.middle_crc = 0;
-       m->footer.data_crc = 0;
-       m->footer.flags = 0;
-       m->front_max = front_len;
-       m->front_is_vmalloc = false;
-       m->more_to_follow = false;
-       m->pool = NULL;
-
-       /* front */
-       if (front_len) {
-               if (front_len > PAGE_CACHE_SIZE) {
-                       m->front.iov_base = __vmalloc(front_len, flags,
-                                                     PAGE_KERNEL);
-                       m->front_is_vmalloc = true;
-               } else {
-                       m->front.iov_base = kmalloc(front_len, flags);
-               }
-               if (m->front.iov_base == NULL) {
-                       pr_err("msg_new can't allocate %d bytes\n",
-                            front_len);
-                       goto out2;
-               }
-       } else {
-               m->front.iov_base = NULL;
-       }
-       m->front.iov_len = front_len;
-
-       /* middle */
-       m->middle = NULL;
-
-       /* data */
-       m->nr_pages = 0;
-       m->pages = NULL;
-       m->pagelist = NULL;
-
-       dout("ceph_msg_new %p front %d\n", m, front_len);
-       return m;
-
-out2:
-       ceph_msg_put(m);
-out:
-       pr_err("msg_new can't create type %d front %d\n", type, front_len);
-       return NULL;
-}
-
-/*
- * Allocate "middle" portion of a message, if it is needed and wasn't
- * allocated by alloc_msg.  This allows us to read a small fixed-size
- * per-type header in the front and then gracefully fail (i.e.,
- * propagate the error to the caller based on info in the front) when
- * the middle is too large.
- */
-static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
-{
-       int type = le16_to_cpu(msg->hdr.type);
-       int middle_len = le32_to_cpu(msg->hdr.middle_len);
-
-       dout("alloc_middle %p type %d %s middle_len %d\n", msg, type,
-            ceph_msg_type_name(type), middle_len);
-       BUG_ON(!middle_len);
-       BUG_ON(msg->middle);
-
-       msg->middle = ceph_buffer_new(middle_len, GFP_NOFS);
-       if (!msg->middle)
-               return -ENOMEM;
-       return 0;
-}
-
-/*
- * Generic message allocator, for incoming messages.
- */
-static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
-                               struct ceph_msg_header *hdr,
-                               int *skip)
-{
-       int type = le16_to_cpu(hdr->type);
-       int front_len = le32_to_cpu(hdr->front_len);
-       int middle_len = le32_to_cpu(hdr->middle_len);
-       struct ceph_msg *msg = NULL;
-       int ret;
-
-       if (con->ops->alloc_msg) {
-               mutex_unlock(&con->mutex);
-               msg = con->ops->alloc_msg(con, hdr, skip);
-               mutex_lock(&con->mutex);
-               if (!msg || *skip)
-                       return NULL;
-       }
-       if (!msg) {
-               *skip = 0;
-               msg = ceph_msg_new(type, front_len, GFP_NOFS);
-               if (!msg) {
-                       pr_err("unable to allocate msg type %d len %d\n",
-                              type, front_len);
-                       return NULL;
-               }
-       }
-       memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
-
-       if (middle_len && !msg->middle) {
-               ret = ceph_alloc_middle(con, msg);
-               if (ret < 0) {
-                       ceph_msg_put(msg);
-                       return NULL;
-               }
-       }
-
-       return msg;
-}
-
-
-/*
- * Free a generically kmalloc'd message.
- */
-void ceph_msg_kfree(struct ceph_msg *m)
-{
-       dout("msg_kfree %p\n", m);
-       if (m->front_is_vmalloc)
-               vfree(m->front.iov_base);
-       else
-               kfree(m->front.iov_base);
-       kfree(m);
-}
-
-/*
- * Drop a msg ref.  Destroy as needed.
- */
-void ceph_msg_last_put(struct kref *kref)
-{
-       struct ceph_msg *m = container_of(kref, struct ceph_msg, kref);
-
-       dout("ceph_msg_put last one on %p\n", m);
-       WARN_ON(!list_empty(&m->list_head));
-
-       /* drop middle, data, if any */
-       if (m->middle) {
-               ceph_buffer_put(m->middle);
-               m->middle = NULL;
-       }
-       m->nr_pages = 0;
-       m->pages = NULL;
-
-       if (m->pagelist) {
-               ceph_pagelist_release(m->pagelist);
-               kfree(m->pagelist);
-               m->pagelist = NULL;
-       }
-
-       if (m->pool)
-               ceph_msgpool_put(m->pool, m);
-       else
-               ceph_msg_kfree(m);
-}
-
-void ceph_msg_dump(struct ceph_msg *msg)
-{
-       pr_debug("msg_dump %p (front_max %d nr_pages %d)\n", msg,
-                msg->front_max, msg->nr_pages);
-       print_hex_dump(KERN_DEBUG, "header: ",
-                      DUMP_PREFIX_OFFSET, 16, 1,
-                      &msg->hdr, sizeof(msg->hdr), true);
-       print_hex_dump(KERN_DEBUG, " front: ",
-                      DUMP_PREFIX_OFFSET, 16, 1,
-                      msg->front.iov_base, msg->front.iov_len, true);
-       if (msg->middle)
-               print_hex_dump(KERN_DEBUG, "middle: ",
-                              DUMP_PREFIX_OFFSET, 16, 1,
-                              msg->middle->vec.iov_base,
-                              msg->middle->vec.iov_len, true);
-       print_hex_dump(KERN_DEBUG, "footer: ",
-                      DUMP_PREFIX_OFFSET, 16, 1,
-                      &msg->footer, sizeof(msg->footer), true);
-}
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h

deleted file mode 100644 (file)

index 76fbc95..0000000
--- a/fs/ceph/messenger.h
+++ /dev/null
@@ -1,253 +0,0 @@
-#ifndef __FS_CEPH_MESSENGER_H
-#define __FS_CEPH_MESSENGER_H
-
-#include <linux/kref.h>
-#include <linux/mutex.h>
-#include <linux/net.h>
-#include <linux/radix-tree.h>
-#include <linux/uio.h>
-#include <linux/version.h>
-#include <linux/workqueue.h>
-
-#include "types.h"
-#include "buffer.h"
-
-struct ceph_msg;
-struct ceph_connection;
-
-extern struct workqueue_struct *ceph_msgr_wq;       /* receive work queue */
-
-/*
- * Ceph defines these callbacks for handling connection events.
- */
-struct ceph_connection_operations {
-       struct ceph_connection *(*get)(struct ceph_connection *);
-       void (*put)(struct ceph_connection *);
-
-       /* handle an incoming message. */
-       void (*dispatch) (struct ceph_connection *con, struct ceph_msg *m);
-
-       /* authorize an outgoing connection */
-       int (*get_authorizer) (struct ceph_connection *con,
-                              void **buf, int *len, int *proto,
-                              void **reply_buf, int *reply_len, int force_new);
-       int (*verify_authorizer_reply) (struct ceph_connection *con, int len);
-       int (*invalidate_authorizer)(struct ceph_connection *con);
-
-       /* protocol version mismatch */
-       void (*bad_proto) (struct ceph_connection *con);
-
-       /* there was some error on the socket (disconnect, whatever) */
-       void (*fault) (struct ceph_connection *con);
-
-       /* a remote host as terminated a message exchange session, and messages
-        * we sent (or they tried to send us) may be lost. */
-       void (*peer_reset) (struct ceph_connection *con);
-
-       struct ceph_msg * (*alloc_msg) (struct ceph_connection *con,
-                                       struct ceph_msg_header *hdr,
-                                       int *skip);
-};
-
-/* use format string %s%d */
-#define ENTITY_NAME(n) ceph_entity_type_name((n).type), le64_to_cpu((n).num)
-
-struct ceph_messenger {
-       struct ceph_entity_inst inst;    /* my name+address */
-       struct ceph_entity_addr my_enc_addr;
-       struct page *zero_page;          /* used in certain error cases */
-
-       bool nocrc;
-
-       /*
-        * the global_seq counts connections i (attempt to) initiate
-        * in order to disambiguate certain connect race conditions.
-        */
-       u32 global_seq;
-       spinlock_t global_seq_lock;
-};
-
-/*
- * a single message.  it contains a header (src, dest, message type, etc.),
- * footer (crc values, mainly), a "front" message body, and possibly a
- * data payload (stored in some number of pages).
- */
-struct ceph_msg {
-       struct ceph_msg_header hdr;     /* header */
-       struct ceph_msg_footer footer;  /* footer */
-       struct kvec front;              /* unaligned blobs of message */
-       struct ceph_buffer *middle;
-       struct page **pages;            /* data payload.  NOT OWNER. */
-       unsigned nr_pages;              /* size of page array */
-       struct ceph_pagelist *pagelist; /* instead of pages */
-       struct list_head list_head;
-       struct kref kref;
-       bool front_is_vmalloc;
-       bool more_to_follow;
-       bool needs_out_seq;
-       int front_max;
-
-       struct ceph_msgpool *pool;
-};
-
-struct ceph_msg_pos {
-       int page, page_pos;  /* which page; offset in page */
-       int data_pos;        /* offset in data payload */
-       int did_page_crc;    /* true if we've calculated crc for current page */
-};
-
-/* ceph connection fault delay defaults, for exponential backoff */
-#define BASE_DELAY_INTERVAL    (HZ/2)
-#define MAX_DELAY_INTERVAL     (5 * 60 * HZ)
-
-/*
- * ceph_connection state bit flags
- *
- * QUEUED and BUSY are used together to ensure that only a single
- * thread is currently opening, reading or writing data to the socket.
- */
-#define LOSSYTX         0  /* we can close channel or drop messages on errors */
-#define CONNECTING     1
-#define NEGOTIATING    2
-#define KEEPALIVE_PENDING      3
-#define WRITE_PENDING  4  /* we have data ready to send */
-#define QUEUED          5  /* there is work queued on this connection */
-#define BUSY            6  /* work is being done */
-#define STANDBY                8  /* no outgoing messages, socket closed.  we keep
-                           * the ceph_connection around to maintain shared
-                           * state with the peer. */
-#define CLOSED         10 /* we've closed the connection */
-#define SOCK_CLOSED    11 /* socket state changed to closed */
-#define OPENING         13 /* open connection w/ (possibly new) peer */
-#define DEAD            14 /* dead, about to kfree */
-
-/*
- * A single connection with another host.
- *
- * We maintain a queue of outgoing messages, and some session state to
- * ensure that we can preserve the lossless, ordered delivery of
- * messages in the case of a TCP disconnect.
- */
-struct ceph_connection {
-       void *private;
-       atomic_t nref;
-
-       const struct ceph_connection_operations *ops;
-
-       struct ceph_messenger *msgr;
-       struct socket *sock;
-       unsigned long state;    /* connection state (see flags above) */
-       const char *error_msg;  /* error message, if any */
-
-       struct ceph_entity_addr peer_addr; /* peer address */
-       struct ceph_entity_name peer_name; /* peer name */
-       struct ceph_entity_addr peer_addr_for_me;
-       unsigned peer_features;
-       u32 connect_seq;      /* identify the most recent connection
-                                attempt for this connection, client */
-       u32 peer_global_seq;  /* peer's global seq for this connection */
-
-       int auth_retry;       /* true if we need a newer authorizer */
-       void *auth_reply_buf;   /* where to put the authorizer reply */
-       int auth_reply_buf_len;
-
-       struct mutex mutex;
-
-       /* out queue */
-       struct list_head out_queue;
-       struct list_head out_sent;   /* sending or sent but unacked */
-       u64 out_seq;                 /* last message queued for send */
-       bool out_keepalive_pending;
-
-       u64 in_seq, in_seq_acked;  /* last message received, acked */
-
-       /* connection negotiation temps */
-       char in_banner[CEPH_BANNER_MAX_LEN];
-       union {
-               struct {  /* outgoing connection */
-                       struct ceph_msg_connect out_connect;
-                       struct ceph_msg_connect_reply in_reply;
-               };
-               struct {  /* incoming */
-                       struct ceph_msg_connect in_connect;
-                       struct ceph_msg_connect_reply out_reply;
-               };
-       };
-       struct ceph_entity_addr actual_peer_addr;
-
-       /* message out temps */
-       struct ceph_msg *out_msg;        /* sending message (== tail of
-                                           out_sent) */
-       bool out_msg_done;
-       struct ceph_msg_pos out_msg_pos;
-
-       struct kvec out_kvec[8],         /* sending header/footer data */
-               *out_kvec_cur;
-       int out_kvec_left;   /* kvec's left in out_kvec */
-       int out_skip;        /* skip this many bytes */
-       int out_kvec_bytes;  /* total bytes left */
-       bool out_kvec_is_msg; /* kvec refers to out_msg */
-       int out_more;        /* there is more data after the kvecs */
-       __le64 out_temp_ack; /* for writing an ack */
-
-       /* message in temps */
-       struct ceph_msg_header in_hdr;
-       struct ceph_msg *in_msg;
-       struct ceph_msg_pos in_msg_pos;
-       u32 in_front_crc, in_middle_crc, in_data_crc;  /* calculated crc */
-
-       char in_tag;         /* protocol control byte */
-       int in_base_pos;     /* bytes read */
-       __le64 in_temp_ack;  /* for reading an ack */
-
-       struct delayed_work work;           /* send|recv work */
-       unsigned long       delay;          /* current delay interval */
-};
-
-
-extern const char *pr_addr(const struct sockaddr_storage *ss);
-extern int ceph_parse_ips(const char *c, const char *end,
-                         struct ceph_entity_addr *addr,
-                         int max_count, int *count);
-
-
-extern int ceph_msgr_init(void);
-extern void ceph_msgr_exit(void);
-extern void ceph_msgr_flush(void);
-
-extern struct ceph_messenger *ceph_messenger_create(
-       struct ceph_entity_addr *myaddr);
-extern void ceph_messenger_destroy(struct ceph_messenger *);
-
-extern void ceph_con_init(struct ceph_messenger *msgr,
-                         struct ceph_connection *con);
-extern void ceph_con_open(struct ceph_connection *con,
-                         struct ceph_entity_addr *addr);
-extern bool ceph_con_opened(struct ceph_connection *con);
-extern void ceph_con_close(struct ceph_connection *con);
-extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg);
-extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg);
-extern void ceph_con_revoke_message(struct ceph_connection *con,
-                                 struct ceph_msg *msg);
-extern void ceph_con_keepalive(struct ceph_connection *con);
-extern struct ceph_connection *ceph_con_get(struct ceph_connection *con);
-extern void ceph_con_put(struct ceph_connection *con);
-
-extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags);
-extern void ceph_msg_kfree(struct ceph_msg *m);
-
-
-static inline struct ceph_msg *ceph_msg_get(struct ceph_msg *msg)
-{
-       kref_get(&msg->kref);
-       return msg;
-}
-extern void ceph_msg_last_put(struct kref *kref);
-static inline void ceph_msg_put(struct ceph_msg *msg)
-{
-       kref_put(&msg->kref, ceph_msg_last_put);
-}
-
-extern void ceph_msg_dump(struct ceph_msg *msg);
-
-#endif
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c

deleted file mode 100644 (file)

index b2a5a3e..0000000
--- a/fs/ceph/mon_client.c
+++ /dev/null
@@ -1,1018 +0,0 @@
-#include "ceph_debug.h"
-
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/random.h>
-#include <linux/sched.h>
-
-#include "mon_client.h"
-#include "super.h"
-#include "auth.h"
-#include "decode.h"
-
-/*
- * Interact with Ceph monitor cluster.  Handle requests for new map
- * versions, and periodically resend as needed.  Also implement
- * statfs() and umount().
- *
- * A small cluster of Ceph "monitors" are responsible for managing critical
- * cluster configuration and state information.  An odd number (e.g., 3, 5)
- * of cmon daemons use a modified version of the Paxos part-time parliament
- * algorithm to manage the MDS map (mds cluster membership), OSD map, and
- * list of clients who have mounted the file system.
- *
- * We maintain an open, active session with a monitor at all times in order to
- * receive timely MDSMap updates.  We periodically send a keepalive byte on the
- * TCP socket to ensure we detect a failure.  If the connection does break, we
- * randomly hunt for a new monitor.  Once the connection is reestablished, we
- * resend any outstanding requests.
- */
-
-static const struct ceph_connection_operations mon_con_ops;
-
-static int __validate_auth(struct ceph_mon_client *monc);
-
-/*
- * Decode a monmap blob (e.g., during mount).
- */
-struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
-{
-       struct ceph_monmap *m = NULL;
-       int i, err = -EINVAL;
-       struct ceph_fsid fsid;
-       u32 epoch, num_mon;
-       u16 version;
-       u32 len;
-
-       ceph_decode_32_safe(&p, end, len, bad);
-       ceph_decode_need(&p, end, len, bad);
-
-       dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p));
-
-       ceph_decode_16_safe(&p, end, version, bad);
-
-       ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
-       ceph_decode_copy(&p, &fsid, sizeof(fsid));
-       epoch = ceph_decode_32(&p);
-
-       num_mon = ceph_decode_32(&p);
-       ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad);
-
-       if (num_mon >= CEPH_MAX_MON)
-               goto bad;
-       m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS);
-       if (m == NULL)
-               return ERR_PTR(-ENOMEM);
-       m->fsid = fsid;
-       m->epoch = epoch;
-       m->num_mon = num_mon;
-       ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0]));
-       for (i = 0; i < num_mon; i++)
-               ceph_decode_addr(&m->mon_inst[i].addr);
-
-       dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
-            m->num_mon);
-       for (i = 0; i < m->num_mon; i++)
-               dout("monmap_decode  mon%d is %s\n", i,
-                    pr_addr(&m->mon_inst[i].addr.in_addr));
-       return m;
-
-bad:
-       dout("monmap_decode failed with %d\n", err);
-       kfree(m);
-       return ERR_PTR(err);
-}
-
-/*
- * return true if *addr is included in the monmap.
- */
-int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr)
-{
-       int i;
-
-       for (i = 0; i < m->num_mon; i++)
-               if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0)
-                       return 1;
-       return 0;
-}
-
-/*
- * Send an auth request.
- */
-static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
-{
-       monc->pending_auth = 1;
-       monc->m_auth->front.iov_len = len;
-       monc->m_auth->hdr.front_len = cpu_to_le32(len);
-       ceph_con_revoke(monc->con, monc->m_auth);
-       ceph_msg_get(monc->m_auth);  /* keep our ref */
-       ceph_con_send(monc->con, monc->m_auth);
-}
-
-/*
- * Close monitor session, if any.
- */
-static void __close_session(struct ceph_mon_client *monc)
-{
-       if (monc->con) {
-               dout("__close_session closing mon%d\n", monc->cur_mon);
-               ceph_con_revoke(monc->con, monc->m_auth);
-               ceph_con_close(monc->con);
-               monc->cur_mon = -1;
-               monc->pending_auth = 0;
-               ceph_auth_reset(monc->auth);
-       }
-}
-
-/*
- * Open a session with a (new) monitor.
- */
-static int __open_session(struct ceph_mon_client *monc)
-{
-       char r;
-       int ret;
-
-       if (monc->cur_mon < 0) {
-               get_random_bytes(&r, 1);
-               monc->cur_mon = r % monc->monmap->num_mon;
-               dout("open_session num=%d r=%d -> mon%d\n",
-                    monc->monmap->num_mon, r, monc->cur_mon);
-               monc->sub_sent = 0;
-               monc->sub_renew_after = jiffies;  /* i.e., expired */
-               monc->want_next_osdmap = !!monc->want_next_osdmap;
-
-               dout("open_session mon%d opening\n", monc->cur_mon);
-               monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON;
-               monc->con->peer_name.num = cpu_to_le64(monc->cur_mon);
-               ceph_con_open(monc->con,
-                             &monc->monmap->mon_inst[monc->cur_mon].addr);
-
-               /* initiatiate authentication handshake */
-               ret = ceph_auth_build_hello(monc->auth,
-                                           monc->m_auth->front.iov_base,
-                                           monc->m_auth->front_max);
-               __send_prepared_auth_request(monc, ret);
-       } else {
-               dout("open_session mon%d already open\n", monc->cur_mon);
-       }
-       return 0;
-}
-
-static bool __sub_expired(struct ceph_mon_client *monc)
-{
-       return time_after_eq(jiffies, monc->sub_renew_after);
-}
-
-/*
- * Reschedule delayed work timer.
- */
-static void __schedule_delayed(struct ceph_mon_client *monc)
-{
-       unsigned delay;
-
-       if (monc->cur_mon < 0 || __sub_expired(monc))
-               delay = 10 * HZ;
-       else
-               delay = 20 * HZ;
-       dout("__schedule_delayed after %u\n", delay);
-       schedule_delayed_work(&monc->delayed_work, delay);
-}
-
-/*
- * Send subscribe request for mdsmap and/or osdmap.
- */
-static void __send_subscribe(struct ceph_mon_client *monc)
-{
-       dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n",
-            (unsigned)monc->sub_sent, __sub_expired(monc),
-            monc->want_next_osdmap);
-       if ((__sub_expired(monc) && !monc->sub_sent) ||
-           monc->want_next_osdmap == 1) {
-               struct ceph_msg *msg = monc->m_subscribe;
-               struct ceph_mon_subscribe_item *i;
-               void *p, *end;
-
-               p = msg->front.iov_base;
-               end = p + msg->front_max;
-
-               dout("__send_subscribe to 'mdsmap' %u+\n",
-                    (unsigned)monc->have_mdsmap);
-               if (monc->want_next_osdmap) {
-                       dout("__send_subscribe to 'osdmap' %u\n",
-                            (unsigned)monc->have_osdmap);
-                       ceph_encode_32(&p, 3);
-                       ceph_encode_string(&p, end, "osdmap", 6);
-                       i = p;
-                       i->have = cpu_to_le64(monc->have_osdmap);
-                       i->onetime = 1;
-                       p += sizeof(*i);
-                       monc->want_next_osdmap = 2;  /* requested */
-               } else {
-                       ceph_encode_32(&p, 2);
-               }
-               ceph_encode_string(&p, end, "mdsmap", 6);
-               i = p;
-               i->have = cpu_to_le64(monc->have_mdsmap);
-               i->onetime = 0;
-               p += sizeof(*i);
-               ceph_encode_string(&p, end, "monmap", 6);
-               i = p;
-               i->have = 0;
-               i->onetime = 0;
-               p += sizeof(*i);
-
-               msg->front.iov_len = p - msg->front.iov_base;
-               msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
-               ceph_con_revoke(monc->con, msg);
-               ceph_con_send(monc->con, ceph_msg_get(msg));
-
-               monc->sub_sent = jiffies | 1;  /* never 0 */
-       }
-}
-
-static void handle_subscribe_ack(struct ceph_mon_client *monc,
-                                struct ceph_msg *msg)
-{
-       unsigned seconds;
-       struct ceph_mon_subscribe_ack *h = msg->front.iov_base;
-
-       if (msg->front.iov_len < sizeof(*h))
-               goto bad;
-       seconds = le32_to_cpu(h->duration);
-
-       mutex_lock(&monc->mutex);
-       if (monc->hunting) {
-               pr_info("mon%d %s session established\n",
-                       monc->cur_mon, pr_addr(&monc->con->peer_addr.in_addr));
-               monc->hunting = false;
-       }
-       dout("handle_subscribe_ack after %d seconds\n", seconds);
-       monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1;
-       monc->sub_sent = 0;
-       mutex_unlock(&monc->mutex);
-       return;
-bad:
-       pr_err("got corrupt subscribe-ack msg\n");
-       ceph_msg_dump(msg);
-}
-
-/*
- * Keep track of which maps we have
- */
-int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got)
-{
-       mutex_lock(&monc->mutex);
-       monc->have_mdsmap = got;
-       mutex_unlock(&monc->mutex);
-       return 0;
-}
-
-int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got)
-{
-       mutex_lock(&monc->mutex);
-       monc->have_osdmap = got;
-       monc->want_next_osdmap = 0;
-       mutex_unlock(&monc->mutex);
-       return 0;
-}
-
-/*
- * Register interest in the next osdmap
- */
-void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
-{
-       dout("request_next_osdmap have %u\n", monc->have_osdmap);
-       mutex_lock(&monc->mutex);
-       if (!monc->want_next_osdmap)
-               monc->want_next_osdmap = 1;
-       if (monc->want_next_osdmap < 2)
-               __send_subscribe(monc);
-       mutex_unlock(&monc->mutex);
-}
-
-/*
- *
- */
-int ceph_monc_open_session(struct ceph_mon_client *monc)
-{
-       if (!monc->con) {
-               monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL);
-               if (!monc->con)
-                       return -ENOMEM;
-               ceph_con_init(monc->client->msgr, monc->con);
-               monc->con->private = monc;
-               monc->con->ops = &mon_con_ops;
-       }
-
-       mutex_lock(&monc->mutex);
-       __open_session(monc);
-       __schedule_delayed(monc);
-       mutex_unlock(&monc->mutex);
-       return 0;
-}
-
-/*
- * The monitor responds with mount ack indicate mount success.  The
- * included client ticket allows the client to talk to MDSs and OSDs.
- */
-static void ceph_monc_handle_map(struct ceph_mon_client *monc,
-                                struct ceph_msg *msg)
-{
-       struct ceph_client *client = monc->client;
-       struct ceph_monmap *monmap = NULL, *old = monc->monmap;
-       void *p, *end;
-
-       mutex_lock(&monc->mutex);
-
-       dout("handle_monmap\n");
-       p = msg->front.iov_base;
-       end = p + msg->front.iov_len;
-
-       monmap = ceph_monmap_decode(p, end);
-       if (IS_ERR(monmap)) {
-               pr_err("problem decoding monmap, %d\n",
-                      (int)PTR_ERR(monmap));
-               goto out;
-       }
-
-       if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) {
-               kfree(monmap);
-               goto out;
-       }
-
-       client->monc.monmap = monmap;
-       kfree(old);
-
-out:
-       mutex_unlock(&monc->mutex);
-       wake_up_all(&client->auth_wq);
-}
-
-/*
- * generic requests (e.g., statfs, poolop)
- */
-static struct ceph_mon_generic_request *__lookup_generic_req(
-       struct ceph_mon_client *monc, u64 tid)
-{
-       struct ceph_mon_generic_request *req;
-       struct rb_node *n = monc->generic_request_tree.rb_node;
-
-       while (n) {
-               req = rb_entry(n, struct ceph_mon_generic_request, node);
-               if (tid < req->tid)
-                       n = n->rb_left;
-               else if (tid > req->tid)
-                       n = n->rb_right;
-               else
-                       return req;
-       }
-       return NULL;
-}
-
-static void __insert_generic_request(struct ceph_mon_client *monc,
-                           struct ceph_mon_generic_request *new)
-{
-       struct rb_node **p = &monc->generic_request_tree.rb_node;
-       struct rb_node *parent = NULL;
-       struct ceph_mon_generic_request *req = NULL;
-
-       while (*p) {
-               parent = *p;
-               req = rb_entry(parent, struct ceph_mon_generic_request, node);
-               if (new->tid < req->tid)
-                       p = &(*p)->rb_left;
-               else if (new->tid > req->tid)
-                       p = &(*p)->rb_right;
-               else
-                       BUG();
-       }
-
-       rb_link_node(&new->node, parent, p);
-       rb_insert_color(&new->node, &monc->generic_request_tree);
-}
-
-static void release_generic_request(struct kref *kref)
-{
-       struct ceph_mon_generic_request *req =
-               container_of(kref, struct ceph_mon_generic_request, kref);
-
-       if (req->reply)
-               ceph_msg_put(req->reply);
-       if (req->request)
-               ceph_msg_put(req->request);
-
-       kfree(req);
-}
-
-static void put_generic_request(struct ceph_mon_generic_request *req)
-{
-       kref_put(&req->kref, release_generic_request);
-}
-
-static void get_generic_request(struct ceph_mon_generic_request *req)
-{
-       kref_get(&req->kref);
-}
-
-static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
-                                        struct ceph_msg_header *hdr,
-                                        int *skip)
-{
-       struct ceph_mon_client *monc = con->private;
-       struct ceph_mon_generic_request *req;
-       u64 tid = le64_to_cpu(hdr->tid);
-       struct ceph_msg *m;
-
-       mutex_lock(&monc->mutex);
-       req = __lookup_generic_req(monc, tid);
-       if (!req) {
-               dout("get_generic_reply %lld dne\n", tid);
-               *skip = 1;
-               m = NULL;
-       } else {
-               dout("get_generic_reply %lld got %p\n", tid, req->reply);
-               m = ceph_msg_get(req->reply);
-               /*
-                * we don't need to track the connection reading into
-                * this reply because we only have one open connection
-                * at a time, ever.
-                */
-       }
-       mutex_unlock(&monc->mutex);
-       return m;
-}
-
-static int do_generic_request(struct ceph_mon_client *monc,
-                             struct ceph_mon_generic_request *req)
-{
-       int err;
-
-       /* register request */
-       mutex_lock(&monc->mutex);
-       req->tid = ++monc->last_tid;
-       req->request->hdr.tid = cpu_to_le64(req->tid);
-       __insert_generic_request(monc, req);
-       monc->num_generic_requests++;
-       ceph_con_send(monc->con, ceph_msg_get(req->request));
-       mutex_unlock(&monc->mutex);
-
-       err = wait_for_completion_interruptible(&req->completion);
-
-       mutex_lock(&monc->mutex);
-       rb_erase(&req->node, &monc->generic_request_tree);
-       monc->num_generic_requests--;
-       mutex_unlock(&monc->mutex);
-
-       if (!err)
-               err = req->result;
-       return err;
-}
-
-/*
- * statfs
- */
-static void handle_statfs_reply(struct ceph_mon_client *monc,
-                               struct ceph_msg *msg)
-{
-       struct ceph_mon_generic_request *req;
-       struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
-       u64 tid = le64_to_cpu(msg->hdr.tid);
-
-       if (msg->front.iov_len != sizeof(*reply))
-               goto bad;
-       dout("handle_statfs_reply %p tid %llu\n", msg, tid);
-
-       mutex_lock(&monc->mutex);
-       req = __lookup_generic_req(monc, tid);
-       if (req) {
-               *(struct ceph_statfs *)req->buf = reply->st;
-               req->result = 0;
-               get_generic_request(req);
-       }
-       mutex_unlock(&monc->mutex);
-       if (req) {
-               complete_all(&req->completion);
-               put_generic_request(req);
-       }
-       return;
-
-bad:
-       pr_err("corrupt generic reply, tid %llu\n", tid);
-       ceph_msg_dump(msg);
-}
-
-/*
- * Do a synchronous statfs().
- */
-int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
-{
-       struct ceph_mon_generic_request *req;
-       struct ceph_mon_statfs *h;
-       int err;
-
-       req = kzalloc(sizeof(*req), GFP_NOFS);
-       if (!req)
-               return -ENOMEM;
-
-       kref_init(&req->kref);
-       req->buf = buf;
-       req->buf_len = sizeof(*buf);
-       init_completion(&req->completion);
-
-       err = -ENOMEM;
-       req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS);
-       if (!req->request)
-               goto out;
-       req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS);
-       if (!req->reply)
-               goto out;
-
-       /* fill out request */
-       h = req->request->front.iov_base;
-       h->monhdr.have_version = 0;
-       h->monhdr.session_mon = cpu_to_le16(-1);
-       h->monhdr.session_mon_tid = 0;
-       h->fsid = monc->monmap->fsid;
-
-       err = do_generic_request(monc, req);
-
-out:
-       kref_put(&req->kref, release_generic_request);
-       return err;
-}
-
-/*
- * pool ops
- */
-static int get_poolop_reply_buf(const char *src, size_t src_len,
-                               char *dst, size_t dst_len)
-{
-       u32 buf_len;
-
-       if (src_len != sizeof(u32) + dst_len)
-               return -EINVAL;
-
-       buf_len = le32_to_cpu(*(u32 *)src);
-       if (buf_len != dst_len)
-               return -EINVAL;
-
-       memcpy(dst, src + sizeof(u32), dst_len);
-       return 0;
-}
-
-static void handle_poolop_reply(struct ceph_mon_client *monc,
-                               struct ceph_msg *msg)
-{
-       struct ceph_mon_generic_request *req;
-       struct ceph_mon_poolop_reply *reply = msg->front.iov_base;
-       u64 tid = le64_to_cpu(msg->hdr.tid);
-
-       if (msg->front.iov_len < sizeof(*reply))
-               goto bad;
-       dout("handle_poolop_reply %p tid %llu\n", msg, tid);
-
-       mutex_lock(&monc->mutex);
-       req = __lookup_generic_req(monc, tid);
-       if (req) {
-               if (req->buf_len &&
-                   get_poolop_reply_buf(msg->front.iov_base + sizeof(*reply),
-                                    msg->front.iov_len - sizeof(*reply),
-                                    req->buf, req->buf_len) < 0) {
-                       mutex_unlock(&monc->mutex);
-                       goto bad;
-               }
-               req->result = le32_to_cpu(reply->reply_code);
-               get_generic_request(req);
-       }
-       mutex_unlock(&monc->mutex);
-       if (req) {
-               complete(&req->completion);
-               put_generic_request(req);
-       }
-       return;
-
-bad:
-       pr_err("corrupt generic reply, tid %llu\n", tid);
-       ceph_msg_dump(msg);
-}
-
-/*
- * Do a synchronous pool op.
- */
-int ceph_monc_do_poolop(struct ceph_mon_client *monc, u32 op,
-                       u32 pool, u64 snapid,
-                       char *buf, int len)
-{
-       struct ceph_mon_generic_request *req;
-       struct ceph_mon_poolop *h;
-       int err;
-
-       req = kzalloc(sizeof(*req), GFP_NOFS);
-       if (!req)
-               return -ENOMEM;
-
-       kref_init(&req->kref);
-       req->buf = buf;
-       req->buf_len = len;
-       init_completion(&req->completion);
-
-       err = -ENOMEM;
-       req->request = ceph_msg_new(CEPH_MSG_POOLOP, sizeof(*h), GFP_NOFS);
-       if (!req->request)
-               goto out;
-       req->reply = ceph_msg_new(CEPH_MSG_POOLOP_REPLY, 1024, GFP_NOFS);
-       if (!req->reply)
-               goto out;
-
-       /* fill out request */
-       req->request->hdr.version = cpu_to_le16(2);
-       h = req->request->front.iov_base;
-       h->monhdr.have_version = 0;
-       h->monhdr.session_mon = cpu_to_le16(-1);
-       h->monhdr.session_mon_tid = 0;
-       h->fsid = monc->monmap->fsid;
-       h->pool = cpu_to_le32(pool);
-       h->op = cpu_to_le32(op);
-       h->auid = 0;
-       h->snapid = cpu_to_le64(snapid);
-       h->name_len = 0;
-
-       err = do_generic_request(monc, req);
-
-out:
-       kref_put(&req->kref, release_generic_request);
-       return err;
-}
-
-int ceph_monc_create_snapid(struct ceph_mon_client *monc,
-                           u32 pool, u64 *snapid)
-{
-       return ceph_monc_do_poolop(monc,  POOL_OP_CREATE_UNMANAGED_SNAP,
-                                  pool, 0, (char *)snapid, sizeof(*snapid));
-
-}
-
-int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
-                           u32 pool, u64 snapid)
-{
-       return ceph_monc_do_poolop(monc,  POOL_OP_CREATE_UNMANAGED_SNAP,
-                                  pool, snapid, 0, 0);
-
-}
-
-/*
- * Resend pending generic requests.
- */
-static void __resend_generic_request(struct ceph_mon_client *monc)
-{
-       struct ceph_mon_generic_request *req;
-       struct rb_node *p;
-
-       for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) {
-               req = rb_entry(p, struct ceph_mon_generic_request, node);
-               ceph_con_revoke(monc->con, req->request);
-               ceph_con_send(monc->con, ceph_msg_get(req->request));
-       }
-}
-
-/*
- * Delayed work.  If we haven't mounted yet, retry.  Otherwise,
- * renew/retry subscription as needed (in case it is timing out, or we
- * got an ENOMEM).  And keep the monitor connection alive.
- */
-static void delayed_work(struct work_struct *work)
-{
-       struct ceph_mon_client *monc =
-               container_of(work, struct ceph_mon_client, delayed_work.work);
-
-       dout("monc delayed_work\n");
-       mutex_lock(&monc->mutex);
-       if (monc->hunting) {
-               __close_session(monc);
-               __open_session(monc);  /* continue hunting */
-       } else {
-               ceph_con_keepalive(monc->con);
-
-               __validate_auth(monc);
-
-               if (monc->auth->ops->is_authenticated(monc->auth))
-                       __send_subscribe(monc);
-       }
-       __schedule_delayed(monc);
-       mutex_unlock(&monc->mutex);
-}
-
-/*
- * On startup, we build a temporary monmap populated with the IPs
- * provided by mount(2).
- */
-static int build_initial_monmap(struct ceph_mon_client *monc)
-{
-       struct ceph_mount_args *args = monc->client->mount_args;
-       struct ceph_entity_addr *mon_addr = args->mon_addr;
-       int num_mon = args->num_mon;
-       int i;
-
-       /* build initial monmap */
-       monc->monmap = kzalloc(sizeof(*monc->monmap) +
-                              num_mon*sizeof(monc->monmap->mon_inst[0]),
-                              GFP_KERNEL);
-       if (!monc->monmap)
-               return -ENOMEM;
-       for (i = 0; i < num_mon; i++) {
-               monc->monmap->mon_inst[i].addr = mon_addr[i];
-               monc->monmap->mon_inst[i].addr.nonce = 0;
-               monc->monmap->mon_inst[i].name.type =
-                       CEPH_ENTITY_TYPE_MON;
-               monc->monmap->mon_inst[i].name.num = cpu_to_le64(i);
-       }
-       monc->monmap->num_mon = num_mon;
-       monc->have_fsid = false;
-
-       /* release addr memory */
-       kfree(args->mon_addr);
-       args->mon_addr = NULL;
-       args->num_mon = 0;
-       return 0;
-}
-
-int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
-{
-       int err = 0;
-
-       dout("init\n");
-       memset(monc, 0, sizeof(*monc));
-       monc->client = cl;
-       monc->monmap = NULL;
-       mutex_init(&monc->mutex);
-
-       err = build_initial_monmap(monc);
-       if (err)
-               goto out;
-
-       monc->con = NULL;
-
-       /* authentication */
-       monc->auth = ceph_auth_init(cl->mount_args->name,
-                                   cl->mount_args->secret);
-       if (IS_ERR(monc->auth))
-               return PTR_ERR(monc->auth);
-       monc->auth->want_keys =
-               CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
-               CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
-
-       /* msgs */
-       err = -ENOMEM;
-       monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK,
-                                    sizeof(struct ceph_mon_subscribe_ack),
-                                    GFP_NOFS);
-       if (!monc->m_subscribe_ack)
-               goto out_monmap;
-
-       monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS);
-       if (!monc->m_subscribe)
-               goto out_subscribe_ack;
-
-       monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS);
-       if (!monc->m_auth_reply)
-               goto out_subscribe;
-
-       monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS);
-       monc->pending_auth = 0;
-       if (!monc->m_auth)
-               goto out_auth_reply;
-
-       monc->cur_mon = -1;
-       monc->hunting = true;
-       monc->sub_renew_after = jiffies;
-       monc->sub_sent = 0;
-
-       INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
-       monc->generic_request_tree = RB_ROOT;
-       monc->num_generic_requests = 0;
-       monc->last_tid = 0;
-
-       monc->have_mdsmap = 0;
-       monc->have_osdmap = 0;
-       monc->want_next_osdmap = 1;
-       return 0;
-
-out_auth_reply:
-       ceph_msg_put(monc->m_auth_reply);
-out_subscribe:
-       ceph_msg_put(monc->m_subscribe);
-out_subscribe_ack:
-       ceph_msg_put(monc->m_subscribe_ack);
-out_monmap:
-       kfree(monc->monmap);
-out:
-       return err;
-}
-
-void ceph_monc_stop(struct ceph_mon_client *monc)
-{
-       dout("stop\n");
-       cancel_delayed_work_sync(&monc->delayed_work);
-
-       mutex_lock(&monc->mutex);
-       __close_session(monc);
-       if (monc->con) {
-               monc->con->private = NULL;
-               monc->con->ops->put(monc->con);
-               monc->con = NULL;
-       }
-       mutex_unlock(&monc->mutex);
-
-       ceph_auth_destroy(monc->auth);
-
-       ceph_msg_put(monc->m_auth);
-       ceph_msg_put(monc->m_auth_reply);
-       ceph_msg_put(monc->m_subscribe);
-       ceph_msg_put(monc->m_subscribe_ack);
-
-       kfree(monc->monmap);
-}
-
-static void handle_auth_reply(struct ceph_mon_client *monc,
-                             struct ceph_msg *msg)
-{
-       int ret;
-       int was_auth = 0;
-
-       mutex_lock(&monc->mutex);
-       if (monc->auth->ops)
-               was_auth = monc->auth->ops->is_authenticated(monc->auth);
-       monc->pending_auth = 0;
-       ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
-                                    msg->front.iov_len,
-                                    monc->m_auth->front.iov_base,
-                                    monc->m_auth->front_max);
-       if (ret < 0) {
-               monc->client->auth_err = ret;
-               wake_up_all(&monc->client->auth_wq);
-       } else if (ret > 0) {
-               __send_prepared_auth_request(monc, ret);
-       } else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) {
-               dout("authenticated, starting session\n");
-
-               monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
-               monc->client->msgr->inst.name.num =
-                                       cpu_to_le64(monc->auth->global_id);
-
-               __send_subscribe(monc);
-               __resend_generic_request(monc);
-       }
-       mutex_unlock(&monc->mutex);
-}
-
-static int __validate_auth(struct ceph_mon_client *monc)
-{
-       int ret;
-
-       if (monc->pending_auth)
-               return 0;
-
-       ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base,
-                             monc->m_auth->front_max);
-       if (ret <= 0)
-               return ret; /* either an error, or no need to authenticate */
-       __send_prepared_auth_request(monc, ret);
-       return 0;
-}
-
-int ceph_monc_validate_auth(struct ceph_mon_client *monc)
-{
-       int ret;
-
-       mutex_lock(&monc->mutex);
-       ret = __validate_auth(monc);
-       mutex_unlock(&monc->mutex);
-       return ret;
-}
-
-/*
- * handle incoming message
- */
-static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
-{
-       struct ceph_mon_client *monc = con->private;
-       int type = le16_to_cpu(msg->hdr.type);
-
-       if (!monc)
-               return;
-
-       switch (type) {
-       case CEPH_MSG_AUTH_REPLY:
-               handle_auth_reply(monc, msg);
-               break;
-
-       case CEPH_MSG_MON_SUBSCRIBE_ACK:
-               handle_subscribe_ack(monc, msg);
-               break;
-
-       case CEPH_MSG_STATFS_REPLY:
-               handle_statfs_reply(monc, msg);
-               break;
-
-       case CEPH_MSG_POOLOP_REPLY:
-               handle_poolop_reply(monc, msg);
-               break;
-
-       case CEPH_MSG_MON_MAP:
-               ceph_monc_handle_map(monc, msg);
-               break;
-
-       case CEPH_MSG_MDS_MAP:
-               ceph_mdsc_handle_map(&monc->client->mdsc, msg);
-               break;
-
-       case CEPH_MSG_OSD_MAP:
-               ceph_osdc_handle_map(&monc->client->osdc, msg);
-               break;
-
-       default:
-               pr_err("received unknown message type %d %s\n", type,
-                      ceph_msg_type_name(type));
-       }
-       ceph_msg_put(msg);
-}
-
-/*
- * Allocate memory for incoming message
- */
-static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
-                                     struct ceph_msg_header *hdr,
-                                     int *skip)
-{
-       struct ceph_mon_client *monc = con->private;
-       int type = le16_to_cpu(hdr->type);
-       int front_len = le32_to_cpu(hdr->front_len);
-       struct ceph_msg *m = NULL;
-
-       *skip = 0;
-
-       switch (type) {
-       case CEPH_MSG_MON_SUBSCRIBE_ACK:
-               m = ceph_msg_get(monc->m_subscribe_ack);
-               break;
-       case CEPH_MSG_POOLOP_REPLY:
-       case CEPH_MSG_STATFS_REPLY:
-               return get_generic_reply(con, hdr, skip);
-       case CEPH_MSG_AUTH_REPLY:
-               m = ceph_msg_get(monc->m_auth_reply);
-               break;
-       case CEPH_MSG_MON_MAP:
-       case CEPH_MSG_MDS_MAP:
-       case CEPH_MSG_OSD_MAP:
-               m = ceph_msg_new(type, front_len, GFP_NOFS);
-               break;
-       }
-
-       if (!m) {
-               pr_info("alloc_msg unknown type %d\n", type);
-               *skip = 1;
-       }
-       return m;
-}
-
-/*
- * If the monitor connection resets, pick a new monitor and resubmit
- * any pending requests.
- */
-static void mon_fault(struct ceph_connection *con)
-{
-       struct ceph_mon_client *monc = con->private;
-
-       if (!monc)
-               return;
-
-       dout("mon_fault\n");
-       mutex_lock(&monc->mutex);
-       if (!con->private)
-               goto out;
-
-       if (monc->con && !monc->hunting)
-               pr_info("mon%d %s session lost, "
-                       "hunting for new mon\n", monc->cur_mon,
-                       pr_addr(&monc->con->peer_addr.in_addr));
-
-       __close_session(monc);
-       if (!monc->hunting) {
-               /* start hunting */
-               monc->hunting = true;
-               __open_session(monc);
-       } else {
-               /* already hunting, let's wait a bit */
-               __schedule_delayed(monc);
-       }
-out:
-       mutex_unlock(&monc->mutex);
-}
-
-static const struct ceph_connection_operations mon_con_ops = {
-       .get = ceph_con_get,
-       .put = ceph_con_put,
-       .dispatch = dispatch,
-       .fault = mon_fault,
-       .alloc_msg = mon_alloc_msg,
-};
diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h

deleted file mode 100644 (file)

index 8e396f2..0000000
--- a/fs/ceph/mon_client.h
+++ /dev/null
@@ -1,121 +0,0 @@
-#ifndef _FS_CEPH_MON_CLIENT_H
-#define _FS_CEPH_MON_CLIENT_H
-
-#include <linux/completion.h>
-#include <linux/kref.h>
-#include <linux/rbtree.h>
-
-#include "messenger.h"
-
-struct ceph_client;
-struct ceph_mount_args;
-struct ceph_auth_client;
-
-/*
- * The monitor map enumerates the set of all monitors.
- */
-struct ceph_monmap {
-       struct ceph_fsid fsid;
-       u32 epoch;
-       u32 num_mon;
-       struct ceph_entity_inst mon_inst[0];
-};
-
-struct ceph_mon_client;
-struct ceph_mon_generic_request;
-
-
-/*
- * Generic mechanism for resending monitor requests.
- */
-typedef void (*ceph_monc_request_func_t)(struct ceph_mon_client *monc,
-                                        int newmon);
-
-/* a pending monitor request */
-struct ceph_mon_request {
-       struct ceph_mon_client *monc;
-       struct delayed_work delayed_work;
-       unsigned long delay;
-       ceph_monc_request_func_t do_request;
-};
-
-/*
- * ceph_mon_generic_request is being used for the statfs and poolop requests
- * which are bening done a bit differently because we need to get data back
- * to the caller
- */
-struct ceph_mon_generic_request {
-       struct kref kref;
-       u64 tid;
-       struct rb_node node;
-       int result;
-       void *buf;
-       int buf_len;
-       struct completion completion;
-       struct ceph_msg *request;  /* original request */
-       struct ceph_msg *reply;    /* and reply */
-};
-
-struct ceph_mon_client {
-       struct ceph_client *client;
-       struct ceph_monmap *monmap;
-
-       struct mutex mutex;
-       struct delayed_work delayed_work;
-
-       struct ceph_auth_client *auth;
-       struct ceph_msg *m_auth, *m_auth_reply, *m_subscribe, *m_subscribe_ack;
-       int pending_auth;
-
-       bool hunting;
-       int cur_mon;                       /* last monitor i contacted */
-       unsigned long sub_sent, sub_renew_after;
-       struct ceph_connection *con;
-       bool have_fsid;
-
-       /* pending generic requests */
-       struct rb_root generic_request_tree;
-       int num_generic_requests;
-       u64 last_tid;
-
-       /* mds/osd map */
-       int want_next_osdmap; /* 1 = want, 2 = want+asked */
-       u32 have_osdmap, have_mdsmap;
-
-#ifdef CONFIG_DEBUG_FS
-       struct dentry *debugfs_file;
-#endif
-};
-
-extern struct ceph_monmap *ceph_monmap_decode(void *p, void *end);
-extern int ceph_monmap_contains(struct ceph_monmap *m,
-                               struct ceph_entity_addr *addr);
-
-extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl);
-extern void ceph_monc_stop(struct ceph_mon_client *monc);
-
-/*
- * The model here is to indicate that we need a new map of at least
- * epoch @want, and also call in when we receive a map.  We will
- * periodically rerequest the map from the monitor cluster until we
- * get what we want.
- */
-extern int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 have);
-extern int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 have);
-
-extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc);
-
-extern int ceph_monc_do_statfs(struct ceph_mon_client *monc,
-                              struct ceph_statfs *buf);
-
-extern int ceph_monc_open_session(struct ceph_mon_client *monc);
-
-extern int ceph_monc_validate_auth(struct ceph_mon_client *monc);
-
-extern int ceph_monc_create_snapid(struct ceph_mon_client *monc,
-                                  u32 pool, u64 *snapid);
-
-extern int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
-                                  u32 pool, u64 snapid);
-
-#endif
diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c

deleted file mode 100644 (file)

index dd65a64..0000000
--- a/fs/ceph/msgpool.c
+++ /dev/null
@@ -1,64 +0,0 @@
-#include "ceph_debug.h"
-
-#include <linux/err.h>
-#include <linux/sched.h>
-#include <linux/types.h>
-#include <linux/vmalloc.h>
-
-#include "msgpool.h"
-
-static void *alloc_fn(gfp_t gfp_mask, void *arg)
-{
-       struct ceph_msgpool *pool = arg;
-       void *p;
-
-       p = ceph_msg_new(0, pool->front_len, gfp_mask);
-       if (!p)
-               pr_err("msgpool %s alloc failed\n", pool->name);
-       return p;
-}
-
-static void free_fn(void *element, void *arg)
-{
-       ceph_msg_put(element);
-}
-
-int ceph_msgpool_init(struct ceph_msgpool *pool,
-                     int front_len, int size, bool blocking, const char *name)
-{
-       pool->front_len = front_len;
-       pool->pool = mempool_create(size, alloc_fn, free_fn, pool);
-       if (!pool->pool)
-               return -ENOMEM;
-       pool->name = name;
-       return 0;
-}
-
-void ceph_msgpool_destroy(struct ceph_msgpool *pool)
-{
-       mempool_destroy(pool->pool);
-}
-
-struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool,
-                                 int front_len)
-{
-       if (front_len > pool->front_len) {
-               pr_err("msgpool_get pool %s need front %d, pool size is %d\n",
-                      pool->name, front_len, pool->front_len);
-               WARN_ON(1);
-
-               /* try to alloc a fresh message */
-               return ceph_msg_new(0, front_len, GFP_NOFS);
-       }
-
-       return mempool_alloc(pool->pool, GFP_NOFS);
-}
-
-void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
-{
-       /* reset msg front_len; user may have changed it */
-       msg->front.iov_len = pool->front_len;
-       msg->hdr.front_len = cpu_to_le32(pool->front_len);
-
-       kref_init(&msg->kref);  /* retake single ref */
-}
diff --git a/fs/ceph/msgpool.h b/fs/ceph/msgpool.h

deleted file mode 100644 (file)

index a362605..0000000
--- a/fs/ceph/msgpool.h
+++ /dev/null
@@ -1,25 +0,0 @@
-#ifndef _FS_CEPH_MSGPOOL
-#define _FS_CEPH_MSGPOOL
-
-#include <linux/mempool.h>
-#include "messenger.h"
-
-/*
- * we use memory pools for preallocating messages we may receive, to
- * avoid unexpected OOM conditions.
- */
-struct ceph_msgpool {
-       const char *name;
-       mempool_t *pool;
-       int front_len;          /* preallocated payload size */
-};
-
-extern int ceph_msgpool_init(struct ceph_msgpool *pool,
-                            int front_len, int size, bool blocking,
-                            const char *name);
-extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
-extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *,
-                                        int front_len);
-extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *);
-
-#endif
diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h

deleted file mode 100644 (file)

index 680d3d6..0000000
--- a/fs/ceph/msgr.h
+++ /dev/null
@@ -1,175 +0,0 @@
-#ifndef CEPH_MSGR_H
-#define CEPH_MSGR_H
-
-/*
- * Data types for message passing layer used by Ceph.
- */
-
-#define CEPH_MON_PORT    6789  /* default monitor port */
-
-/*
- * client-side processes will try to bind to ports in this
- * range, simply for the benefit of tools like nmap or wireshark
- * that would like to identify the protocol.
- */
-#define CEPH_PORT_FIRST  6789
-#define CEPH_PORT_START  6800  /* non-monitors start here */
-#define CEPH_PORT_LAST   6900
-
-/*
- * tcp connection banner.  include a protocol version. and adjust
- * whenever the wire protocol changes.  try to keep this string length
- * constant.
- */
-#define CEPH_BANNER "ceph v027"
-#define CEPH_BANNER_MAX_LEN 30
-
-
-/*
- * Rollover-safe type and comparator for 32-bit sequence numbers.
- * Comparator returns -1, 0, or 1.
- */
-typedef __u32 ceph_seq_t;
-
-static inline __s32 ceph_seq_cmp(__u32 a, __u32 b)
-{
-       return (__s32)a - (__s32)b;
-}
-
-
-/*
- * entity_name -- logical name for a process participating in the
- * network, e.g. 'mds0' or 'osd3'.
- */
-struct ceph_entity_name {
-       __u8 type;      /* CEPH_ENTITY_TYPE_* */
-       __le64 num;
-} __attribute__ ((packed));
-
-#define CEPH_ENTITY_TYPE_MON    0x01
-#define CEPH_ENTITY_TYPE_MDS    0x02
-#define CEPH_ENTITY_TYPE_OSD    0x04
-#define CEPH_ENTITY_TYPE_CLIENT 0x08
-#define CEPH_ENTITY_TYPE_AUTH   0x20
-
-#define CEPH_ENTITY_TYPE_ANY    0xFF
-
-extern const char *ceph_entity_type_name(int type);
-
-/*
- * entity_addr -- network address
- */
-struct ceph_entity_addr {
-       __le32 type;
-       __le32 nonce;  /* unique id for process (e.g. pid) */
-       struct sockaddr_storage in_addr;
-} __attribute__ ((packed));
-
-struct ceph_entity_inst {
-       struct ceph_entity_name name;
-       struct ceph_entity_addr addr;
-} __attribute__ ((packed));
-
-
-/* used by message exchange protocol */
-#define CEPH_MSGR_TAG_READY         1  /* server->client: ready for messages */
-#define CEPH_MSGR_TAG_RESETSESSION  2  /* server->client: reset, try again */
-#define CEPH_MSGR_TAG_WAIT          3  /* server->client: wait for racing
-                                         incoming connection */
-#define CEPH_MSGR_TAG_RETRY_SESSION 4  /* server->client + cseq: try again
-                                         with higher cseq */
-#define CEPH_MSGR_TAG_RETRY_GLOBAL  5  /* server->client + gseq: try again
-                                         with higher gseq */
-#define CEPH_MSGR_TAG_CLOSE         6  /* closing pipe */
-#define CEPH_MSGR_TAG_MSG           7  /* message */
-#define CEPH_MSGR_TAG_ACK           8  /* message ack */
-#define CEPH_MSGR_TAG_KEEPALIVE     9  /* just a keepalive byte! */
-#define CEPH_MSGR_TAG_BADPROTOVER  10  /* bad protocol version */
-#define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */
-#define CEPH_MSGR_TAG_FEATURES      12 /* insufficient features */
-
-
-/*
- * connection negotiation
- */
-struct ceph_msg_connect {
-       __le64 features;     /* supported feature bits */
-       __le32 host_type;    /* CEPH_ENTITY_TYPE_* */
-       __le32 global_seq;   /* count connections initiated by this host */
-       __le32 connect_seq;  /* count connections initiated in this session */
-       __le32 protocol_version;
-       __le32 authorizer_protocol;
-       __le32 authorizer_len;
-       __u8  flags;         /* CEPH_MSG_CONNECT_* */
-} __attribute__ ((packed));
-
-struct ceph_msg_connect_reply {
-       __u8 tag;
-       __le64 features;     /* feature bits for this session */
-       __le32 global_seq;
-       __le32 connect_seq;
-       __le32 protocol_version;
-       __le32 authorizer_len;
-       __u8 flags;
-} __attribute__ ((packed));
-
-#define CEPH_MSG_CONNECT_LOSSY  1  /* messages i send may be safely dropped */
-
-
-/*
- * message header
- */
-struct ceph_msg_header_old {
-       __le64 seq;       /* message seq# for this session */
-       __le64 tid;       /* transaction id */
-       __le16 type;      /* message type */
-       __le16 priority;  /* priority.  higher value == higher priority */
-       __le16 version;   /* version of message encoding */
-
-       __le32 front_len; /* bytes in main payload */
-       __le32 middle_len;/* bytes in middle payload */
-       __le32 data_len;  /* bytes of data payload */
-       __le16 data_off;  /* sender: include full offset;
-                            receiver: mask against ~PAGE_MASK */
-
-       struct ceph_entity_inst src, orig_src;
-       __le32 reserved;
-       __le32 crc;       /* header crc32c */
-} __attribute__ ((packed));
-
-struct ceph_msg_header {
-       __le64 seq;       /* message seq# for this session */
-       __le64 tid;       /* transaction id */
-       __le16 type;      /* message type */
-       __le16 priority;  /* priority.  higher value == higher priority */
-       __le16 version;   /* version of message encoding */
-
-       __le32 front_len; /* bytes in main payload */
-       __le32 middle_len;/* bytes in middle payload */
-       __le32 data_len;  /* bytes of data payload */
-       __le16 data_off;  /* sender: include full offset;
-                            receiver: mask against ~PAGE_MASK */
-
-       struct ceph_entity_name src;
-       __le32 reserved;
-       __le32 crc;       /* header crc32c */
-} __attribute__ ((packed));
-
-#define CEPH_MSG_PRIO_LOW     64
-#define CEPH_MSG_PRIO_DEFAULT 127
-#define CEPH_MSG_PRIO_HIGH    196
-#define CEPH_MSG_PRIO_HIGHEST 255
-
-/*
- * follows data payload
- */
-struct ceph_msg_footer {
-       __le32 front_crc, middle_crc, data_crc;
-       __u8 flags;
-} __attribute__ ((packed));
-
-#define CEPH_MSG_FOOTER_COMPLETE  (1<<0)   /* msg wasn't aborted */
-#define CEPH_MSG_FOOTER_NOCRC     (1<<1)   /* no data crc */
-
-
-#endif
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c

deleted file mode 100644 (file)

index 3b5571b..0000000
--- a/fs/ceph/osd_client.c
+++ /dev/null
@@ -1,1539 +0,0 @@
-#include "ceph_debug.h"
-
-#include <linux/err.h>
-#include <linux/highmem.h>
-#include <linux/mm.h>
-#include <linux/pagemap.h>
-#include <linux/slab.h>
-#include <linux/uaccess.h>
-
-#include "super.h"
-#include "osd_client.h"
-#include "messenger.h"
-#include "decode.h"
-#include "auth.h"
-
-#define OSD_OP_FRONT_LEN       4096
-#define OSD_OPREPLY_FRONT_LEN  512
-
-static const struct ceph_connection_operations osd_con_ops;
-static int __kick_requests(struct ceph_osd_client *osdc,
-                         struct ceph_osd *kickosd);
-
-static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd);
-
-/*
- * Implement client access to distributed object storage cluster.
- *
- * All data objects are stored within a cluster/cloud of OSDs, or
- * "object storage devices."  (Note that Ceph OSDs have _nothing_ to
- * do with the T10 OSD extensions to SCSI.)  Ceph OSDs are simply
- * remote daemons serving up and coordinating consistent and safe
- * access to storage.
- *
- * Cluster membership and the mapping of data objects onto storage devices
- * are described by the osd map.
- *
- * We keep track of pending OSD requests (read, write), resubmit
- * requests to different OSDs when the cluster topology/data layout
- * change, or retry the affected requests when the communications
- * channel with an OSD is reset.
- */
-
-/*
- * calculate the mapping of a file extent onto an object, and fill out the
- * request accordingly.  shorten extent as necessary if it crosses an
- * object boundary.
- *
- * fill osd op in request message.
- */
-static void calc_layout(struct ceph_osd_client *osdc,
-                       struct ceph_vino vino, struct ceph_file_layout *layout,
-                       u64 off, u64 *plen,
-                       struct ceph_osd_request *req)
-{
-       struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
-       struct ceph_osd_op *op = (void *)(reqhead + 1);
-       u64 orig_len = *plen;
-       u64 objoff, objlen;    /* extent in object */
-       u64 bno;
-
-       reqhead->snapid = cpu_to_le64(vino.snap);
-
-       /* object extent? */
-       ceph_calc_file_object_mapping(layout, off, plen, &bno,
-                                     &objoff, &objlen);
-       if (*plen < orig_len)
-               dout(" skipping last %llu, final file extent %llu~%llu\n",
-                    orig_len - *plen, off, *plen);
-
-       sprintf(req->r_oid, "%llx.%08llx", vino.ino, bno);
-       req->r_oid_len = strlen(req->r_oid);
-
-       op->extent.offset = cpu_to_le64(objoff);
-       op->extent.length = cpu_to_le64(objlen);
-       req->r_num_pages = calc_pages_for(off, *plen);
-
-       dout("calc_layout %s (%d) %llu~%llu (%d pages)\n",
-            req->r_oid, req->r_oid_len, objoff, objlen, req->r_num_pages);
-}
-
-/*
- * requests
- */
-void ceph_osdc_release_request(struct kref *kref)
-{
-       struct ceph_osd_request *req = container_of(kref,
-                                                   struct ceph_osd_request,
-                                                   r_kref);
-
-       if (req->r_request)
-               ceph_msg_put(req->r_request);
-       if (req->r_reply)
-               ceph_msg_put(req->r_reply);
-       if (req->r_con_filling_msg) {
-               dout("release_request revoking pages %p from con %p\n",
-                    req->r_pages, req->r_con_filling_msg);
-               ceph_con_revoke_message(req->r_con_filling_msg,
-                                     req->r_reply);
-               ceph_con_put(req->r_con_filling_msg);
-       }
-       if (req->r_own_pages)
-               ceph_release_page_vector(req->r_pages,
-                                        req->r_num_pages);
-       ceph_put_snap_context(req->r_snapc);
-       if (req->r_mempool)
-               mempool_free(req, req->r_osdc->req_mempool);
-       else
-               kfree(req);
-}
-
-/*
- * build new request AND message, calculate layout, and adjust file
- * extent as needed.
- *
- * if the file was recently truncated, we include information about its
- * old and new size so that the object can be updated appropriately.  (we
- * avoid synchronously deleting truncated objects because it's slow.)
- *
- * if @do_sync, include a 'startsync' command so that the osd will flush
- * data quickly.
- */
-struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
-                                              struct ceph_file_layout *layout,
-                                              struct ceph_vino vino,
-                                              u64 off, u64 *plen,
-                                              int opcode, int flags,
-                                              struct ceph_snap_context *snapc,
-                                              int do_sync,
-                                              u32 truncate_seq,
-                                              u64 truncate_size,
-                                              struct timespec *mtime,
-                                              bool use_mempool, int num_reply)
-{
-       struct ceph_osd_request *req;
-       struct ceph_msg *msg;
-       struct ceph_osd_request_head *head;
-       struct ceph_osd_op *op;
-       void *p;
-       int num_op = 1 + do_sync;
-       size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
-       int i;
-
-       if (use_mempool) {
-               req = mempool_alloc(osdc->req_mempool, GFP_NOFS);
-               memset(req, 0, sizeof(*req));
-       } else {
-               req = kzalloc(sizeof(*req), GFP_NOFS);
-       }
-       if (req == NULL)
-               return NULL;
-
-       req->r_osdc = osdc;
-       req->r_mempool = use_mempool;
-       kref_init(&req->r_kref);
-       init_completion(&req->r_completion);
-       init_completion(&req->r_safe_completion);
-       INIT_LIST_HEAD(&req->r_unsafe_item);
-       req->r_flags = flags;
-
-       WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
-
-       /* create reply message */
-       if (use_mempool)
-               msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
-       else
-               msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
-                                  OSD_OPREPLY_FRONT_LEN, GFP_NOFS);
-       if (!msg) {
-               ceph_osdc_put_request(req);
-               return NULL;
-       }
-       req->r_reply = msg;
-
-       /* create request message; allow space for oid */
-       msg_size += 40;
-       if (snapc)
-               msg_size += sizeof(u64) * snapc->num_snaps;
-       if (use_mempool)
-               msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
-       else
-               msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, GFP_NOFS);
-       if (!msg) {
-               ceph_osdc_put_request(req);
-               return NULL;
-       }
-       msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
-       memset(msg->front.iov_base, 0, msg->front.iov_len);
-       head = msg->front.iov_base;
-       op = (void *)(head + 1);
-       p = (void *)(op + num_op);
-
-       req->r_request = msg;
-       req->r_snapc = ceph_get_snap_context(snapc);
-
-       head->client_inc = cpu_to_le32(1); /* always, for now. */
-       head->flags = cpu_to_le32(flags);
-       if (flags & CEPH_OSD_FLAG_WRITE)
-               ceph_encode_timespec(&head->mtime, mtime);
-       head->num_ops = cpu_to_le16(num_op);
-       op->op = cpu_to_le16(opcode);
-
-       /* calculate max write size */
-       calc_layout(osdc, vino, layout, off, plen, req);
-       req->r_file_layout = *layout;  /* keep a copy */
-
-       if (flags & CEPH_OSD_FLAG_WRITE) {
-               req->r_request->hdr.data_off = cpu_to_le16(off);
-               req->r_request->hdr.data_len = cpu_to_le32(*plen);
-               op->payload_len = cpu_to_le32(*plen);
-       }
-       op->extent.truncate_size = cpu_to_le64(truncate_size);
-       op->extent.truncate_seq = cpu_to_le32(truncate_seq);
-
-       /* fill in oid */
-       head->object_len = cpu_to_le32(req->r_oid_len);
-       memcpy(p, req->r_oid, req->r_oid_len);
-       p += req->r_oid_len;
-
-       if (do_sync) {
-               op++;
-               op->op = cpu_to_le16(CEPH_OSD_OP_STARTSYNC);
-       }
-       if (snapc) {
-               head->snap_seq = cpu_to_le64(snapc->seq);
-               head->num_snaps = cpu_to_le32(snapc->num_snaps);
-               for (i = 0; i < snapc->num_snaps; i++) {
-                       put_unaligned_le64(snapc->snaps[i], p);
-                       p += sizeof(u64);
-               }
-       }
-
-       BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
-       msg_size = p - msg->front.iov_base;
-       msg->front.iov_len = msg_size;
-       msg->hdr.front_len = cpu_to_le32(msg_size);
-       return req;
-}
-
-/*
- * We keep osd requests in an rbtree, sorted by ->r_tid.
- */
-static void __insert_request(struct ceph_osd_client *osdc,
-                            struct ceph_osd_request *new)
-{
-       struct rb_node **p = &osdc->requests.rb_node;
-       struct rb_node *parent = NULL;
-       struct ceph_osd_request *req = NULL;
-
-       while (*p) {
-               parent = *p;
-               req = rb_entry(parent, struct ceph_osd_request, r_node);
-               if (new->r_tid < req->r_tid)
-                       p = &(*p)->rb_left;
-               else if (new->r_tid > req->r_tid)
-                       p = &(*p)->rb_right;
-               else
-                       BUG();
-       }
-
-       rb_link_node(&new->r_node, parent, p);
-       rb_insert_color(&new->r_node, &osdc->requests);
-}
-
-static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc,
-                                                u64 tid)
-{
-       struct ceph_osd_request *req;
-       struct rb_node *n = osdc->requests.rb_node;
-
-       while (n) {
-               req = rb_entry(n, struct ceph_osd_request, r_node);
-               if (tid < req->r_tid)
-                       n = n->rb_left;
-               else if (tid > req->r_tid)
-                       n = n->rb_right;
-               else
-                       return req;
-       }
-       return NULL;
-}
-
-static struct ceph_osd_request *
-__lookup_request_ge(struct ceph_osd_client *osdc,
-                   u64 tid)
-{
-       struct ceph_osd_request *req;
-       struct rb_node *n = osdc->requests.rb_node;
-
-       while (n) {
-               req = rb_entry(n, struct ceph_osd_request, r_node);
-               if (tid < req->r_tid) {
-                       if (!n->rb_left)
-                               return req;
-                       n = n->rb_left;
-               } else if (tid > req->r_tid) {
-                       n = n->rb_right;
-               } else {
-                       return req;
-               }
-       }
-       return NULL;
-}
-
-
-/*
- * If the osd connection drops, we need to resubmit all requests.
- */
-static void osd_reset(struct ceph_connection *con)
-{
-       struct ceph_osd *osd = con->private;
-       struct ceph_osd_client *osdc;
-
-       if (!osd)
-               return;
-       dout("osd_reset osd%d\n", osd->o_osd);
-       osdc = osd->o_osdc;
-       down_read(&osdc->map_sem);
-       kick_requests(osdc, osd);
-       up_read(&osdc->map_sem);
-}
-
-/*
- * Track open sessions with osds.
- */
-static struct ceph_osd *create_osd(struct ceph_osd_client *osdc)
-{
-       struct ceph_osd *osd;
-
-       osd = kzalloc(sizeof(*osd), GFP_NOFS);
-       if (!osd)
-               return NULL;
-
-       atomic_set(&osd->o_ref, 1);
-       osd->o_osdc = osdc;
-       INIT_LIST_HEAD(&osd->o_requests);
-       INIT_LIST_HEAD(&osd->o_osd_lru);
-       osd->o_incarnation = 1;
-
-       ceph_con_init(osdc->client->msgr, &osd->o_con);
-       osd->o_con.private = osd;
-       osd->o_con.ops = &osd_con_ops;
-       osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD;
-
-       INIT_LIST_HEAD(&osd->o_keepalive_item);
-       return osd;
-}
-
-static struct ceph_osd *get_osd(struct ceph_osd *osd)
-{
-       if (atomic_inc_not_zero(&osd->o_ref)) {
-               dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1,
-                    atomic_read(&osd->o_ref));
-               return osd;
-       } else {
-               dout("get_osd %p FAIL\n", osd);
-               return NULL;
-       }
-}
-
-static void put_osd(struct ceph_osd *osd)
-{
-       dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
-            atomic_read(&osd->o_ref) - 1);
-       if (atomic_dec_and_test(&osd->o_ref)) {
-               struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth;
-
-               if (osd->o_authorizer)
-                       ac->ops->destroy_authorizer(ac, osd->o_authorizer);
-               kfree(osd);
-       }
-}
-
-/*
- * remove an osd from our map
- */
-static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
-{
-       dout("__remove_osd %p\n", osd);
-       BUG_ON(!list_empty(&osd->o_requests));
-       rb_erase(&osd->o_node, &osdc->osds);
-       list_del_init(&osd->o_osd_lru);
-       ceph_con_close(&osd->o_con);
-       put_osd(osd);
-}
-
-static void __move_osd_to_lru(struct ceph_osd_client *osdc,
-                             struct ceph_osd *osd)
-{
-       dout("__move_osd_to_lru %p\n", osd);
-       BUG_ON(!list_empty(&osd->o_osd_lru));
-       list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
-       osd->lru_ttl = jiffies + osdc->client->mount_args->osd_idle_ttl * HZ;
-}
-
-static void __remove_osd_from_lru(struct ceph_osd *osd)
-{
-       dout("__remove_osd_from_lru %p\n", osd);
-       if (!list_empty(&osd->o_osd_lru))
-               list_del_init(&osd->o_osd_lru);
-}
-
-static void remove_old_osds(struct ceph_osd_client *osdc, int remove_all)
-{
-       struct ceph_osd *osd, *nosd;
-
-       dout("__remove_old_osds %p\n", osdc);
-       mutex_lock(&osdc->request_mutex);
-       list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
-               if (!remove_all && time_before(jiffies, osd->lru_ttl))
-                       break;
-               __remove_osd(osdc, osd);
-       }
-       mutex_unlock(&osdc->request_mutex);
-}
-
-/*
- * reset osd connect
- */
-static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
-{
-       struct ceph_osd_request *req;
-       int ret = 0;
-
-       dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
-       if (list_empty(&osd->o_requests)) {
-               __remove_osd(osdc, osd);
-       } else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd],
-                         &osd->o_con.peer_addr,
-                         sizeof(osd->o_con.peer_addr)) == 0 &&
-                  !ceph_con_opened(&osd->o_con)) {
-               dout(" osd addr hasn't changed and connection never opened,"
-                    " letting msgr retry");
-               /* touch each r_stamp for handle_timeout()'s benfit */
-               list_for_each_entry(req, &osd->o_requests, r_osd_item)
-                       req->r_stamp = jiffies;
-               ret = -EAGAIN;
-       } else {
-               ceph_con_close(&osd->o_con);
-               ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]);
-               osd->o_incarnation++;
-       }
-       return ret;
-}
-
-static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new)
-{
-       struct rb_node **p = &osdc->osds.rb_node;
-       struct rb_node *parent = NULL;
-       struct ceph_osd *osd = NULL;
-
-       while (*p) {
-               parent = *p;
-               osd = rb_entry(parent, struct ceph_osd, o_node);
-               if (new->o_osd < osd->o_osd)
-                       p = &(*p)->rb_left;
-               else if (new->o_osd > osd->o_osd)
-                       p = &(*p)->rb_right;
-               else
-                       BUG();
-       }
-
-       rb_link_node(&new->o_node, parent, p);
-       rb_insert_color(&new->o_node, &osdc->osds);
-}
-
-static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
-{
-       struct ceph_osd *osd;
-       struct rb_node *n = osdc->osds.rb_node;
-
-       while (n) {
-               osd = rb_entry(n, struct ceph_osd, o_node);
-               if (o < osd->o_osd)
-                       n = n->rb_left;
-               else if (o > osd->o_osd)
-                       n = n->rb_right;
-               else
-                       return osd;
-       }
-       return NULL;
-}
-
-static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
-{
-       schedule_delayed_work(&osdc->timeout_work,
-                       osdc->client->mount_args->osd_keepalive_timeout * HZ);
-}
-
-static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
-{
-       cancel_delayed_work(&osdc->timeout_work);
-}
-
-/*
- * Register request, assign tid.  If this is the first request, set up
- * the timeout event.
- */
-static void register_request(struct ceph_osd_client *osdc,
-                            struct ceph_osd_request *req)
-{
-       mutex_lock(&osdc->request_mutex);
-       req->r_tid = ++osdc->last_tid;
-       req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
-       INIT_LIST_HEAD(&req->r_req_lru_item);
-
-       dout("register_request %p tid %lld\n", req, req->r_tid);
-       __insert_request(osdc, req);
-       ceph_osdc_get_request(req);
-       osdc->num_requests++;
-
-       if (osdc->num_requests == 1) {
-               dout(" first request, scheduling timeout\n");
-               __schedule_osd_timeout(osdc);
-       }
-       mutex_unlock(&osdc->request_mutex);
-}
-
-/*
- * called under osdc->request_mutex
- */
-static void __unregister_request(struct ceph_osd_client *osdc,
-                                struct ceph_osd_request *req)
-{
-       dout("__unregister_request %p tid %lld\n", req, req->r_tid);
-       rb_erase(&req->r_node, &osdc->requests);
-       osdc->num_requests--;
-
-       if (req->r_osd) {
-               /* make sure the original request isn't in flight. */
-               ceph_con_revoke(&req->r_osd->o_con, req->r_request);
-
-               list_del_init(&req->r_osd_item);
-               if (list_empty(&req->r_osd->o_requests))
-                       __move_osd_to_lru(osdc, req->r_osd);
-               req->r_osd = NULL;
-       }
-
-       ceph_osdc_put_request(req);
-
-       list_del_init(&req->r_req_lru_item);
-       if (osdc->num_requests == 0) {
-               dout(" no requests, canceling timeout\n");
-               __cancel_osd_timeout(osdc);
-       }
-}
-
-/*
- * Cancel a previously queued request message
- */
-static void __cancel_request(struct ceph_osd_request *req)
-{
-       if (req->r_sent && req->r_osd) {
-               ceph_con_revoke(&req->r_osd->o_con, req->r_request);
-               req->r_sent = 0;
-       }
-       list_del_init(&req->r_req_lru_item);
-}
-
-/*
- * Pick an osd (the first 'up' osd in the pg), allocate the osd struct
- * (as needed), and set the request r_osd appropriately.  If there is
- * no up osd, set r_osd to NULL.
- *
- * Return 0 if unchanged, 1 if changed, or negative on error.
- *
- * Caller should hold map_sem for read and request_mutex.
- */
-static int __map_osds(struct ceph_osd_client *osdc,
-                     struct ceph_osd_request *req)
-{
-       struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
-       struct ceph_pg pgid;
-       int acting[CEPH_PG_MAX_SIZE];
-       int o = -1, num = 0;
-       int err;
-
-       dout("map_osds %p tid %lld\n", req, req->r_tid);
-       err = ceph_calc_object_layout(&reqhead->layout, req->r_oid,
-                                     &req->r_file_layout, osdc->osdmap);
-       if (err)
-               return err;
-       pgid = reqhead->layout.ol_pgid;
-       req->r_pgid = pgid;
-
-       err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting);
-       if (err > 0) {
-               o = acting[0];
-               num = err;
-       }
-
-       if ((req->r_osd && req->r_osd->o_osd == o &&
-            req->r_sent >= req->r_osd->o_incarnation &&
-            req->r_num_pg_osds == num &&
-            memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) ||
-           (req->r_osd == NULL && o == -1))
-               return 0;  /* no change */
-
-       dout("map_osds tid %llu pgid %d.%x osd%d (was osd%d)\n",
-            req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o,
-            req->r_osd ? req->r_osd->o_osd : -1);
-
-       /* record full pg acting set */
-       memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num);
-       req->r_num_pg_osds = num;
-
-       if (req->r_osd) {
-               __cancel_request(req);
-               list_del_init(&req->r_osd_item);
-               req->r_osd = NULL;
-       }
-
-       req->r_osd = __lookup_osd(osdc, o);
-       if (!req->r_osd && o >= 0) {
-               err = -ENOMEM;
-               req->r_osd = create_osd(osdc);
-               if (!req->r_osd)
-                       goto out;
-
-               dout("map_osds osd %p is osd%d\n", req->r_osd, o);
-               req->r_osd->o_osd = o;
-               req->r_osd->o_con.peer_name.num = cpu_to_le64(o);
-               __insert_osd(osdc, req->r_osd);
-
-               ceph_con_open(&req->r_osd->o_con, &osdc->osdmap->osd_addr[o]);
-       }
-
-       if (req->r_osd) {
-               __remove_osd_from_lru(req->r_osd);
-               list_add(&req->r_osd_item, &req->r_osd->o_requests);
-       }
-       err = 1;   /* osd or pg changed */
-
-out:
-       return err;
-}
-
-/*
- * caller should hold map_sem (for read) and request_mutex
- */
-static int __send_request(struct ceph_osd_client *osdc,
-                         struct ceph_osd_request *req)
-{
-       struct ceph_osd_request_head *reqhead;
-       int err;
-
-       err = __map_osds(osdc, req);
-       if (err < 0)
-               return err;
-       if (req->r_osd == NULL) {
-               dout("send_request %p no up osds in pg\n", req);
-               ceph_monc_request_next_osdmap(&osdc->client->monc);
-               return 0;
-       }
-
-       dout("send_request %p tid %llu to osd%d flags %d\n",
-            req, req->r_tid, req->r_osd->o_osd, req->r_flags);
-
-       reqhead = req->r_request->front.iov_base;
-       reqhead->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch);
-       reqhead->flags |= cpu_to_le32(req->r_flags);  /* e.g., RETRY */
-       reqhead->reassert_version = req->r_reassert_version;
-
-       req->r_stamp = jiffies;
-       list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
-
-       ceph_msg_get(req->r_request); /* send consumes a ref */
-       ceph_con_send(&req->r_osd->o_con, req->r_request);
-       req->r_sent = req->r_osd->o_incarnation;
-       return 0;
-}
-
-/*
- * Timeout callback, called every N seconds when 1 or more osd
- * requests has been active for more than N seconds.  When this
- * happens, we ping all OSDs with requests who have timed out to
- * ensure any communications channel reset is detected.  Reset the
- * request timeouts another N seconds in the future as we go.
- * Reschedule the timeout event another N seconds in future (unless
- * there are no open requests).
- */
-static void handle_timeout(struct work_struct *work)
-{
-       struct ceph_osd_client *osdc =
-               container_of(work, struct ceph_osd_client, timeout_work.work);
-       struct ceph_osd_request *req, *last_req = NULL;
-       struct ceph_osd *osd;
-       unsigned long timeout = osdc->client->mount_args->osd_timeout * HZ;
-       unsigned long keepalive =
-               osdc->client->mount_args->osd_keepalive_timeout * HZ;
-       unsigned long last_stamp = 0;
-       struct rb_node *p;
-       struct list_head slow_osds;
-
-       dout("timeout\n");
-       down_read(&osdc->map_sem);
-
-       ceph_monc_request_next_osdmap(&osdc->client->monc);
-
-       mutex_lock(&osdc->request_mutex);
-       for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
-               req = rb_entry(p, struct ceph_osd_request, r_node);
-
-               if (req->r_resend) {
-                       int err;
-
-                       dout("osdc resending prev failed %lld\n", req->r_tid);
-                       err = __send_request(osdc, req);
-                       if (err)
-                               dout("osdc failed again on %lld\n", req->r_tid);
-                       else
-                               req->r_resend = false;
-                       continue;
-               }
-       }
-
-       /*
-        * reset osds that appear to be _really_ unresponsive.  this
-        * is a failsafe measure.. we really shouldn't be getting to
-        * this point if the system is working properly.  the monitors
-        * should mark the osd as failed and we should find out about
-        * it from an updated osd map.
-        */
-       while (timeout && !list_empty(&osdc->req_lru)) {
-               req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
-                                r_req_lru_item);
-
-               if (time_before(jiffies, req->r_stamp + timeout))
-                       break;
-
-               BUG_ON(req == last_req && req->r_stamp == last_stamp);
-               last_req = req;
-               last_stamp = req->r_stamp;
-
-               osd = req->r_osd;
-               BUG_ON(!osd);
-               pr_warning(" tid %llu timed out on osd%d, will reset osd\n",
-                          req->r_tid, osd->o_osd);
-               __kick_requests(osdc, osd);
-       }
-
-       /*
-        * ping osds that are a bit slow.  this ensures that if there
-        * is a break in the TCP connection we will notice, and reopen
-        * a connection with that osd (from the fault callback).
-        */
-       INIT_LIST_HEAD(&slow_osds);
-       list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
-               if (time_before(jiffies, req->r_stamp + keepalive))
-                       break;
-
-               osd = req->r_osd;
-               BUG_ON(!osd);
-               dout(" tid %llu is slow, will send keepalive on osd%d\n",
-                    req->r_tid, osd->o_osd);
-               list_move_tail(&osd->o_keepalive_item, &slow_osds);
-       }
-       while (!list_empty(&slow_osds)) {
-               osd = list_entry(slow_osds.next, struct ceph_osd,
-                                o_keepalive_item);
-               list_del_init(&osd->o_keepalive_item);
-               ceph_con_keepalive(&osd->o_con);
-       }
-
-       __schedule_osd_timeout(osdc);
-       mutex_unlock(&osdc->request_mutex);
-
-       up_read(&osdc->map_sem);
-}
-
-static void handle_osds_timeout(struct work_struct *work)
-{
-       struct ceph_osd_client *osdc =
-               container_of(work, struct ceph_osd_client,
-                            osds_timeout_work.work);
-       unsigned long delay =
-               osdc->client->mount_args->osd_idle_ttl * HZ >> 2;
-
-       dout("osds timeout\n");
-       down_read(&osdc->map_sem);
-       remove_old_osds(osdc, 0);
-       up_read(&osdc->map_sem);
-
-       schedule_delayed_work(&osdc->osds_timeout_work,
-                             round_jiffies_relative(delay));
-}
-
-/*
- * handle osd op reply.  either call the callback if it is specified,
- * or do the completion to wake up the waiting thread.
- */
-static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
-                        struct ceph_connection *con)
-{
-       struct ceph_osd_reply_head *rhead = msg->front.iov_base;
-       struct ceph_osd_request *req;
-       u64 tid;
-       int numops, object_len, flags;
-       s32 result;
-
-       tid = le64_to_cpu(msg->hdr.tid);
-       if (msg->front.iov_len < sizeof(*rhead))
-               goto bad;
-       numops = le32_to_cpu(rhead->num_ops);
-       object_len = le32_to_cpu(rhead->object_len);
-       result = le32_to_cpu(rhead->result);
-       if (msg->front.iov_len != sizeof(*rhead) + object_len +
-           numops * sizeof(struct ceph_osd_op))
-               goto bad;
-       dout("handle_reply %p tid %llu result %d\n", msg, tid, (int)result);
-
-       /* lookup */
-       mutex_lock(&osdc->request_mutex);
-       req = __lookup_request(osdc, tid);
-       if (req == NULL) {
-               dout("handle_reply tid %llu dne\n", tid);
-               mutex_unlock(&osdc->request_mutex);
-               return;
-       }
-       ceph_osdc_get_request(req);
-       flags = le32_to_cpu(rhead->flags);
-
-       /*
-        * if this connection filled our message, drop our reference now, to
-        * avoid a (safe but slower) revoke later.
-        */
-       if (req->r_con_filling_msg == con && req->r_reply == msg) {
-               dout(" dropping con_filling_msg ref %p\n", con);
-               req->r_con_filling_msg = NULL;
-               ceph_con_put(con);
-       }
-
-       if (!req->r_got_reply) {
-               unsigned bytes;
-
-               req->r_result = le32_to_cpu(rhead->result);
-               bytes = le32_to_cpu(msg->hdr.data_len);
-               dout("handle_reply result %d bytes %d\n", req->r_result,
-                    bytes);
-               if (req->r_result == 0)
-                       req->r_result = bytes;
-
-               /* in case this is a write and we need to replay, */
-               req->r_reassert_version = rhead->reassert_version;
-
-               req->r_got_reply = 1;
-       } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
-               dout("handle_reply tid %llu dup ack\n", tid);
-               mutex_unlock(&osdc->request_mutex);
-               goto done;
-       }
-
-       dout("handle_reply tid %llu flags %d\n", tid, flags);
-
-       /* either this is a read, or we got the safe response */
-       if (result < 0 ||
-           (flags & CEPH_OSD_FLAG_ONDISK) ||
-           ((flags & CEPH_OSD_FLAG_WRITE) == 0))
-               __unregister_request(osdc, req);
-
-       mutex_unlock(&osdc->request_mutex);
-
-       if (req->r_callback)
-               req->r_callback(req, msg);
-       else
-               complete_all(&req->r_completion);
-
-       if (flags & CEPH_OSD_FLAG_ONDISK) {
-               if (req->r_safe_callback)
-                       req->r_safe_callback(req, msg);
-               complete_all(&req->r_safe_completion);  /* fsync waiter */
-       }
-
-done:
-       ceph_osdc_put_request(req);
-       return;
-
-bad:
-       pr_err("corrupt osd_op_reply got %d %d expected %d\n",
-              (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len),
-              (int)sizeof(*rhead));
-       ceph_msg_dump(msg);
-}
-
-
-static int __kick_requests(struct ceph_osd_client *osdc,
-                         struct ceph_osd *kickosd)
-{
-       struct ceph_osd_request *req;
-       struct rb_node *p, *n;
-       int needmap = 0;
-       int err;
-
-       dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1);
-       if (kickosd) {
-               err = __reset_osd(osdc, kickosd);
-               if (err == -EAGAIN)
-                       return 1;
-       } else {
-               for (p = rb_first(&osdc->osds); p; p = n) {
-                       struct ceph_osd *osd =
-                               rb_entry(p, struct ceph_osd, o_node);
-
-                       n = rb_next(p);
-                       if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
-                           memcmp(&osd->o_con.peer_addr,
-                                  ceph_osd_addr(osdc->osdmap,
-                                                osd->o_osd),
-                                  sizeof(struct ceph_entity_addr)) != 0)
-                               __reset_osd(osdc, osd);
-               }
-       }
-
-       for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
-               req = rb_entry(p, struct ceph_osd_request, r_node);
-
-               if (req->r_resend) {
-                       dout(" r_resend set on tid %llu\n", req->r_tid);
-                       __cancel_request(req);
-                       goto kick;
-               }
-               if (req->r_osd && kickosd == req->r_osd) {
-                       __cancel_request(req);
-                       goto kick;
-               }
-
-               err = __map_osds(osdc, req);
-               if (err == 0)
-                       continue;  /* no change */
-               if (err < 0) {
-                       /*
-                        * FIXME: really, we should set the request
-                        * error and fail if this isn't a 'nofail'
-                        * request, but that's a fair bit more
-                        * complicated to do.  So retry!
-                        */
-                       dout(" setting r_resend on %llu\n", req->r_tid);
-                       req->r_resend = true;
-                       continue;
-               }
-               if (req->r_osd == NULL) {
-                       dout("tid %llu maps to no valid osd\n", req->r_tid);
-                       needmap++;  /* request a newer map */
-                       continue;
-               }
-
-kick:
-               dout("kicking %p tid %llu osd%d\n", req, req->r_tid,
-                    req->r_osd ? req->r_osd->o_osd : -1);
-               req->r_flags |= CEPH_OSD_FLAG_RETRY;
-               err = __send_request(osdc, req);
-               if (err) {
-                       dout(" setting r_resend on %llu\n", req->r_tid);
-                       req->r_resend = true;
-               }
-       }
-
-       return needmap;
-}
-
-/*
- * Resubmit osd requests whose osd or osd address has changed.  Request
- * a new osd map if osds are down, or we are otherwise unable to determine
- * how to direct a request.
- *
- * Close connections to down osds.
- *
- * If @who is specified, resubmit requests for that specific osd.
- *
- * Caller should hold map_sem for read and request_mutex.
- */
-static void kick_requests(struct ceph_osd_client *osdc,
-                         struct ceph_osd *kickosd)
-{
-       int needmap;
-
-       mutex_lock(&osdc->request_mutex);
-       needmap = __kick_requests(osdc, kickosd);
-       mutex_unlock(&osdc->request_mutex);
-
-       if (needmap) {
-               dout("%d requests for down osds, need new map\n", needmap);
-               ceph_monc_request_next_osdmap(&osdc->client->monc);
-       }
-
-}
-/*
- * Process updated osd map.
- *
- * The message contains any number of incremental and full maps, normally
- * indicating some sort of topology change in the cluster.  Kick requests
- * off to different OSDs as needed.
- */
-void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
-{
-       void *p, *end, *next;
-       u32 nr_maps, maplen;
-       u32 epoch;
-       struct ceph_osdmap *newmap = NULL, *oldmap;
-       int err;
-       struct ceph_fsid fsid;
-
-       dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
-       p = msg->front.iov_base;
-       end = p + msg->front.iov_len;
-
-       /* verify fsid */
-       ceph_decode_need(&p, end, sizeof(fsid), bad);
-       ceph_decode_copy(&p, &fsid, sizeof(fsid));
-       if (ceph_check_fsid(osdc->client, &fsid) < 0)
-               return;
-
-       down_write(&osdc->map_sem);
-
-       /* incremental maps */
-       ceph_decode_32_safe(&p, end, nr_maps, bad);
-       dout(" %d inc maps\n", nr_maps);
-       while (nr_maps > 0) {
-               ceph_decode_need(&p, end, 2*sizeof(u32), bad);
-               epoch = ceph_decode_32(&p);
-               maplen = ceph_decode_32(&p);
-               ceph_decode_need(&p, end, maplen, bad);
-               next = p + maplen;
-               if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) {
-                       dout("applying incremental map %u len %d\n",
-                            epoch, maplen);
-                       newmap = osdmap_apply_incremental(&p, next,
-                                                         osdc->osdmap,
-                                                         osdc->client->msgr);
-                       if (IS_ERR(newmap)) {
-                               err = PTR_ERR(newmap);
-                               goto bad;
-                       }
-                       BUG_ON(!newmap);
-                       if (newmap != osdc->osdmap) {
-                               ceph_osdmap_destroy(osdc->osdmap);
-                               osdc->osdmap = newmap;
-                       }
-               } else {
-                       dout("ignoring incremental map %u len %d\n",
-                            epoch, maplen);
-               }
-               p = next;
-               nr_maps--;
-       }
-       if (newmap)
-               goto done;
-
-       /* full maps */
-       ceph_decode_32_safe(&p, end, nr_maps, bad);
-       dout(" %d full maps\n", nr_maps);
-       while (nr_maps) {
-               ceph_decode_need(&p, end, 2*sizeof(u32), bad);
-               epoch = ceph_decode_32(&p);
-               maplen = ceph_decode_32(&p);
-               ceph_decode_need(&p, end, maplen, bad);
-               if (nr_maps > 1) {
-                       dout("skipping non-latest full map %u len %d\n",
-                            epoch, maplen);
-               } else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) {
-                       dout("skipping full map %u len %d, "
-                            "older than our %u\n", epoch, maplen,
-                            osdc->osdmap->epoch);
-               } else {
-                       dout("taking full map %u len %d\n", epoch, maplen);
-                       newmap = osdmap_decode(&p, p+maplen);
-                       if (IS_ERR(newmap)) {
-                               err = PTR_ERR(newmap);
-                               goto bad;
-                       }
-                       BUG_ON(!newmap);
-                       oldmap = osdc->osdmap;
-                       osdc->osdmap = newmap;
-                       if (oldmap)
-                               ceph_osdmap_destroy(oldmap);
-               }
-               p += maplen;
-               nr_maps--;
-       }
-
-done:
-       downgrade_write(&osdc->map_sem);
-       ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch);
-       if (newmap)
-               kick_requests(osdc, NULL);
-       up_read(&osdc->map_sem);
-       wake_up_all(&osdc->client->auth_wq);
-       return;
-
-bad:
-       pr_err("osdc handle_map corrupt msg\n");
-       ceph_msg_dump(msg);
-       up_write(&osdc->map_sem);
-       return;
-}
-
-/*
- * Register request, send initial attempt.
- */
-int ceph_osdc_start_request(struct ceph_osd_client *osdc,
-                           struct ceph_osd_request *req,
-                           bool nofail)
-{
-       int rc = 0;
-
-       req->r_request->pages = req->r_pages;
-       req->r_request->nr_pages = req->r_num_pages;
-
-       register_request(osdc, req);
-
-       down_read(&osdc->map_sem);
-       mutex_lock(&osdc->request_mutex);
-       /*
-        * a racing kick_requests() may have sent the message for us
-        * while we dropped request_mutex above, so only send now if
-        * the request still han't been touched yet.
-        */
-       if (req->r_sent == 0) {
-               rc = __send_request(osdc, req);
-               if (rc) {
-                       if (nofail) {
-                               dout("osdc_start_request failed send, "
-                                    " marking %lld\n", req->r_tid);
-                               req->r_resend = true;
-                               rc = 0;
-                       } else {
-                               __unregister_request(osdc, req);
-                       }
-               }
-       }
-       mutex_unlock(&osdc->request_mutex);
-       up_read(&osdc->map_sem);
-       return rc;
-}
-
-/*
- * wait for a request to complete
- */
-int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
-                          struct ceph_osd_request *req)
-{
-       int rc;
-
-       rc = wait_for_completion_interruptible(&req->r_completion);
-       if (rc < 0) {
-               mutex_lock(&osdc->request_mutex);
-               __cancel_request(req);
-               __unregister_request(osdc, req);
-               mutex_unlock(&osdc->request_mutex);
-               dout("wait_request tid %llu canceled/timed out\n", req->r_tid);
-               return rc;
-       }
-
-       dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result);
-       return req->r_result;
-}
-
-/*
- * sync - wait for all in-flight requests to flush.  avoid starvation.
- */
-void ceph_osdc_sync(struct ceph_osd_client *osdc)
-{
-       struct ceph_osd_request *req;
-       u64 last_tid, next_tid = 0;
-
-       mutex_lock(&osdc->request_mutex);
-       last_tid = osdc->last_tid;
-       while (1) {
-               req = __lookup_request_ge(osdc, next_tid);
-               if (!req)
-                       break;
-               if (req->r_tid > last_tid)
-                       break;
-
-               next_tid = req->r_tid + 1;
-               if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0)
-                       continue;
-
-               ceph_osdc_get_request(req);
-               mutex_unlock(&osdc->request_mutex);
-               dout("sync waiting on tid %llu (last is %llu)\n",
-                    req->r_tid, last_tid);
-               wait_for_completion(&req->r_safe_completion);
-               mutex_lock(&osdc->request_mutex);
-               ceph_osdc_put_request(req);
-       }
-       mutex_unlock(&osdc->request_mutex);
-       dout("sync done (thru tid %llu)\n", last_tid);
-}
-
-/*
- * init, shutdown
- */
-int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
-{
-       int err;
-
-       dout("init\n");
-       osdc->client = client;
-       osdc->osdmap = NULL;
-       init_rwsem(&osdc->map_sem);
-       init_completion(&osdc->map_waiters);
-       osdc->last_requested_map = 0;
-       mutex_init(&osdc->request_mutex);
-       osdc->last_tid = 0;
-       osdc->osds = RB_ROOT;
-       INIT_LIST_HEAD(&osdc->osd_lru);
-       osdc->requests = RB_ROOT;
-       INIT_LIST_HEAD(&osdc->req_lru);
-       osdc->num_requests = 0;
-       INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
-       INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
-
-       schedule_delayed_work(&osdc->osds_timeout_work,
-          round_jiffies_relative(osdc->client->mount_args->osd_idle_ttl * HZ));
-
-       err = -ENOMEM;
-       osdc->req_mempool = mempool_create_kmalloc_pool(10,
-                                       sizeof(struct ceph_osd_request));
-       if (!osdc->req_mempool)
-               goto out;
-
-       err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true,
-                               "osd_op");
-       if (err < 0)
-               goto out_mempool;
-       err = ceph_msgpool_init(&osdc->msgpool_op_reply,
-                               OSD_OPREPLY_FRONT_LEN, 10, true,
-                               "osd_op_reply");
-       if (err < 0)
-               goto out_msgpool;
-       return 0;
-
-out_msgpool:
-       ceph_msgpool_destroy(&osdc->msgpool_op);
-out_mempool:
-       mempool_destroy(osdc->req_mempool);
-out:
-       return err;
-}
-
-void ceph_osdc_stop(struct ceph_osd_client *osdc)
-{
-       cancel_delayed_work_sync(&osdc->timeout_work);
-       cancel_delayed_work_sync(&osdc->osds_timeout_work);
-       if (osdc->osdmap) {
-               ceph_osdmap_destroy(osdc->osdmap);
-               osdc->osdmap = NULL;
-       }
-       remove_old_osds(osdc, 1);
-       mempool_destroy(osdc->req_mempool);
-       ceph_msgpool_destroy(&osdc->msgpool_op);
-       ceph_msgpool_destroy(&osdc->msgpool_op_reply);
-}
-
-/*
- * Read some contiguous pages.  If we cross a stripe boundary, shorten
- * *plen.  Return number of bytes read, or error.
- */
-int ceph_osdc_readpages(struct ceph_osd_client *osdc,
-                       struct ceph_vino vino, struct ceph_file_layout *layout,
-                       u64 off, u64 *plen,
-                       u32 truncate_seq, u64 truncate_size,
-                       struct page **pages, int num_pages)
-{
-       struct ceph_osd_request *req;
-       int rc = 0;
-
-       dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
-            vino.snap, off, *plen);
-       req = ceph_osdc_new_request(osdc, layout, vino, off, plen,
-                                   CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
-                                   NULL, 0, truncate_seq, truncate_size, NULL,
-                                   false, 1);
-       if (!req)
-               return -ENOMEM;
-
-       /* it may be a short read due to an object boundary */
-       req->r_pages = pages;
-
-       dout("readpages  final extent is %llu~%llu (%d pages)\n",
-            off, *plen, req->r_num_pages);
-
-       rc = ceph_osdc_start_request(osdc, req, false);
-       if (!rc)
-               rc = ceph_osdc_wait_request(osdc, req);
-
-       ceph_osdc_put_request(req);
-       dout("readpages result %d\n", rc);
-       return rc;
-}
-
-/*
- * do a synchronous write on N pages
- */
-int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
-                        struct ceph_file_layout *layout,
-                        struct ceph_snap_context *snapc,
-                        u64 off, u64 len,
-                        u32 truncate_seq, u64 truncate_size,
-                        struct timespec *mtime,
-                        struct page **pages, int num_pages,
-                        int flags, int do_sync, bool nofail)
-{
-       struct ceph_osd_request *req;
-       int rc = 0;
-
-       BUG_ON(vino.snap != CEPH_NOSNAP);
-       req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
-                                   CEPH_OSD_OP_WRITE,
-                                   flags | CEPH_OSD_FLAG_ONDISK |
-                                           CEPH_OSD_FLAG_WRITE,
-                                   snapc, do_sync,
-                                   truncate_seq, truncate_size, mtime,
-                                   nofail, 1);
-       if (!req)
-               return -ENOMEM;
-
-       /* it may be a short write due to an object boundary */
-       req->r_pages = pages;
-       dout("writepages %llu~%llu (%d pages)\n", off, len,
-            req->r_num_pages);
-
-       rc = ceph_osdc_start_request(osdc, req, nofail);
-       if (!rc)
-               rc = ceph_osdc_wait_request(osdc, req);
-
-       ceph_osdc_put_request(req);
-       if (rc == 0)
-               rc = len;
-       dout("writepages result %d\n", rc);
-       return rc;
-}
-
-/*
- * handle incoming message
- */
-static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
-{
-       struct ceph_osd *osd = con->private;
-       struct ceph_osd_client *osdc;
-       int type = le16_to_cpu(msg->hdr.type);
-
-       if (!osd)
-               goto out;
-       osdc = osd->o_osdc;
-
-       switch (type) {
-       case CEPH_MSG_OSD_MAP:
-               ceph_osdc_handle_map(osdc, msg);
-               break;
-       case CEPH_MSG_OSD_OPREPLY:
-               handle_reply(osdc, msg, con);
-               break;
-
-       default:
-               pr_err("received unknown message type %d %s\n", type,
-                      ceph_msg_type_name(type));
-       }
-out:
-       ceph_msg_put(msg);
-}
-
-/*
- * lookup and return message for incoming reply.  set up reply message
- * pages.
- */
-static struct ceph_msg *get_reply(struct ceph_connection *con,
-                                 struct ceph_msg_header *hdr,
-                                 int *skip)
-{
-       struct ceph_osd *osd = con->private;
-       struct ceph_osd_client *osdc = osd->o_osdc;
-       struct ceph_msg *m;
-       struct ceph_osd_request *req;
-       int front = le32_to_cpu(hdr->front_len);
-       int data_len = le32_to_cpu(hdr->data_len);
-       u64 tid;
-
-       tid = le64_to_cpu(hdr->tid);
-       mutex_lock(&osdc->request_mutex);
-       req = __lookup_request(osdc, tid);
-       if (!req) {
-               *skip = 1;
-               m = NULL;
-               pr_info("get_reply unknown tid %llu from osd%d\n", tid,
-                       osd->o_osd);
-               goto out;
-       }
-
-       if (req->r_con_filling_msg) {
-               dout("get_reply revoking msg %p from old con %p\n",
-                    req->r_reply, req->r_con_filling_msg);
-               ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
-               ceph_con_put(req->r_con_filling_msg);
-               req->r_con_filling_msg = NULL;
-       }
-
-       if (front > req->r_reply->front.iov_len) {
-               pr_warning("get_reply front %d > preallocated %d\n",
-                          front, (int)req->r_reply->front.iov_len);
-               m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS);
-               if (!m)
-                       goto out;
-               ceph_msg_put(req->r_reply);
-               req->r_reply = m;
-       }
-       m = ceph_msg_get(req->r_reply);
-
-       if (data_len > 0) {
-               unsigned data_off = le16_to_cpu(hdr->data_off);
-               int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
-
-               if (unlikely(req->r_num_pages < want)) {
-                       pr_warning("tid %lld reply %d > expected %d pages\n",
-                                  tid, want, m->nr_pages);
-                       *skip = 1;
-                       ceph_msg_put(m);
-                       m = NULL;
-                       goto out;
-               }
-               m->pages = req->r_pages;
-               m->nr_pages = req->r_num_pages;
-       }
-       *skip = 0;
-       req->r_con_filling_msg = ceph_con_get(con);
-       dout("get_reply tid %lld %p\n", tid, m);
-
-out:
-       mutex_unlock(&osdc->request_mutex);
-       return m;
-
-}
-
-static struct ceph_msg *alloc_msg(struct ceph_connection *con,
-                                 struct ceph_msg_header *hdr,
-                                 int *skip)
-{
-       struct ceph_osd *osd = con->private;
-       int type = le16_to_cpu(hdr->type);
-       int front = le32_to_cpu(hdr->front_len);
-
-       switch (type) {
-       case CEPH_MSG_OSD_MAP:
-               return ceph_msg_new(type, front, GFP_NOFS);
-       case CEPH_MSG_OSD_OPREPLY:
-               return get_reply(con, hdr, skip);
-       default:
-               pr_info("alloc_msg unexpected msg type %d from osd%d\n", type,
-                       osd->o_osd);
-               *skip = 1;
-               return NULL;
-       }
-}
-
-/*
- * Wrappers to refcount containing ceph_osd struct
- */
-static struct ceph_connection *get_osd_con(struct ceph_connection *con)
-{
-       struct ceph_osd *osd = con->private;
-       if (get_osd(osd))
-               return con;
-       return NULL;
-}
-
-static void put_osd_con(struct ceph_connection *con)
-{
-       struct ceph_osd *osd = con->private;
-       put_osd(osd);
-}
-
-/*
- * authentication
- */
-static int get_authorizer(struct ceph_connection *con,
-                         void **buf, int *len, int *proto,
-                         void **reply_buf, int *reply_len, int force_new)
-{
-       struct ceph_osd *o = con->private;
-       struct ceph_osd_client *osdc = o->o_osdc;
-       struct ceph_auth_client *ac = osdc->client->monc.auth;
-       int ret = 0;
-
-       if (force_new && o->o_authorizer) {
-               ac->ops->destroy_authorizer(ac, o->o_authorizer);
-               o->o_authorizer = NULL;
-       }
-       if (o->o_authorizer == NULL) {
-               ret = ac->ops->create_authorizer(
-                       ac, CEPH_ENTITY_TYPE_OSD,
-                       &o->o_authorizer,
-                       &o->o_authorizer_buf,
-                       &o->o_authorizer_buf_len,
-                       &o->o_authorizer_reply_buf,
-                       &o->o_authorizer_reply_buf_len);
-               if (ret)
-                       return ret;
-       }
-
-       *proto = ac->protocol;
-       *buf = o->o_authorizer_buf;
-       *len = o->o_authorizer_buf_len;
-       *reply_buf = o->o_authorizer_reply_buf;
-       *reply_len = o->o_authorizer_reply_buf_len;
-       return 0;
-}
-
-
-static int verify_authorizer_reply(struct ceph_connection *con, int len)
-{
-       struct ceph_osd *o = con->private;
-       struct ceph_osd_client *osdc = o->o_osdc;
-       struct ceph_auth_client *ac = osdc->client->monc.auth;
-
-       return ac->ops->verify_authorizer_reply(ac, o->o_authorizer, len);
-}
-
-static int invalidate_authorizer(struct ceph_connection *con)
-{
-       struct ceph_osd *o = con->private;
-       struct ceph_osd_client *osdc = o->o_osdc;
-       struct ceph_auth_client *ac = osdc->client->monc.auth;
-
-       if (ac->ops->invalidate_authorizer)
-               ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD);
-
-       return ceph_monc_validate_auth(&osdc->client->monc);
-}
-
-static const struct ceph_connection_operations osd_con_ops = {
-       .get = get_osd_con,
-       .put = put_osd_con,
-       .dispatch = dispatch,
-       .get_authorizer = get_authorizer,
-       .verify_authorizer_reply = verify_authorizer_reply,
-       .invalidate_authorizer = invalidate_authorizer,
-       .alloc_msg = alloc_msg,
-       .fault = osd_reset,
-};
diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h

deleted file mode 100644 (file)

index ce77698..0000000
--- a/fs/ceph/osd_client.h
+++ /dev/null
@@ -1,167 +0,0 @@
-#ifndef _FS_CEPH_OSD_CLIENT_H
-#define _FS_CEPH_OSD_CLIENT_H
-
-#include <linux/completion.h>
-#include <linux/kref.h>
-#include <linux/mempool.h>
-#include <linux/rbtree.h>
-
-#include "types.h"
-#include "osdmap.h"
-#include "messenger.h"
-
-struct ceph_msg;
-struct ceph_snap_context;
-struct ceph_osd_request;
-struct ceph_osd_client;
-struct ceph_authorizer;
-
-/*
- * completion callback for async writepages
- */
-typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *,
-                                    struct ceph_msg *);
-
-/* a given osd we're communicating with */
-struct ceph_osd {
-       atomic_t o_ref;
-       struct ceph_osd_client *o_osdc;
-       int o_osd;
-       int o_incarnation;
-       struct rb_node o_node;
-       struct ceph_connection o_con;
-       struct list_head o_requests;
-       struct list_head o_osd_lru;
-       struct ceph_authorizer *o_authorizer;
-       void *o_authorizer_buf, *o_authorizer_reply_buf;
-       size_t o_authorizer_buf_len, o_authorizer_reply_buf_len;
-       unsigned long lru_ttl;
-       int o_marked_for_keepalive;
-       struct list_head o_keepalive_item;
-};
-
-/* an in-flight request */
-struct ceph_osd_request {
-       u64             r_tid;              /* unique for this client */
-       struct rb_node  r_node;
-       struct list_head r_req_lru_item;
-       struct list_head r_osd_item;
-       struct ceph_osd *r_osd;
-       struct ceph_pg   r_pgid;
-       int              r_pg_osds[CEPH_PG_MAX_SIZE];
-       int              r_num_pg_osds;
-
-       struct ceph_connection *r_con_filling_msg;
-
-       struct ceph_msg  *r_request, *r_reply;
-       int               r_result;
-       int               r_flags;     /* any additional flags for the osd */
-       u32               r_sent;      /* >0 if r_request is sending/sent */
-       int               r_got_reply;
-
-       struct ceph_osd_client *r_osdc;
-       struct kref       r_kref;
-       bool              r_mempool;
-       struct completion r_completion, r_safe_completion;
-       ceph_osdc_callback_t r_callback, r_safe_callback;
-       struct ceph_eversion r_reassert_version;
-       struct list_head  r_unsafe_item;
-
-       struct inode *r_inode;                /* for use by callbacks */
-
-       char              r_oid[40];          /* object name */
-       int               r_oid_len;
-       unsigned long     r_stamp;            /* send OR check time */
-       bool              r_resend;           /* msg send failed, needs retry */
-
-       struct ceph_file_layout r_file_layout;
-       struct ceph_snap_context *r_snapc;    /* snap context for writes */
-       unsigned          r_num_pages;        /* size of page array (follows) */
-       struct page     **r_pages;            /* pages for data payload */
-       int               r_pages_from_pool;
-       int               r_own_pages;        /* if true, i own page list */
-};
-
-struct ceph_osd_client {
-       struct ceph_client     *client;
-
-       struct ceph_osdmap     *osdmap;       /* current map */
-       struct rw_semaphore    map_sem;
-       struct completion      map_waiters;
-       u64                    last_requested_map;
-
-       struct mutex           request_mutex;
-       struct rb_root         osds;          /* osds */
-       struct list_head       osd_lru;       /* idle osds */
-       u64                    timeout_tid;   /* tid of timeout triggering rq */
-       u64                    last_tid;      /* tid of last request */
-       struct rb_root         requests;      /* pending requests */
-       struct list_head       req_lru;       /* pending requests lru */
-       int                    num_requests;
-       struct delayed_work    timeout_work;
-       struct delayed_work    osds_timeout_work;
-#ifdef CONFIG_DEBUG_FS
-       struct dentry          *debugfs_file;
-#endif
-
-       mempool_t              *req_mempool;
-
-       struct ceph_msgpool     msgpool_op;
-       struct ceph_msgpool     msgpool_op_reply;
-};
-
-extern int ceph_osdc_init(struct ceph_osd_client *osdc,
-                         struct ceph_client *client);
-extern void ceph_osdc_stop(struct ceph_osd_client *osdc);
-
-extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
-                                  struct ceph_msg *msg);
-extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
-                                struct ceph_msg *msg);
-
-extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
-                                     struct ceph_file_layout *layout,
-                                     struct ceph_vino vino,
-                                     u64 offset, u64 *len, int op, int flags,
-                                     struct ceph_snap_context *snapc,
-                                     int do_sync, u32 truncate_seq,
-                                     u64 truncate_size,
-                                     struct timespec *mtime,
-                                     bool use_mempool, int num_reply);
-
-static inline void ceph_osdc_get_request(struct ceph_osd_request *req)
-{
-       kref_get(&req->r_kref);
-}
-extern void ceph_osdc_release_request(struct kref *kref);
-static inline void ceph_osdc_put_request(struct ceph_osd_request *req)
-{
-       kref_put(&req->r_kref, ceph_osdc_release_request);
-}
-
-extern int ceph_osdc_start_request(struct ceph_osd_client *osdc,
-                                  struct ceph_osd_request *req,
-                                  bool nofail);
-extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
-                                 struct ceph_osd_request *req);
-extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
-
-extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
-                              struct ceph_vino vino,
-                              struct ceph_file_layout *layout,
-                              u64 off, u64 *plen,
-                              u32 truncate_seq, u64 truncate_size,
-                              struct page **pages, int nr_pages);
-
-extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
-                               struct ceph_vino vino,
-                               struct ceph_file_layout *layout,
-                               struct ceph_snap_context *sc,
-                               u64 off, u64 len,
-                               u32 truncate_seq, u64 truncate_size,
-                               struct timespec *mtime,
-                               struct page **pages, int nr_pages,
-                               int flags, int do_sync, bool nofail);
-
-#endif
-
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c

deleted file mode 100644 (file)

index e31f118..0000000
--- a/fs/ceph/osdmap.c
+++ /dev/null
@@ -1,1110 +0,0 @@
-
-#include "ceph_debug.h"
-
-#include <linux/slab.h>
-#include <asm/div64.h>
-
-#include "super.h"
-#include "osdmap.h"
-#include "crush/hash.h"
-#include "crush/mapper.h"
-#include "decode.h"
-
-char *ceph_osdmap_state_str(char *str, int len, int state)
-{
-       int flag = 0;
-
-       if (!len)
-               goto done;
-
-       *str = '\0';
-       if (state) {
-               if (state & CEPH_OSD_EXISTS) {
-                       snprintf(str, len, "exists");
-                       flag = 1;
-               }
-               if (state & CEPH_OSD_UP) {
-                       snprintf(str, len, "%s%s%s", str, (flag ? ", " : ""),
-                                "up");
-                       flag = 1;
-               }
-       } else {
-               snprintf(str, len, "doesn't exist");
-       }
-done:
-       return str;
-}
-
-/* maps */
-
-static int calc_bits_of(unsigned t)
-{
-       int b = 0;
-       while (t) {
-               t = t >> 1;
-               b++;
-       }
-       return b;
-}
-
-/*
- * the foo_mask is the smallest value 2^n-1 that is >= foo.
- */
-static void calc_pg_masks(struct ceph_pg_pool_info *pi)
-{
-       pi->pg_num_mask = (1 << calc_bits_of(le32_to_cpu(pi->v.pg_num)-1)) - 1;
-       pi->pgp_num_mask =
-               (1 << calc_bits_of(le32_to_cpu(pi->v.pgp_num)-1)) - 1;
-       pi->lpg_num_mask =
-               (1 << calc_bits_of(le32_to_cpu(pi->v.lpg_num)-1)) - 1;
-       pi->lpgp_num_mask =
-               (1 << calc_bits_of(le32_to_cpu(pi->v.lpgp_num)-1)) - 1;
-}
-
-/*
- * decode crush map
- */
-static int crush_decode_uniform_bucket(void **p, void *end,
-                                      struct crush_bucket_uniform *b)
-{
-       dout("crush_decode_uniform_bucket %p to %p\n", *p, end);
-       ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad);
-       b->item_weight = ceph_decode_32(p);
-       return 0;
-bad:
-       return -EINVAL;
-}
-
-static int crush_decode_list_bucket(void **p, void *end,
-                                   struct crush_bucket_list *b)
-{
-       int j;
-       dout("crush_decode_list_bucket %p to %p\n", *p, end);
-       b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
-       if (b->item_weights == NULL)
-               return -ENOMEM;
-       b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
-       if (b->sum_weights == NULL)
-               return -ENOMEM;
-       ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
-       for (j = 0; j < b->h.size; j++) {
-               b->item_weights[j] = ceph_decode_32(p);
-               b->sum_weights[j] = ceph_decode_32(p);
-       }
-       return 0;
-bad:
-       return -EINVAL;
-}
-
-static int crush_decode_tree_bucket(void **p, void *end,
-                                   struct crush_bucket_tree *b)
-{
-       int j;
-       dout("crush_decode_tree_bucket %p to %p\n", *p, end);
-       ceph_decode_32_safe(p, end, b->num_nodes, bad);
-       b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS);
-       if (b->node_weights == NULL)
-               return -ENOMEM;
-       ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad);
-       for (j = 0; j < b->num_nodes; j++)
-               b->node_weights[j] = ceph_decode_32(p);
-       return 0;
-bad:
-       return -EINVAL;
-}
-
-static int crush_decode_straw_bucket(void **p, void *end,
-                                    struct crush_bucket_straw *b)
-{
-       int j;
-       dout("crush_decode_straw_bucket %p to %p\n", *p, end);
-       b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
-       if (b->item_weights == NULL)
-               return -ENOMEM;
-       b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
-       if (b->straws == NULL)
-               return -ENOMEM;
-       ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
-       for (j = 0; j < b->h.size; j++) {
-               b->item_weights[j] = ceph_decode_32(p);
-               b->straws[j] = ceph_decode_32(p);
-       }
-       return 0;
-bad:
-       return -EINVAL;
-}
-
-static struct crush_map *crush_decode(void *pbyval, void *end)
-{
-       struct crush_map *c;
-       int err = -EINVAL;
-       int i, j;
-       void **p = &pbyval;
-       void *start = pbyval;
-       u32 magic;
-
-       dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
-
-       c = kzalloc(sizeof(*c), GFP_NOFS);
-       if (c == NULL)
-               return ERR_PTR(-ENOMEM);
-
-       ceph_decode_need(p, end, 4*sizeof(u32), bad);
-       magic = ceph_decode_32(p);
-       if (magic != CRUSH_MAGIC) {
-               pr_err("crush_decode magic %x != current %x\n",
-                      (unsigned)magic, (unsigned)CRUSH_MAGIC);
-               goto bad;
-       }
-       c->max_buckets = ceph_decode_32(p);
-       c->max_rules = ceph_decode_32(p);
-       c->max_devices = ceph_decode_32(p);
-
-       c->device_parents = kcalloc(c->max_devices, sizeof(u32), GFP_NOFS);
-       if (c->device_parents == NULL)
-               goto badmem;
-       c->bucket_parents = kcalloc(c->max_buckets, sizeof(u32), GFP_NOFS);
-       if (c->bucket_parents == NULL)
-               goto badmem;
-
-       c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS);
-       if (c->buckets == NULL)
-               goto badmem;
-       c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS);
-       if (c->rules == NULL)
-               goto badmem;
-
-       /* buckets */
-       for (i = 0; i < c->max_buckets; i++) {
-               int size = 0;
-               u32 alg;
-               struct crush_bucket *b;
-
-               ceph_decode_32_safe(p, end, alg, bad);
-               if (alg == 0) {
-                       c->buckets[i] = NULL;
-                       continue;
-               }
-               dout("crush_decode bucket %d off %x %p to %p\n",
-                    i, (int)(*p-start), *p, end);
-
-               switch (alg) {
-               case CRUSH_BUCKET_UNIFORM:
-                       size = sizeof(struct crush_bucket_uniform);
-                       break;
-               case CRUSH_BUCKET_LIST:
-                       size = sizeof(struct crush_bucket_list);
-                       break;
-               case CRUSH_BUCKET_TREE:
-                       size = sizeof(struct crush_bucket_tree);
-                       break;
-               case CRUSH_BUCKET_STRAW:
-                       size = sizeof(struct crush_bucket_straw);
-                       break;
-               default:
-                       err = -EINVAL;
-                       goto bad;
-               }
-               BUG_ON(size == 0);
-               b = c->buckets[i] = kzalloc(size, GFP_NOFS);
-               if (b == NULL)
-                       goto badmem;
-
-               ceph_decode_need(p, end, 4*sizeof(u32), bad);
-               b->id = ceph_decode_32(p);
-               b->type = ceph_decode_16(p);
-               b->alg = ceph_decode_8(p);
-               b->hash = ceph_decode_8(p);
-               b->weight = ceph_decode_32(p);
-               b->size = ceph_decode_32(p);
-
-               dout("crush_decode bucket size %d off %x %p to %p\n",
-                    b->size, (int)(*p-start), *p, end);
-
-               b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
-               if (b->items == NULL)
-                       goto badmem;
-               b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS);
-               if (b->perm == NULL)
-                       goto badmem;
-               b->perm_n = 0;
-
-               ceph_decode_need(p, end, b->size*sizeof(u32), bad);
-               for (j = 0; j < b->size; j++)
-                       b->items[j] = ceph_decode_32(p);
-
-               switch (b->alg) {
-               case CRUSH_BUCKET_UNIFORM:
-                       err = crush_decode_uniform_bucket(p, end,
-                                 (struct crush_bucket_uniform *)b);
-                       if (err < 0)
-                               goto bad;
-                       break;
-               case CRUSH_BUCKET_LIST:
-                       err = crush_decode_list_bucket(p, end,
-                              (struct crush_bucket_list *)b);
-                       if (err < 0)
-                               goto bad;
-                       break;
-               case CRUSH_BUCKET_TREE:
-                       err = crush_decode_tree_bucket(p, end,
-                               (struct crush_bucket_tree *)b);
-                       if (err < 0)
-                               goto bad;
-                       break;
-               case CRUSH_BUCKET_STRAW:
-                       err = crush_decode_straw_bucket(p, end,
-                               (struct crush_bucket_straw *)b);
-                       if (err < 0)
-                               goto bad;
-                       break;
-               }
-       }
-
-       /* rules */
-       dout("rule vec is %p\n", c->rules);
-       for (i = 0; i < c->max_rules; i++) {
-               u32 yes;
-               struct crush_rule *r;
-
-               ceph_decode_32_safe(p, end, yes, bad);
-               if (!yes) {
-                       dout("crush_decode NO rule %d off %x %p to %p\n",
-                            i, (int)(*p-start), *p, end);
-                       c->rules[i] = NULL;
-                       continue;
-               }
-
-               dout("crush_decode rule %d off %x %p to %p\n",
-                    i, (int)(*p-start), *p, end);
-
-               /* len */
-               ceph_decode_32_safe(p, end, yes, bad);
-#if BITS_PER_LONG == 32
-               err = -EINVAL;
-               if (yes > ULONG_MAX / sizeof(struct crush_rule_step))
-                       goto bad;
-#endif
-               r = c->rules[i] = kmalloc(sizeof(*r) +
-                                         yes*sizeof(struct crush_rule_step),
-                                         GFP_NOFS);
-               if (r == NULL)
-                       goto badmem;
-               dout(" rule %d is at %p\n", i, r);
-               r->len = yes;
-               ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */
-               ceph_decode_need(p, end, r->len*3*sizeof(u32), bad);
-               for (j = 0; j < r->len; j++) {
-                       r->steps[j].op = ceph_decode_32(p);
-                       r->steps[j].arg1 = ceph_decode_32(p);
-                       r->steps[j].arg2 = ceph_decode_32(p);
-               }
-       }
-
-       /* ignore trailing name maps. */
-
-       dout("crush_decode success\n");
-       return c;
-
-badmem:
-       err = -ENOMEM;
-bad:
-       dout("crush_decode fail %d\n", err);
-       crush_destroy(c);
-       return ERR_PTR(err);
-}
-
-/*
- * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
- * to a set of osds)
- */
-static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
-{
-       u64 a = *(u64 *)&l;
-       u64 b = *(u64 *)&r;
-
-       if (a < b)
-               return -1;
-       if (a > b)
-               return 1;
-       return 0;
-}
-
-static int __insert_pg_mapping(struct ceph_pg_mapping *new,
-                              struct rb_root *root)
-{
-       struct rb_node **p = &root->rb_node;
-       struct rb_node *parent = NULL;
-       struct ceph_pg_mapping *pg = NULL;
-       int c;
-
-       while (*p) {
-               parent = *p;
-               pg = rb_entry(parent, struct ceph_pg_mapping, node);
-               c = pgid_cmp(new->pgid, pg->pgid);
-               if (c < 0)
-                       p = &(*p)->rb_left;
-               else if (c > 0)
-                       p = &(*p)->rb_right;
-               else
-                       return -EEXIST;
-       }
-
-       rb_link_node(&new->node, parent, p);
-       rb_insert_color(&new->node, root);
-       return 0;
-}
-
-static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
-                                                  struct ceph_pg pgid)
-{
-       struct rb_node *n = root->rb_node;
-       struct ceph_pg_mapping *pg;
-       int c;
-
-       while (n) {
-               pg = rb_entry(n, struct ceph_pg_mapping, node);
-               c = pgid_cmp(pgid, pg->pgid);
-               if (c < 0)
-                       n = n->rb_left;
-               else if (c > 0)
-                       n = n->rb_right;
-               else
-                       return pg;
-       }
-       return NULL;
-}
-
-/*
- * rbtree of pg pool info
- */
-static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new)
-{
-       struct rb_node **p = &root->rb_node;
-       struct rb_node *parent = NULL;
-       struct ceph_pg_pool_info *pi = NULL;
-
-       while (*p) {
-               parent = *p;
-               pi = rb_entry(parent, struct ceph_pg_pool_info, node);
-               if (new->id < pi->id)
-                       p = &(*p)->rb_left;
-               else if (new->id > pi->id)
-                       p = &(*p)->rb_right;
-               else
-                       return -EEXIST;
-       }
-
-       rb_link_node(&new->node, parent, p);
-       rb_insert_color(&new->node, root);
-       return 0;
-}
-
-static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
-{
-       struct ceph_pg_pool_info *pi;
-       struct rb_node *n = root->rb_node;
-
-       while (n) {
-               pi = rb_entry(n, struct ceph_pg_pool_info, node);
-               if (id < pi->id)
-                       n = n->rb_left;
-               else if (id > pi->id)
-                       n = n->rb_right;
-               else
-                       return pi;
-       }
-       return NULL;
-}
-
-static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
-{
-       rb_erase(&pi->node, root);
-       kfree(pi->name);
-       kfree(pi);
-}
-
-static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
-{
-       unsigned n, m;
-
-       ceph_decode_copy(p, &pi->v, sizeof(pi->v));
-       calc_pg_masks(pi);
-
-       /* num_snaps * snap_info_t */
-       n = le32_to_cpu(pi->v.num_snaps);
-       while (n--) {
-               ceph_decode_need(p, end, sizeof(u64) + 1 + sizeof(u64) +
-                                sizeof(struct ceph_timespec), bad);
-               *p += sizeof(u64) +       /* key */
-                       1 + sizeof(u64) + /* u8, snapid */
-                       sizeof(struct ceph_timespec);
-               m = ceph_decode_32(p);    /* snap name */
-               *p += m;
-       }
-
-       *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2;
-       return 0;
-
-bad:
-       return -EINVAL;
-}
-
-static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
-{
-       struct ceph_pg_pool_info *pi;
-       u32 num, len, pool;
-
-       ceph_decode_32_safe(p, end, num, bad);
-       dout(" %d pool names\n", num);
-       while (num--) {
-               ceph_decode_32_safe(p, end, pool, bad);
-               ceph_decode_32_safe(p, end, len, bad);
-               dout("  pool %d len %d\n", pool, len);
-               pi = __lookup_pg_pool(&map->pg_pools, pool);
-               if (pi) {
-                       kfree(pi->name);
-                       pi->name = kmalloc(len + 1, GFP_NOFS);
-                       if (pi->name) {
-                               memcpy(pi->name, *p, len);
-                               pi->name[len] = '\0';
-                               dout("  name is %s\n", pi->name);
-                       }
-               }
-               *p += len;
-       }
-       return 0;
-
-bad:
-       return -EINVAL;
-}
-
-/*
- * osd map
- */
-void ceph_osdmap_destroy(struct ceph_osdmap *map)
-{
-       dout("osdmap_destroy %p\n", map);
-       if (map->crush)
-               crush_destroy(map->crush);
-       while (!RB_EMPTY_ROOT(&map->pg_temp)) {
-               struct ceph_pg_mapping *pg =
-                       rb_entry(rb_first(&map->pg_temp),
-                                struct ceph_pg_mapping, node);
-               rb_erase(&pg->node, &map->pg_temp);
-               kfree(pg);
-       }
-       while (!RB_EMPTY_ROOT(&map->pg_pools)) {
-               struct ceph_pg_pool_info *pi =
-                       rb_entry(rb_first(&map->pg_pools),
-                                struct ceph_pg_pool_info, node);
-               __remove_pg_pool(&map->pg_pools, pi);
-       }
-       kfree(map->osd_state);
-       kfree(map->osd_weight);
-       kfree(map->osd_addr);
-       kfree(map);
-}
-
-/*
- * adjust max osd value.  reallocate arrays.
- */
-static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
-{
-       u8 *state;
-       struct ceph_entity_addr *addr;
-       u32 *weight;
-
-       state = kcalloc(max, sizeof(*state), GFP_NOFS);
-       addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
-       weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
-       if (state == NULL || addr == NULL || weight == NULL) {
-               kfree(state);
-               kfree(addr);
-               kfree(weight);
-               return -ENOMEM;
-       }
-
-       /* copy old? */
-       if (map->osd_state) {
-               memcpy(state, map->osd_state, map->max_osd*sizeof(*state));
-               memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr));
-               memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
-               kfree(map->osd_state);
-               kfree(map->osd_addr);
-               kfree(map->osd_weight);
-       }
-
-       map->osd_state = state;
-       map->osd_weight = weight;
-       map->osd_addr = addr;
-       map->max_osd = max;
-       return 0;
-}
-
-/*
- * decode a full map.
- */
-struct ceph_osdmap *osdmap_decode(void **p, void *end)
-{
-       struct ceph_osdmap *map;
-       u16 version;
-       u32 len, max, i;
-       u8 ev;
-       int err = -EINVAL;
-       void *start = *p;
-       struct ceph_pg_pool_info *pi;
-
-       dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p));
-
-       map = kzalloc(sizeof(*map), GFP_NOFS);
-       if (map == NULL)
-               return ERR_PTR(-ENOMEM);
-       map->pg_temp = RB_ROOT;
-
-       ceph_decode_16_safe(p, end, version, bad);
-       if (version > CEPH_OSDMAP_VERSION) {
-               pr_warning("got unknown v %d > %d of osdmap\n", version,
-                          CEPH_OSDMAP_VERSION);
-               goto bad;
-       }
-
-       ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad);
-       ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
-       map->epoch = ceph_decode_32(p);
-       ceph_decode_copy(p, &map->created, sizeof(map->created));
-       ceph_decode_copy(p, &map->modified, sizeof(map->modified));
-
-       ceph_decode_32_safe(p, end, max, bad);
-       while (max--) {
-               ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad);
-               pi = kzalloc(sizeof(*pi), GFP_NOFS);
-               if (!pi)
-                       goto bad;
-               pi->id = ceph_decode_32(p);
-               ev = ceph_decode_8(p); /* encoding version */
-               if (ev > CEPH_PG_POOL_VERSION) {
-                       pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
-                                  ev, CEPH_PG_POOL_VERSION);
-                       kfree(pi);
-                       goto bad;
-               }
-               err = __decode_pool(p, end, pi);
-               if (err < 0)
-                       goto bad;
-               __insert_pg_pool(&map->pg_pools, pi);
-       }
-
-       if (version >= 5 && __decode_pool_names(p, end, map) < 0)
-               goto bad;
-
-       ceph_decode_32_safe(p, end, map->pool_max, bad);
-
-       ceph_decode_32_safe(p, end, map->flags, bad);
-
-       max = ceph_decode_32(p);
-
-       /* (re)alloc osd arrays */
-       err = osdmap_set_max_osd(map, max);
-       if (err < 0)
-               goto bad;
-       dout("osdmap_decode max_osd = %d\n", map->max_osd);
-
-       /* osds */
-       err = -EINVAL;
-       ceph_decode_need(p, end, 3*sizeof(u32) +
-                        map->max_osd*(1 + sizeof(*map->osd_weight) +
-                                      sizeof(*map->osd_addr)), bad);
-       *p += 4; /* skip length field (should match max) */
-       ceph_decode_copy(p, map->osd_state, map->max_osd);
-
-       *p += 4; /* skip length field (should match max) */
-       for (i = 0; i < map->max_osd; i++)
-               map->osd_weight[i] = ceph_decode_32(p);
-
-       *p += 4; /* skip length field (should match max) */
-       ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr));
-       for (i = 0; i < map->max_osd; i++)
-               ceph_decode_addr(&map->osd_addr[i]);
-
-       /* pg_temp */
-       ceph_decode_32_safe(p, end, len, bad);
-       for (i = 0; i < len; i++) {
-               int n, j;
-               struct ceph_pg pgid;
-               struct ceph_pg_mapping *pg;
-
-               ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad);
-               ceph_decode_copy(p, &pgid, sizeof(pgid));
-               n = ceph_decode_32(p);
-               ceph_decode_need(p, end, n * sizeof(u32), bad);
-               err = -ENOMEM;
-               pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS);
-               if (!pg)
-                       goto bad;
-               pg->pgid = pgid;
-               pg->len = n;
-               for (j = 0; j < n; j++)
-                       pg->osds[j] = ceph_decode_32(p);
-
-               err = __insert_pg_mapping(pg, &map->pg_temp);
-               if (err)
-                       goto bad;
-               dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, len);
-       }
-
-       /* crush */
-       ceph_decode_32_safe(p, end, len, bad);
-       dout("osdmap_decode crush len %d from off 0x%x\n", len,
-            (int)(*p - start));
-       ceph_decode_need(p, end, len, bad);
-       map->crush = crush_decode(*p, end);
-       *p += len;
-       if (IS_ERR(map->crush)) {
-               err = PTR_ERR(map->crush);
-               map->crush = NULL;
-               goto bad;
-       }
-
-       /* ignore the rest of the map */
-       *p = end;
-
-       dout("osdmap_decode done %p %p\n", *p, end);
-       return map;
-
-bad:
-       dout("osdmap_decode fail\n");
-       ceph_osdmap_destroy(map);
-       return ERR_PTR(err);
-}
-
-/*
- * decode and apply an incremental map update.
- */
-struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
-                                            struct ceph_osdmap *map,
-                                            struct ceph_messenger *msgr)
-{
-       struct crush_map *newcrush = NULL;
-       struct ceph_fsid fsid;
-       u32 epoch = 0;
-       struct ceph_timespec modified;
-       u32 len, pool;
-       __s32 new_pool_max, new_flags, max;
-       void *start = *p;
-       int err = -EINVAL;
-       u16 version;
-       struct rb_node *rbp;
-
-       ceph_decode_16_safe(p, end, version, bad);
-       if (version > CEPH_OSDMAP_INC_VERSION) {
-               pr_warning("got unknown v %d > %d of inc osdmap\n", version,
-                          CEPH_OSDMAP_INC_VERSION);
-               goto bad;
-       }
-
-       ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32),
-                        bad);
-       ceph_decode_copy(p, &fsid, sizeof(fsid));
-       epoch = ceph_decode_32(p);
-       BUG_ON(epoch != map->epoch+1);
-       ceph_decode_copy(p, &modified, sizeof(modified));
-       new_pool_max = ceph_decode_32(p);
-       new_flags = ceph_decode_32(p);
-
-       /* full map? */
-       ceph_decode_32_safe(p, end, len, bad);
-       if (len > 0) {
-               dout("apply_incremental full map len %d, %p to %p\n",
-                    len, *p, end);
-               return osdmap_decode(p, min(*p+len, end));
-       }
-
-       /* new crush? */
-       ceph_decode_32_safe(p, end, len, bad);
-       if (len > 0) {
-               dout("apply_incremental new crush map len %d, %p to %p\n",
-                    len, *p, end);
-               newcrush = crush_decode(*p, min(*p+len, end));
-               if (IS_ERR(newcrush))
-                       return ERR_CAST(newcrush);
-               *p += len;
-       }
-
-       /* new flags? */
-       if (new_flags >= 0)
-               map->flags = new_flags;
-       if (new_pool_max >= 0)
-               map->pool_max = new_pool_max;
-
-       ceph_decode_need(p, end, 5*sizeof(u32), bad);
-
-       /* new max? */
-       max = ceph_decode_32(p);
-       if (max >= 0) {
-               err = osdmap_set_max_osd(map, max);
-               if (err < 0)
-                       goto bad;
-       }
-
-       map->epoch++;
-       map->modified = map->modified;
-       if (newcrush) {
-               if (map->crush)
-                       crush_destroy(map->crush);
-               map->crush = newcrush;
-               newcrush = NULL;
-       }
-
-       /* new_pool */
-       ceph_decode_32_safe(p, end, len, bad);
-       while (len--) {
-               __u8 ev;
-               struct ceph_pg_pool_info *pi;
-
-               ceph_decode_32_safe(p, end, pool, bad);
-               ceph_decode_need(p, end, 1 + sizeof(pi->v), bad);
-               ev = ceph_decode_8(p);  /* encoding version */
-               if (ev > CEPH_PG_POOL_VERSION) {
-                       pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
-                                  ev, CEPH_PG_POOL_VERSION);
-                       goto bad;
-               }
-               pi = __lookup_pg_pool(&map->pg_pools, pool);
-               if (!pi) {
-                       pi = kzalloc(sizeof(*pi), GFP_NOFS);
-                       if (!pi) {
-                               err = -ENOMEM;
-                               goto bad;
-                       }
-                       pi->id = pool;
-                       __insert_pg_pool(&map->pg_pools, pi);
-               }
-               err = __decode_pool(p, end, pi);
-               if (err < 0)
-                       goto bad;
-       }
-       if (version >= 5 && __decode_pool_names(p, end, map) < 0)
-               goto bad;
-
-       /* old_pool */
-       ceph_decode_32_safe(p, end, len, bad);
-       while (len--) {
-               struct ceph_pg_pool_info *pi;
-
-               ceph_decode_32_safe(p, end, pool, bad);
-               pi = __lookup_pg_pool(&map->pg_pools, pool);
-               if (pi)
-                       __remove_pg_pool(&map->pg_pools, pi);
-       }
-
-       /* new_up */
-       err = -EINVAL;
-       ceph_decode_32_safe(p, end, len, bad);
-       while (len--) {
-               u32 osd;
-               struct ceph_entity_addr addr;
-               ceph_decode_32_safe(p, end, osd, bad);
-               ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad);
-               ceph_decode_addr(&addr);
-               pr_info("osd%d up\n", osd);
-               BUG_ON(osd >= map->max_osd);
-               map->osd_state[osd] |= CEPH_OSD_UP;
-               map->osd_addr[osd] = addr;
-       }
-
-       /* new_down */
-       ceph_decode_32_safe(p, end, len, bad);
-       while (len--) {
-               u32 osd;
-               ceph_decode_32_safe(p, end, osd, bad);
-               (*p)++;  /* clean flag */
-               pr_info("osd%d down\n", osd);
-               if (osd < map->max_osd)
-                       map->osd_state[osd] &= ~CEPH_OSD_UP;
-       }
-
-       /* new_weight */
-       ceph_decode_32_safe(p, end, len, bad);
-       while (len--) {
-               u32 osd, off;
-               ceph_decode_need(p, end, sizeof(u32)*2, bad);
-               osd = ceph_decode_32(p);
-               off = ceph_decode_32(p);
-               pr_info("osd%d weight 0x%x %s\n", osd, off,
-                    off == CEPH_OSD_IN ? "(in)" :
-                    (off == CEPH_OSD_OUT ? "(out)" : ""));
-               if (osd < map->max_osd)
-                       map->osd_weight[osd] = off;
-       }
-
-       /* new_pg_temp */
-       rbp = rb_first(&map->pg_temp);
-       ceph_decode_32_safe(p, end, len, bad);
-       while (len--) {
-               struct ceph_pg_mapping *pg;
-               int j;
-               struct ceph_pg pgid;
-               u32 pglen;
-               ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
-               ceph_decode_copy(p, &pgid, sizeof(pgid));
-               pglen = ceph_decode_32(p);
-
-               /* remove any? */
-               while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping,
-                                               node)->pgid, pgid) <= 0) {
-                       struct ceph_pg_mapping *cur =
-                               rb_entry(rbp, struct ceph_pg_mapping, node);
-
-                       rbp = rb_next(rbp);
-                       dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
-                       rb_erase(&cur->node, &map->pg_temp);
-                       kfree(cur);
-               }
-
-               if (pglen) {
-                       /* insert */
-                       ceph_decode_need(p, end, pglen*sizeof(u32), bad);
-                       pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS);
-                       if (!pg) {
-                               err = -ENOMEM;
-                               goto bad;
-                       }
-                       pg->pgid = pgid;
-                       pg->len = pglen;
-                       for (j = 0; j < pglen; j++)
-                               pg->osds[j] = ceph_decode_32(p);
-                       err = __insert_pg_mapping(pg, &map->pg_temp);
-                       if (err) {
-                               kfree(pg);
-                               goto bad;
-                       }
-                       dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid,
-                            pglen);
-               }
-       }
-       while (rbp) {
-               struct ceph_pg_mapping *cur =
-                       rb_entry(rbp, struct ceph_pg_mapping, node);
-
-               rbp = rb_next(rbp);
-               dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
-               rb_erase(&cur->node, &map->pg_temp);
-               kfree(cur);
-       }
-
-       /* ignore the rest */
-       *p = end;
-       return map;
-
-bad:
-       pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n",
-              epoch, (int)(*p - start), *p, start, end);
-       print_hex_dump(KERN_DEBUG, "osdmap: ",
-                      DUMP_PREFIX_OFFSET, 16, 1,
-                      start, end - start, true);
-       if (newcrush)
-               crush_destroy(newcrush);
-       return ERR_PTR(err);
-}
-
-
-
-
-/*
- * calculate file layout from given offset, length.
- * fill in correct oid, logical length, and object extent
- * offset, length.
- *
- * for now, we write only a single su, until we can
- * pass a stride back to the caller.
- */
-void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
-                                  u64 off, u64 *plen,
-                                  u64 *ono,
-                                  u64 *oxoff, u64 *oxlen)
-{
-       u32 osize = le32_to_cpu(layout->fl_object_size);
-       u32 su = le32_to_cpu(layout->fl_stripe_unit);
-       u32 sc = le32_to_cpu(layout->fl_stripe_count);
-       u32 bl, stripeno, stripepos, objsetno;
-       u32 su_per_object;
-       u64 t, su_offset;
-
-       dout("mapping %llu~%llu  osize %u fl_su %u\n", off, *plen,
-            osize, su);
-       su_per_object = osize / su;
-       dout("osize %u / su %u = su_per_object %u\n", osize, su,
-            su_per_object);
-
-       BUG_ON((su & ~PAGE_MASK) != 0);
-       /* bl = *off / su; */
-       t = off;
-       do_div(t, su);
-       bl = t;
-       dout("off %llu / su %u = bl %u\n", off, su, bl);
-
-       stripeno = bl / sc;
-       stripepos = bl % sc;
-       objsetno = stripeno / su_per_object;
-
-       *ono = objsetno * sc + stripepos;
-       dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned)*ono);
-
-       /* *oxoff = *off % layout->fl_stripe_unit;  # offset in su */
-       t = off;
-       su_offset = do_div(t, su);
-       *oxoff = su_offset + (stripeno % su_per_object) * su;
-
-       /*
-        * Calculate the length of the extent being written to the selected
-        * object. This is the minimum of the full length requested (plen) or
-        * the remainder of the current stripe being written to.
-        */
-       *oxlen = min_t(u64, *plen, su - su_offset);
-       *plen = *oxlen;
-
-       dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
-}
-
-/*
- * calculate an object layout (i.e. pgid) from an oid,
- * file_layout, and osdmap
- */
-int ceph_calc_object_layout(struct ceph_object_layout *ol,
-                           const char *oid,
-                           struct ceph_file_layout *fl,
-                           struct ceph_osdmap *osdmap)
-{
-       unsigned num, num_mask;
-       struct ceph_pg pgid;
-       s32 preferred = (s32)le32_to_cpu(fl->fl_pg_preferred);
-       int poolid = le32_to_cpu(fl->fl_pg_pool);
-       struct ceph_pg_pool_info *pool;
-       unsigned ps;
-
-       BUG_ON(!osdmap);
-
-       pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
-       if (!pool)
-               return -EIO;
-       ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid));
-       if (preferred >= 0) {
-               ps += preferred;
-               num = le32_to_cpu(pool->v.lpg_num);
-               num_mask = pool->lpg_num_mask;
-       } else {
-               num = le32_to_cpu(pool->v.pg_num);
-               num_mask = pool->pg_num_mask;
-       }
-
-       pgid.ps = cpu_to_le16(ps);
-       pgid.preferred = cpu_to_le16(preferred);
-       pgid.pool = fl->fl_pg_pool;
-       if (preferred >= 0)
-               dout("calc_object_layout '%s' pgid %d.%xp%d\n", oid, poolid, ps,
-                    (int)preferred);
-       else
-               dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps);
-
-       ol->ol_pgid = pgid;
-       ol->ol_stripe_unit = fl->fl_object_stripe_unit;
-       return 0;
-}
-
-/*
- * Calculate raw osd vector for the given pgid.  Return pointer to osd
- * array, or NULL on failure.
- */
-static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
-                       int *osds, int *num)
-{
-       struct ceph_pg_mapping *pg;
-       struct ceph_pg_pool_info *pool;
-       int ruleno;
-       unsigned poolid, ps, pps;
-       int preferred;
-
-       /* pg_temp? */
-       pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
-       if (pg) {
-               *num = pg->len;
-               return pg->osds;
-       }
-
-       /* crush */
-       poolid = le32_to_cpu(pgid.pool);
-       ps = le16_to_cpu(pgid.ps);
-       preferred = (s16)le16_to_cpu(pgid.preferred);
-
-       /* don't forcefeed bad device ids to crush */
-       if (preferred >= osdmap->max_osd ||
-           preferred >= osdmap->crush->max_devices)
-               preferred = -1;
-
-       pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
-       if (!pool)
-               return NULL;
-       ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset,
-                                pool->v.type, pool->v.size);
-       if (ruleno < 0) {
-               pr_err("no crush rule pool %d ruleset %d type %d size %d\n",
-                      poolid, pool->v.crush_ruleset, pool->v.type,
-                      pool->v.size);
-               return NULL;
-       }
-
-       if (preferred >= 0)
-               pps = ceph_stable_mod(ps,
-                                     le32_to_cpu(pool->v.lpgp_num),
-                                     pool->lpgp_num_mask);
-       else
-               pps = ceph_stable_mod(ps,
-                                     le32_to_cpu(pool->v.pgp_num),
-                                     pool->pgp_num_mask);
-       pps += poolid;
-       *num = crush_do_rule(osdmap->crush, ruleno, pps, osds,
-                            min_t(int, pool->v.size, *num),
-                            preferred, osdmap->osd_weight);
-       return osds;
-}
-
-/*
- * Return acting set for given pgid.
- */
-int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
-                       int *acting)
-{
-       int rawosds[CEPH_PG_MAX_SIZE], *osds;
-       int i, o, num = CEPH_PG_MAX_SIZE;
-
-       osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
-       if (!osds)
-               return -1;
-
-       /* primary is first up osd */
-       o = 0;
-       for (i = 0; i < num; i++)
-               if (ceph_osd_is_up(osdmap, osds[i]))
-                       acting[o++] = osds[i];
-       return o;
-}
-
-/*
- * Return primary osd for given pgid, or -1 if none.
- */
-int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
-{
-       int rawosds[CEPH_PG_MAX_SIZE], *osds;
-       int i, num = CEPH_PG_MAX_SIZE;
-
-       osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
-       if (!osds)
-               return -1;
-
-       /* primary is first up osd */
-       for (i = 0; i < num; i++)
-               if (ceph_osd_is_up(osdmap, osds[i]))
-                       return osds[i];
-       return -1;
-}
diff --git a/fs/ceph/osdmap.h b/fs/ceph/osdmap.h

deleted file mode 100644 (file)

index 970b547..0000000
--- a/fs/ceph/osdmap.h
+++ /dev/null
@@ -1,128 +0,0 @@
-#ifndef _FS_CEPH_OSDMAP_H
-#define _FS_CEPH_OSDMAP_H
-
-#include <linux/rbtree.h>
-#include "types.h"
-#include "ceph_fs.h"
-#include "crush/crush.h"
-
-/*
- * The osd map describes the current membership of the osd cluster and
- * specifies the mapping of objects to placement groups and placement
- * groups to (sets of) osds.  That is, it completely specifies the
- * (desired) distribution of all data objects in the system at some
- * point in time.
- *
- * Each map version is identified by an epoch, which increases monotonically.
- *
- * The map can be updated either via an incremental map (diff) describing
- * the change between two successive epochs, or as a fully encoded map.
- */
-struct ceph_pg_pool_info {
-       struct rb_node node;
-       int id;
-       struct ceph_pg_pool v;
-       int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask;
-       char *name;
-};
-
-struct ceph_pg_mapping {
-       struct rb_node node;
-       struct ceph_pg pgid;
-       int len;
-       int osds[];
-};
-
-struct ceph_osdmap {
-       struct ceph_fsid fsid;
-       u32 epoch;
-       u32 mkfs_epoch;
-       struct ceph_timespec created, modified;
-
-       u32 flags;         /* CEPH_OSDMAP_* */
-
-       u32 max_osd;       /* size of osd_state, _offload, _addr arrays */
-       u8 *osd_state;     /* CEPH_OSD_* */
-       u32 *osd_weight;   /* 0 = failed, 0x10000 = 100% normal */
-       struct ceph_entity_addr *osd_addr;
-
-       struct rb_root pg_temp;
-       struct rb_root pg_pools;
-       u32 pool_max;
-
-       /* the CRUSH map specifies the mapping of placement groups to
-        * the list of osds that store+replicate them. */
-       struct crush_map *crush;
-};
-
-/*
- * file layout helpers
- */
-#define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit))
-#define ceph_file_layout_stripe_count(l) \
-       ((__s32)le32_to_cpu((l).fl_stripe_count))
-#define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size))
-#define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash))
-#define ceph_file_layout_object_su(l) \
-       ((__s32)le32_to_cpu((l).fl_object_stripe_unit))
-#define ceph_file_layout_pg_preferred(l) \
-       ((__s32)le32_to_cpu((l).fl_pg_preferred))
-#define ceph_file_layout_pg_pool(l) \
-       ((__s32)le32_to_cpu((l).fl_pg_pool))
-
-static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l)
-{
-       return le32_to_cpu(l->fl_stripe_unit) *
-               le32_to_cpu(l->fl_stripe_count);
-}
-
-/* "period" == bytes before i start on a new set of objects */
-static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l)
-{
-       return le32_to_cpu(l->fl_object_size) *
-               le32_to_cpu(l->fl_stripe_count);
-}
-
-
-static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd)
-{
-       return (osd < map->max_osd) && (map->osd_state[osd] & CEPH_OSD_UP);
-}
-
-static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag)
-{
-       return map && (map->flags & flag);
-}
-
-extern char *ceph_osdmap_state_str(char *str, int len, int state);
-
-static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
-                                                    int osd)
-{
-       if (osd >= map->max_osd)
-               return NULL;
-       return &map->osd_addr[osd];
-}
-
-extern struct ceph_osdmap *osdmap_decode(void **p, void *end);
-extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
-                                           struct ceph_osdmap *map,
-                                           struct ceph_messenger *msgr);
-extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
-
-/* calculate mapping of a file extent to an object */
-extern void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
-                                         u64 off, u64 *plen,
-                                         u64 *bno, u64 *oxoff, u64 *oxlen);
-
-/* calculate mapping of object to a placement group */
-extern int ceph_calc_object_layout(struct ceph_object_layout *ol,
-                                  const char *oid,
-                                  struct ceph_file_layout *fl,
-                                  struct ceph_osdmap *osdmap);
-extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
-                              int *acting);
-extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
-                               struct ceph_pg pgid);
-
-#endif
diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c

deleted file mode 100644 (file)

index 46a368b..0000000
--- a/fs/ceph/pagelist.c
+++ /dev/null
@@ -1,63 +0,0 @@
-
-#include <linux/gfp.h>
-#include <linux/pagemap.h>
-#include <linux/highmem.h>
-
-#include "pagelist.h"
-
-static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl)
-{
-       struct page *page = list_entry(pl->head.prev, struct page,
-                                      lru);
-       kunmap(page);
-}
-
-int ceph_pagelist_release(struct ceph_pagelist *pl)
-{
-       if (pl->mapped_tail)
-               ceph_pagelist_unmap_tail(pl);
-
-       while (!list_empty(&pl->head)) {
-               struct page *page = list_first_entry(&pl->head, struct page,
-                                                    lru);
-               list_del(&page->lru);
-               __free_page(page);
-       }
-       return 0;
-}
-
-static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
-{
-       struct page *page = __page_cache_alloc(GFP_NOFS);
-       if (!page)
-               return -ENOMEM;
-       pl->room += PAGE_SIZE;
-       list_add_tail(&page->lru, &pl->head);
-       if (pl->mapped_tail)
-               ceph_pagelist_unmap_tail(pl);
-       pl->mapped_tail = kmap(page);
-       return 0;
-}
-
-int ceph_pagelist_append(struct ceph_pagelist *pl, void *buf, size_t len)
-{
-       while (pl->room < len) {
-               size_t bit = pl->room;
-               int ret;
-
-               memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK),
-                      buf, bit);
-               pl->length += bit;
-               pl->room -= bit;
-               buf += bit;
-               len -= bit;
-               ret = ceph_pagelist_addpage(pl);
-               if (ret)
-                       return ret;
-       }
-
-       memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len);
-       pl->length += len;
-       pl->room -= len;
-       return 0;
-}
diff --git a/fs/ceph/pagelist.h b/fs/ceph/pagelist.h

deleted file mode 100644 (file)

index e8a4187..0000000
--- a/fs/ceph/pagelist.h
+++ /dev/null
@@ -1,54 +0,0 @@
-#ifndef __FS_CEPH_PAGELIST_H
-#define __FS_CEPH_PAGELIST_H
-
-#include <linux/list.h>
-
-struct ceph_pagelist {
-       struct list_head head;
-       void *mapped_tail;
-       size_t length;
-       size_t room;
-};
-
-static inline void ceph_pagelist_init(struct ceph_pagelist *pl)
-{
-       INIT_LIST_HEAD(&pl->head);
-       pl->mapped_tail = NULL;
-       pl->length = 0;
-       pl->room = 0;
-}
-extern int ceph_pagelist_release(struct ceph_pagelist *pl);
-
-extern int ceph_pagelist_append(struct ceph_pagelist *pl, void *d, size_t l);
-
-static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v)
-{
-       __le64 ev = cpu_to_le64(v);
-       return ceph_pagelist_append(pl, &ev, sizeof(ev));
-}
-static inline int ceph_pagelist_encode_32(struct ceph_pagelist *pl, u32 v)
-{
-       __le32 ev = cpu_to_le32(v);
-       return ceph_pagelist_append(pl, &ev, sizeof(ev));
-}
-static inline int ceph_pagelist_encode_16(struct ceph_pagelist *pl, u16 v)
-{
-       __le16 ev = cpu_to_le16(v);
-       return ceph_pagelist_append(pl, &ev, sizeof(ev));
-}
-static inline int ceph_pagelist_encode_8(struct ceph_pagelist *pl, u8 v)
-{
-       return ceph_pagelist_append(pl, &v, 1);
-}
-static inline int ceph_pagelist_encode_string(struct ceph_pagelist *pl,
-                                             char *s, size_t len)
-{
-       int ret = ceph_pagelist_encode_32(pl, len);
-       if (ret)
-               return ret;
-       if (len)
-               return ceph_pagelist_append(pl, s, len);
-       return 0;
-}
-
-#endif
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h

deleted file mode 100644 (file)

index 6d5247f..0000000
--- a/fs/ceph/rados.h
+++ /dev/null
@@ -1,405 +0,0 @@
-#ifndef CEPH_RADOS_H
-#define CEPH_RADOS_H
-
-/*
- * Data types for the Ceph distributed object storage layer RADOS
- * (Reliable Autonomic Distributed Object Store).
- */
-
-#include "msgr.h"
-
-/*
- * osdmap encoding versions
- */
-#define CEPH_OSDMAP_INC_VERSION     5
-#define CEPH_OSDMAP_INC_VERSION_EXT 5
-#define CEPH_OSDMAP_VERSION         5
-#define CEPH_OSDMAP_VERSION_EXT     5
-
-/*
- * fs id
- */
-struct ceph_fsid {
-       unsigned char fsid[16];
-};
-
-static inline int ceph_fsid_compare(const struct ceph_fsid *a,
-                                   const struct ceph_fsid *b)
-{
-       return memcmp(a, b, sizeof(*a));
-}
-
-/*
- * ino, object, etc.
- */
-typedef __le64 ceph_snapid_t;
-#define CEPH_SNAPDIR ((__u64)(-1))  /* reserved for hidden .snap dir */
-#define CEPH_NOSNAP  ((__u64)(-2))  /* "head", "live" revision */
-#define CEPH_MAXSNAP ((__u64)(-3))  /* largest valid snapid */
-
-struct ceph_timespec {
-       __le32 tv_sec;
-       __le32 tv_nsec;
-} __attribute__ ((packed));
-
-
-/*
- * object layout - how objects are mapped into PGs
- */
-#define CEPH_OBJECT_LAYOUT_HASH     1
-#define CEPH_OBJECT_LAYOUT_LINEAR   2
-#define CEPH_OBJECT_LAYOUT_HASHINO  3
-
-/*
- * pg layout -- how PGs are mapped onto (sets of) OSDs
- */
-#define CEPH_PG_LAYOUT_CRUSH  0
-#define CEPH_PG_LAYOUT_HASH   1
-#define CEPH_PG_LAYOUT_LINEAR 2
-#define CEPH_PG_LAYOUT_HYBRID 3
-
-#define CEPH_PG_MAX_SIZE      16  /* max # osds in a single pg */
-
-/*
- * placement group.
- * we encode this into one __le64.
- */
-struct ceph_pg {
-       __le16 preferred; /* preferred primary osd */
-       __le16 ps;        /* placement seed */
-       __le32 pool;      /* object pool */
-} __attribute__ ((packed));
-
-/*
- * pg_pool is a set of pgs storing a pool of objects
- *
- *  pg_num -- base number of pseudorandomly placed pgs
- *
- *  pgp_num -- effective number when calculating pg placement.  this
- * is used for pg_num increases.  new pgs result in data being "split"
- * into new pgs.  for this to proceed smoothly, new pgs are intiially
- * colocated with their parents; that is, pgp_num doesn't increase
- * until the new pgs have successfully split.  only _then_ are the new
- * pgs placed independently.
- *
- *  lpg_num -- localized pg count (per device).  replicas are randomly
- * selected.
- *
- *  lpgp_num -- as above.
- */
-#define CEPH_PG_TYPE_REP     1
-#define CEPH_PG_TYPE_RAID4   2
-#define CEPH_PG_POOL_VERSION 2
-struct ceph_pg_pool {
-       __u8 type;                /* CEPH_PG_TYPE_* */
-       __u8 size;                /* number of osds in each pg */
-       __u8 crush_ruleset;       /* crush placement rule */
-       __u8 object_hash;         /* hash mapping object name to ps */
-       __le32 pg_num, pgp_num;   /* number of pg's */
-       __le32 lpg_num, lpgp_num; /* number of localized pg's */
-       __le32 last_change;       /* most recent epoch changed */
-       __le64 snap_seq;          /* seq for per-pool snapshot */
-       __le32 snap_epoch;        /* epoch of last snap */
-       __le32 num_snaps;
-       __le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */
-       __le64 auid;               /* who owns the pg */
-} __attribute__ ((packed));
-
-/*
- * stable_mod func is used to control number of placement groups.
- * similar to straight-up modulo, but produces a stable mapping as b
- * increases over time.  b is the number of bins, and bmask is the
- * containing power of 2 minus 1.
- *
- * b <= bmask and bmask=(2**n)-1
- * e.g., b=12 -> bmask=15, b=123 -> bmask=127
- */
-static inline int ceph_stable_mod(int x, int b, int bmask)
-{
-       if ((x & bmask) < b)
-               return x & bmask;
-       else
-               return x & (bmask >> 1);
-}
-
-/*
- * object layout - how a given object should be stored.
- */
-struct ceph_object_layout {
-       struct ceph_pg ol_pgid;   /* raw pg, with _full_ ps precision. */
-       __le32 ol_stripe_unit;    /* for per-object parity, if any */
-} __attribute__ ((packed));
-
-/*
- * compound epoch+version, used by storage layer to serialize mutations
- */
-struct ceph_eversion {
-       __le32 epoch;
-       __le64 version;
-} __attribute__ ((packed));
-
-/*
- * osd map bits
- */
-
-/* status bits */
-#define CEPH_OSD_EXISTS 1
-#define CEPH_OSD_UP     2
-
-/* osd weights.  fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */
-#define CEPH_OSD_IN  0x10000
-#define CEPH_OSD_OUT 0
-
-
-/*
- * osd map flag bits
- */
-#define CEPH_OSDMAP_NEARFULL (1<<0)  /* sync writes (near ENOSPC) */
-#define CEPH_OSDMAP_FULL     (1<<1)  /* no data writes (ENOSPC) */
-#define CEPH_OSDMAP_PAUSERD  (1<<2)  /* pause all reads */
-#define CEPH_OSDMAP_PAUSEWR  (1<<3)  /* pause all writes */
-#define CEPH_OSDMAP_PAUSEREC (1<<4)  /* pause recovery */
-
-/*
- * osd ops
- */
-#define CEPH_OSD_OP_MODE       0xf000
-#define CEPH_OSD_OP_MODE_RD    0x1000
-#define CEPH_OSD_OP_MODE_WR    0x2000
-#define CEPH_OSD_OP_MODE_RMW   0x3000
-#define CEPH_OSD_OP_MODE_SUB   0x4000
-
-#define CEPH_OSD_OP_TYPE       0x0f00
-#define CEPH_OSD_OP_TYPE_LOCK  0x0100
-#define CEPH_OSD_OP_TYPE_DATA  0x0200
-#define CEPH_OSD_OP_TYPE_ATTR  0x0300
-#define CEPH_OSD_OP_TYPE_EXEC  0x0400
-#define CEPH_OSD_OP_TYPE_PG    0x0500
-
-enum {
-       /** data **/
-       /* read */
-       CEPH_OSD_OP_READ      = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1,
-       CEPH_OSD_OP_STAT      = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2,
-
-       /* fancy read */
-       CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4,
-
-       /* write */
-       CEPH_OSD_OP_WRITE     = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1,
-       CEPH_OSD_OP_WRITEFULL = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 2,
-       CEPH_OSD_OP_TRUNCATE  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 3,
-       CEPH_OSD_OP_ZERO      = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 4,
-       CEPH_OSD_OP_DELETE    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 5,
-
-       /* fancy write */
-       CEPH_OSD_OP_APPEND    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 6,
-       CEPH_OSD_OP_STARTSYNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 7,
-       CEPH_OSD_OP_SETTRUNC  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 8,
-       CEPH_OSD_OP_TRIMTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 9,
-
-       CEPH_OSD_OP_TMAPUP  = CEPH_OSD_OP_MODE_RMW | CEPH_OSD_OP_TYPE_DATA | 10,
-       CEPH_OSD_OP_TMAPPUT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 11,
-       CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12,
-
-       CEPH_OSD_OP_CREATE  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13,
-       CEPH_OSD_OP_ROLLBACK= CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 14,
-
-       /** attrs **/
-       /* read */
-       CEPH_OSD_OP_GETXATTR  = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
-       CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
-       CEPH_OSD_OP_CMPXATTR  = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 3,
-
-       /* write */
-       CEPH_OSD_OP_SETXATTR  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
-       CEPH_OSD_OP_SETXATTRS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 2,
-       CEPH_OSD_OP_RESETXATTRS = CEPH_OSD_OP_MODE_WR|CEPH_OSD_OP_TYPE_ATTR | 3,
-       CEPH_OSD_OP_RMXATTR   = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4,
-
-       /** subop **/
-       CEPH_OSD_OP_PULL           = CEPH_OSD_OP_MODE_SUB | 1,
-       CEPH_OSD_OP_PUSH           = CEPH_OSD_OP_MODE_SUB | 2,
-       CEPH_OSD_OP_BALANCEREADS   = CEPH_OSD_OP_MODE_SUB | 3,
-       CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4,
-       CEPH_OSD_OP_SCRUB          = CEPH_OSD_OP_MODE_SUB | 5,
-
-       /** lock **/
-       CEPH_OSD_OP_WRLOCK    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1,
-       CEPH_OSD_OP_WRUNLOCK  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 2,
-       CEPH_OSD_OP_RDLOCK    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 3,
-       CEPH_OSD_OP_RDUNLOCK  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 4,
-       CEPH_OSD_OP_UPLOCK    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 5,
-       CEPH_OSD_OP_DNLOCK    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6,
-
-       /** exec **/
-       CEPH_OSD_OP_CALL    = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1,
-
-       /** pg **/
-       CEPH_OSD_OP_PGLS      = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1,
-};
-
-static inline int ceph_osd_op_type_lock(int op)
-{
-       return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_LOCK;
-}
-static inline int ceph_osd_op_type_data(int op)
-{
-       return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_DATA;
-}
-static inline int ceph_osd_op_type_attr(int op)
-{
-       return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_ATTR;
-}
-static inline int ceph_osd_op_type_exec(int op)
-{
-       return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_EXEC;
-}
-static inline int ceph_osd_op_type_pg(int op)
-{
-       return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG;
-}
-
-static inline int ceph_osd_op_mode_subop(int op)
-{
-       return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_SUB;
-}
-static inline int ceph_osd_op_mode_read(int op)
-{
-       return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_RD;
-}
-static inline int ceph_osd_op_mode_modify(int op)
-{
-       return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR;
-}
-
-/*
- * note that the following tmap stuff is also defined in the ceph librados.h
- * any modification here needs to be updated there
- */
-#define CEPH_OSD_TMAP_HDR 'h'
-#define CEPH_OSD_TMAP_SET 's'
-#define CEPH_OSD_TMAP_RM  'r'
-
-extern const char *ceph_osd_op_name(int op);
-
-
-/*
- * osd op flags
- *
- * An op may be READ, WRITE, or READ|WRITE.
- */
-enum {
-       CEPH_OSD_FLAG_ACK = 1,          /* want (or is) "ack" ack */
-       CEPH_OSD_FLAG_ONNVRAM = 2,      /* want (or is) "onnvram" ack */
-       CEPH_OSD_FLAG_ONDISK = 4,       /* want (or is) "ondisk" ack */
-       CEPH_OSD_FLAG_RETRY = 8,        /* resend attempt */
-       CEPH_OSD_FLAG_READ = 16,        /* op may read */
-       CEPH_OSD_FLAG_WRITE = 32,       /* op may write */
-       CEPH_OSD_FLAG_ORDERSNAP = 64,   /* EOLDSNAP if snapc is out of order */
-       CEPH_OSD_FLAG_PEERSTAT = 128,   /* msg includes osd_peer_stat */
-       CEPH_OSD_FLAG_BALANCE_READS = 256,
-       CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */
-       CEPH_OSD_FLAG_PGOP = 1024,      /* pg op, no object */
-       CEPH_OSD_FLAG_EXEC = 2048,      /* op may exec */
-       CEPH_OSD_FLAG_EXEC_PUBLIC = 4096, /* op may exec (public) */
-};
-
-enum {
-       CEPH_OSD_OP_FLAG_EXCL = 1,      /* EXCL object create */
-};
-
-#define EOLDSNAPC    ERESTART  /* ORDERSNAP flag set; writer has old snapc*/
-#define EBLACKLISTED ESHUTDOWN /* blacklisted */
-
-/* xattr comparison */
-enum {
-       CEPH_OSD_CMPXATTR_OP_NOP = 0,
-       CEPH_OSD_CMPXATTR_OP_EQ  = 1,
-       CEPH_OSD_CMPXATTR_OP_NE  = 2,
-       CEPH_OSD_CMPXATTR_OP_GT  = 3,
-       CEPH_OSD_CMPXATTR_OP_GTE = 4,
-       CEPH_OSD_CMPXATTR_OP_LT  = 5,
-       CEPH_OSD_CMPXATTR_OP_LTE = 6
-};
-
-enum {
-       CEPH_OSD_CMPXATTR_MODE_STRING = 1,
-       CEPH_OSD_CMPXATTR_MODE_U64    = 2
-};
-
-/*
- * an individual object operation.  each may be accompanied by some data
- * payload
- */
-struct ceph_osd_op {
-       __le16 op;           /* CEPH_OSD_OP_* */
-       __le32 flags;        /* CEPH_OSD_FLAG_* */
-       union {
-               struct {
-                       __le64 offset, length;
-                       __le64 truncate_size;
-                       __le32 truncate_seq;
-               } __attribute__ ((packed)) extent;
-               struct {
-                       __le32 name_len;
-                       __le32 value_len;
-                       __u8 cmp_op;       /* CEPH_OSD_CMPXATTR_OP_* */
-                       __u8 cmp_mode;     /* CEPH_OSD_CMPXATTR_MODE_* */
-               } __attribute__ ((packed)) xattr;
-               struct {
-                       __u8 class_len;
-                       __u8 method_len;
-                       __u8 argc;
-                       __le32 indata_len;
-               } __attribute__ ((packed)) cls;
-               struct {
-                       __le64 cookie, count;
-               } __attribute__ ((packed)) pgls;
-               struct {
-                       __le64 snapid;
-               } __attribute__ ((packed)) snap;
-       };
-       __le32 payload_len;
-} __attribute__ ((packed));
-
-/*
- * osd request message header.  each request may include multiple
- * ceph_osd_op object operations.
- */
-struct ceph_osd_request_head {
-       __le32 client_inc;                 /* client incarnation */
-       struct ceph_object_layout layout;  /* pgid */
-       __le32 osdmap_epoch;               /* client's osdmap epoch */
-
-       __le32 flags;
-
-       struct ceph_timespec mtime;        /* for mutations only */
-       struct ceph_eversion reassert_version; /* if we are replaying op */
-
-       __le32 object_len;     /* length of object name */
-
-       __le64 snapid;         /* snapid to read */
-       __le64 snap_seq;       /* writer's snap context */
-       __le32 num_snaps;
-
-       __le16 num_ops;
-       struct ceph_osd_op ops[];  /* followed by ops[], obj, ticket, snaps */
-} __attribute__ ((packed));
-
-struct ceph_osd_reply_head {
-       __le32 client_inc;                /* client incarnation */
-       __le32 flags;
-       struct ceph_object_layout layout;
-       __le32 osdmap_epoch;
-       struct ceph_eversion reassert_version; /* for replaying uncommitted */
-
-       __le32 result;                    /* result code */
-
-       __le32 object_len;                /* length of object name */
-       __le32 num_ops;
-       struct ceph_osd_op ops[0];  /* ops[], object */
-} __attribute__ ((packed));
-
-
-#endif
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c

index 190b6c4..39c243a 100644 (file)
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -1,10 +1,12 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
  
  #include <linux/sort.h>
  #include <linux/slab.h>
  
  #include "super.h"
-#include "decode.h"
+#include "mds_client.h"
+
+#include <linux/ceph/decode.h>
  
  /*
   * Snapshots in ceph are driven in large part by cooperation from the
@@ -526,7 +528,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
                             struct ceph_cap_snap *capsnap)
  {
         struct inode *inode = &ci->vfs_inode;
-       struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+       struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
  
         BUG_ON(capsnap->writing);
         capsnap->size = inode->i_size;
@@ -747,7 +749,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
                       struct ceph_mds_session *session,
                       struct ceph_msg *msg)
  {
-       struct super_block *sb = mdsc->client->sb;
+       struct super_block *sb = mdsc->fsc->sb;
         int mds = session->s_mds;
         u64 split;
         int op;
diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c

new file mode 100644 (file)

index 0000000..cd5097d
--- /dev/null
+++ b/fs/ceph/strings.c
@@ -0,0 +1,117 @@
+/*
+ * Ceph fs string constants
+ */
+#include <linux/module.h>
+#include <linux/ceph/types.h>
+
+
+const char *ceph_mds_state_name(int s)
+{
+       switch (s) {
+               /* down and out */
+       case CEPH_MDS_STATE_DNE:        return "down:dne";
+       case CEPH_MDS_STATE_STOPPED:    return "down:stopped";
+               /* up and out */
+       case CEPH_MDS_STATE_BOOT:       return "up:boot";
+       case CEPH_MDS_STATE_STANDBY:    return "up:standby";
+       case CEPH_MDS_STATE_STANDBY_REPLAY:    return "up:standby-replay";
+       case CEPH_MDS_STATE_CREATING:   return "up:creating";
+       case CEPH_MDS_STATE_STARTING:   return "up:starting";
+               /* up and in */
+       case CEPH_MDS_STATE_REPLAY:     return "up:replay";
+       case CEPH_MDS_STATE_RESOLVE:    return "up:resolve";
+       case CEPH_MDS_STATE_RECONNECT:  return "up:reconnect";
+       case CEPH_MDS_STATE_REJOIN:     return "up:rejoin";
+       case CEPH_MDS_STATE_CLIENTREPLAY: return "up:clientreplay";
+       case CEPH_MDS_STATE_ACTIVE:     return "up:active";
+       case CEPH_MDS_STATE_STOPPING:   return "up:stopping";
+       }
+       return "???";
+}
+
+const char *ceph_session_op_name(int op)
+{
+       switch (op) {
+       case CEPH_SESSION_REQUEST_OPEN: return "request_open";
+       case CEPH_SESSION_OPEN: return "open";
+       case CEPH_SESSION_REQUEST_CLOSE: return "request_close";
+       case CEPH_SESSION_CLOSE: return "close";
+       case CEPH_SESSION_REQUEST_RENEWCAPS: return "request_renewcaps";
+       case CEPH_SESSION_RENEWCAPS: return "renewcaps";
+       case CEPH_SESSION_STALE: return "stale";
+       case CEPH_SESSION_RECALL_STATE: return "recall_state";
+       }
+       return "???";
+}
+
+const char *ceph_mds_op_name(int op)
+{
+       switch (op) {
+       case CEPH_MDS_OP_LOOKUP:  return "lookup";
+       case CEPH_MDS_OP_LOOKUPHASH:  return "lookuphash";
+       case CEPH_MDS_OP_LOOKUPPARENT:  return "lookupparent";
+       case CEPH_MDS_OP_GETATTR:  return "getattr";
+       case CEPH_MDS_OP_SETXATTR: return "setxattr";
+       case CEPH_MDS_OP_SETATTR: return "setattr";
+       case CEPH_MDS_OP_RMXATTR: return "rmxattr";
+       case CEPH_MDS_OP_READDIR: return "readdir";
+       case CEPH_MDS_OP_MKNOD: return "mknod";
+       case CEPH_MDS_OP_LINK: return "link";
+       case CEPH_MDS_OP_UNLINK: return "unlink";
+       case CEPH_MDS_OP_RENAME: return "rename";
+       case CEPH_MDS_OP_MKDIR: return "mkdir";
+       case CEPH_MDS_OP_RMDIR: return "rmdir";
+       case CEPH_MDS_OP_SYMLINK: return "symlink";
+       case CEPH_MDS_OP_CREATE: return "create";
+       case CEPH_MDS_OP_OPEN: return "open";
+       case CEPH_MDS_OP_LOOKUPSNAP: return "lookupsnap";
+       case CEPH_MDS_OP_LSSNAP: return "lssnap";
+       case CEPH_MDS_OP_MKSNAP: return "mksnap";
+       case CEPH_MDS_OP_RMSNAP: return "rmsnap";
+       case CEPH_MDS_OP_SETFILELOCK: return "setfilelock";
+       case CEPH_MDS_OP_GETFILELOCK: return "getfilelock";
+       }
+       return "???";
+}
+
+const char *ceph_cap_op_name(int op)
+{
+       switch (op) {
+       case CEPH_CAP_OP_GRANT: return "grant";
+       case CEPH_CAP_OP_REVOKE: return "revoke";
+       case CEPH_CAP_OP_TRUNC: return "trunc";
+       case CEPH_CAP_OP_EXPORT: return "export";
+       case CEPH_CAP_OP_IMPORT: return "import";
+       case CEPH_CAP_OP_UPDATE: return "update";
+       case CEPH_CAP_OP_DROP: return "drop";
+       case CEPH_CAP_OP_FLUSH: return "flush";
+       case CEPH_CAP_OP_FLUSH_ACK: return "flush_ack";
+       case CEPH_CAP_OP_FLUSHSNAP: return "flushsnap";
+       case CEPH_CAP_OP_FLUSHSNAP_ACK: return "flushsnap_ack";
+       case CEPH_CAP_OP_RELEASE: return "release";
+       case CEPH_CAP_OP_RENEW: return "renew";
+       }
+       return "???";
+}
+
+const char *ceph_lease_op_name(int o)
+{
+       switch (o) {
+       case CEPH_MDS_LEASE_REVOKE: return "revoke";
+       case CEPH_MDS_LEASE_RELEASE: return "release";
+       case CEPH_MDS_LEASE_RENEW: return "renew";
+       case CEPH_MDS_LEASE_REVOKE_ACK: return "revoke_ack";
+       }
+       return "???";
+}
+
+const char *ceph_snap_op_name(int o)
+{
+       switch (o) {
+       case CEPH_SNAP_OP_UPDATE: return "update";
+       case CEPH_SNAP_OP_CREATE: return "create";
+       case CEPH_SNAP_OP_DESTROY: return "destroy";
+       case CEPH_SNAP_OP_SPLIT: return "split";
+       }
+       return "???";
+}
diff --git a/fs/ceph/super.c b/fs/ceph/super.c

index 9922628..d6e0e04 100644 (file)
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -1,5 +1,5 @@
  
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
  
  #include <linux/backing-dev.h>
  #include <linux/ctype.h>
@@ -15,10 +15,13 @@
  #include <linux/statfs.h>
  #include <linux/string.h>
  
-#include "decode.h"
  #include "super.h"
-#include "mon_client.h"
-#include "auth.h"
+#include "mds_client.h"
+
+#include <linux/ceph/decode.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/debugfs.h>
  
  /*
   * Ceph superblock operations
@@ -26,36 +29,22 @@
   * Handle the basics of mounting, unmounting.
   */
  
-
-/*
- * find filename portion of a path (/foo/bar/baz -> baz)
- */
-const char *ceph_file_part(const char *s, int len)
-{
-       const char *e = s + len;
-
-       while (e != s && *(e-1) != '/')
-               e--;
-       return e;
-}
-
-
  /*
   * super ops
   */
  static void ceph_put_super(struct super_block *s)
  {
-       struct ceph_client *client = ceph_sb_to_client(s);
+       struct ceph_fs_client *fsc = ceph_sb_to_client(s);
  
         dout("put_super\n");
-       ceph_mdsc_close_sessions(&client->mdsc);
+       ceph_mdsc_close_sessions(fsc->mdsc);
  
         /*
          * ensure we release the bdi before put_anon_super releases
          * the device name.
          */
-       if (s->s_bdi == &client->backing_dev_info) {
-               bdi_unregister(&client->backing_dev_info);
+       if (s->s_bdi == &fsc->backing_dev_info) {
+               bdi_unregister(&fsc->backing_dev_info);
                 s->s_bdi = NULL;
         }
  
@@ -64,14 +53,14 @@ static void ceph_put_super(struct super_block *s)
  
  static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
  {
-       struct ceph_client *client = ceph_inode_to_client(dentry->d_inode);
-       struct ceph_monmap *monmap = client->monc.monmap;
+       struct ceph_fs_client *fsc = ceph_inode_to_client(dentry->d_inode);
+       struct ceph_monmap *monmap = fsc->client->monc.monmap;
         struct ceph_statfs st;
         u64 fsid;
         int err;
  
         dout("statfs\n");
-       err = ceph_monc_do_statfs(&client->monc, &st);
+       err = ceph_monc_do_statfs(&fsc->client->monc, &st);
         if (err < 0)
                 return err;
  
@@ -104,238 +93,28 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
  
  static int ceph_sync_fs(struct super_block *sb, int wait)
  {
-       struct ceph_client *client = ceph_sb_to_client(sb);
+       struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
  
         if (!wait) {
                 dout("sync_fs (non-blocking)\n");
-               ceph_flush_dirty_caps(&client->mdsc);
+               ceph_flush_dirty_caps(fsc->mdsc);
                 dout("sync_fs (non-blocking) done\n");
                 return 0;
         }
  
         dout("sync_fs (blocking)\n");
-       ceph_osdc_sync(&ceph_sb_to_client(sb)->osdc);
-       ceph_mdsc_sync(&ceph_sb_to_client(sb)->mdsc);
+       ceph_osdc_sync(&fsc->client->osdc);
+       ceph_mdsc_sync(fsc->mdsc);
         dout("sync_fs (blocking) done\n");
         return 0;
  }
  
-static int default_congestion_kb(void)
-{
-       int congestion_kb;
-
-       /*
-        * Copied from NFS
-        *
-        * congestion size, scale with available memory.
-        *
-        *  64MB:    8192k
-        * 128MB:   11585k
-        * 256MB:   16384k
-        * 512MB:   23170k
-        *   1GB:   32768k
-        *   2GB:   46340k
-        *   4GB:   65536k
-        *   8GB:   92681k
-        *  16GB:  131072k
-        *
-        * This allows larger machines to have larger/more transfers.
-        * Limit the default to 256M
-        */
-       congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
-       if (congestion_kb > 256*1024)
-               congestion_kb = 256*1024;
-
-       return congestion_kb;
-}
-
-/**
- * ceph_show_options - Show mount options in /proc/mounts
- * @m: seq_file to write to
- * @mnt: mount descriptor
- */
-static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
-{
-       struct ceph_client *client = ceph_sb_to_client(mnt->mnt_sb);
-       struct ceph_mount_args *args = client->mount_args;
-
-       if (args->flags & CEPH_OPT_FSID)
-               seq_printf(m, ",fsid=%pU", &args->fsid);
-       if (args->flags & CEPH_OPT_NOSHARE)
-               seq_puts(m, ",noshare");
-       if (args->flags & CEPH_OPT_DIRSTAT)
-               seq_puts(m, ",dirstat");
-       if ((args->flags & CEPH_OPT_RBYTES) == 0)
-               seq_puts(m, ",norbytes");
-       if (args->flags & CEPH_OPT_NOCRC)
-               seq_puts(m, ",nocrc");
-       if (args->flags & CEPH_OPT_NOASYNCREADDIR)
-               seq_puts(m, ",noasyncreaddir");
-
-       if (args->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
-               seq_printf(m, ",mount_timeout=%d", args->mount_timeout);
-       if (args->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
-               seq_printf(m, ",osd_idle_ttl=%d", args->osd_idle_ttl);
-       if (args->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
-               seq_printf(m, ",osdtimeout=%d", args->osd_timeout);
-       if (args->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
-               seq_printf(m, ",osdkeepalivetimeout=%d",
-                        args->osd_keepalive_timeout);
-       if (args->wsize)
-               seq_printf(m, ",wsize=%d", args->wsize);
-       if (args->rsize != CEPH_MOUNT_RSIZE_DEFAULT)
-               seq_printf(m, ",rsize=%d", args->rsize);
-       if (args->congestion_kb != default_congestion_kb())
-               seq_printf(m, ",write_congestion_kb=%d", args->congestion_kb);
-       if (args->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
-               seq_printf(m, ",caps_wanted_delay_min=%d",
-                        args->caps_wanted_delay_min);
-       if (args->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
-               seq_printf(m, ",caps_wanted_delay_max=%d",
-                          args->caps_wanted_delay_max);
-       if (args->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
-               seq_printf(m, ",cap_release_safety=%d",
-                          args->cap_release_safety);
-       if (args->max_readdir != CEPH_MAX_READDIR_DEFAULT)
-               seq_printf(m, ",readdir_max_entries=%d", args->max_readdir);
-       if (args->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
-               seq_printf(m, ",readdir_max_bytes=%d", args->max_readdir_bytes);
-       if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
-               seq_printf(m, ",snapdirname=%s", args->snapdir_name);
-       if (args->name)
-               seq_printf(m, ",name=%s", args->name);
-       if (args->secret)
-               seq_puts(m, ",secret=<hidden>");
-       return 0;
-}
-
-/*
- * caches
- */
-struct kmem_cache *ceph_inode_cachep;
-struct kmem_cache *ceph_cap_cachep;
-struct kmem_cache *ceph_dentry_cachep;
-struct kmem_cache *ceph_file_cachep;
-
-static void ceph_inode_init_once(void *foo)
-{
-       struct ceph_inode_info *ci = foo;
-       inode_init_once(&ci->vfs_inode);
-}
-
-static int __init init_caches(void)
-{
-       ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
-                                     sizeof(struct ceph_inode_info),
-                                     __alignof__(struct ceph_inode_info),
-                                     (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
-                                     ceph_inode_init_once);
-       if (ceph_inode_cachep == NULL)
-               return -ENOMEM;
-
-       ceph_cap_cachep = KMEM_CACHE(ceph_cap,
-                                    SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
-       if (ceph_cap_cachep == NULL)
-               goto bad_cap;
-
-       ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
-                                       SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
-       if (ceph_dentry_cachep == NULL)
-               goto bad_dentry;
-
-       ceph_file_cachep = KMEM_CACHE(ceph_file_info,
-                                     SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
-       if (ceph_file_cachep == NULL)
-               goto bad_file;
-
-       return 0;
-
-bad_file:
-       kmem_cache_destroy(ceph_dentry_cachep);
-bad_dentry:
-       kmem_cache_destroy(ceph_cap_cachep);
-bad_cap:
-       kmem_cache_destroy(ceph_inode_cachep);
-       return -ENOMEM;
-}
-
-static void destroy_caches(void)
-{
-       kmem_cache_destroy(ceph_inode_cachep);
-       kmem_cache_destroy(ceph_cap_cachep);
-       kmem_cache_destroy(ceph_dentry_cachep);
-       kmem_cache_destroy(ceph_file_cachep);
-}
-
-
-/*
- * ceph_umount_begin - initiate forced umount.  Tear down down the
- * mount, skipping steps that may hang while waiting for server(s).
- */
-static void ceph_umount_begin(struct super_block *sb)
-{
-       struct ceph_client *client = ceph_sb_to_client(sb);
-
-       dout("ceph_umount_begin - starting forced umount\n");
-       if (!client)
-               return;
-       client->mount_state = CEPH_MOUNT_SHUTDOWN;
-       return;
-}
-
-static const struct super_operations ceph_super_ops = {
-       .alloc_inode    = ceph_alloc_inode,
-       .destroy_inode  = ceph_destroy_inode,
-       .write_inode    = ceph_write_inode,
-       .sync_fs        = ceph_sync_fs,
-       .put_super      = ceph_put_super,
-       .show_options   = ceph_show_options,
-       .statfs         = ceph_statfs,
-       .umount_begin   = ceph_umount_begin,
-};
-
-
-const char *ceph_msg_type_name(int type)
-{
-       switch (type) {
-       case CEPH_MSG_SHUTDOWN: return "shutdown";
-       case CEPH_MSG_PING: return "ping";
-       case CEPH_MSG_AUTH: return "auth";
-       case CEPH_MSG_AUTH_REPLY: return "auth_reply";
-       case CEPH_MSG_MON_MAP: return "mon_map";
-       case CEPH_MSG_MON_GET_MAP: return "mon_get_map";
-       case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe";
-       case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack";
-       case CEPH_MSG_STATFS: return "statfs";
-       case CEPH_MSG_STATFS_REPLY: return "statfs_reply";
-       case CEPH_MSG_MDS_MAP: return "mds_map";
-       case CEPH_MSG_CLIENT_SESSION: return "client_session";
-       case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect";
-       case CEPH_MSG_CLIENT_REQUEST: return "client_request";
-       case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward";
-       case CEPH_MSG_CLIENT_REPLY: return "client_reply";
-       case CEPH_MSG_CLIENT_CAPS: return "client_caps";
-       case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release";
-       case CEPH_MSG_CLIENT_SNAP: return "client_snap";
-       case CEPH_MSG_CLIENT_LEASE: return "client_lease";
-       case CEPH_MSG_OSD_MAP: return "osd_map";
-       case CEPH_MSG_OSD_OP: return "osd_op";
-       case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
-       default: return "unknown";
-       }
-}
-
-
  /*
   * mount options
   */
  enum {
         Opt_wsize,
         Opt_rsize,
-       Opt_osdtimeout,
-       Opt_osdkeepalivetimeout,
-       Opt_mount_timeout,
-       Opt_osd_idle_ttl,
         Opt_caps_wanted_delay_min,
         Opt_caps_wanted_delay_max,
         Opt_cap_release_safety,
@@ -344,29 +123,19 @@ enum {
         Opt_congestion_kb,
         Opt_last_int,
         /* int args above */
-       Opt_fsid,
         Opt_snapdirname,
-       Opt_name,
-       Opt_secret,
         Opt_last_string,
         /* string args above */
-       Opt_ip,
-       Opt_noshare,
         Opt_dirstat,
         Opt_nodirstat,
         Opt_rbytes,
         Opt_norbytes,
-       Opt_nocrc,
         Opt_noasyncreaddir,
  };
  
-static match_table_t arg_tokens = {
+static match_table_t fsopt_tokens = {
         {Opt_wsize, "wsize=%d"},
         {Opt_rsize, "rsize=%d"},
-       {Opt_osdtimeout, "osdtimeout=%d"},
-       {Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
-       {Opt_mount_timeout, "mount_timeout=%d"},
-       {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
         {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
         {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
         {Opt_cap_release_safety, "cap_release_safety=%d"},
@@ -374,403 +143,459 @@ static match_table_t arg_tokens = {
         {Opt_readdir_max_bytes, "readdir_max_bytes=%d"},
         {Opt_congestion_kb, "write_congestion_kb=%d"},
         /* int args above */
-       {Opt_fsid, "fsid=%s"},
         {Opt_snapdirname, "snapdirname=%s"},
-       {Opt_name, "name=%s"},
-       {Opt_secret, "secret=%s"},
         /* string args above */
-       {Opt_ip, "ip=%s"},
-       {Opt_noshare, "noshare"},
         {Opt_dirstat, "dirstat"},
         {Opt_nodirstat, "nodirstat"},
         {Opt_rbytes, "rbytes"},
         {Opt_norbytes, "norbytes"},
-       {Opt_nocrc, "nocrc"},
         {Opt_noasyncreaddir, "noasyncreaddir"},
         {-1, NULL}
  };
  
-static int parse_fsid(const char *str, struct ceph_fsid *fsid)
+static int parse_fsopt_token(char *c, void *private)
  {
-       int i = 0;
-       char tmp[3];
-       int err = -EINVAL;
-       int d;
-
-       dout("parse_fsid '%s'\n", str);
-       tmp[2] = 0;
-       while (*str && i < 16) {
-               if (ispunct(*str)) {
-                       str++;
-                       continue;
+       struct ceph_mount_options *fsopt = private;
+       substring_t argstr[MAX_OPT_ARGS];
+       int token, intval, ret;
+
+       token = match_token((char *)c, fsopt_tokens, argstr);
+       if (token < 0)
+               return -EINVAL;
+
+       if (token < Opt_last_int) {
+               ret = match_int(&argstr[0], &intval);
+               if (ret < 0) {
+                       pr_err("bad mount option arg (not int) "
+                              "at '%s'\n", c);
+                       return ret;
                 }
-               if (!isxdigit(str[0]) || !isxdigit(str[1]))
-                       break;
-               tmp[0] = str[0];
-               tmp[1] = str[1];
-               if (sscanf(tmp, "%x", &d) < 1)
-                       break;
-               fsid->fsid[i] = d & 0xff;
-               i++;
-               str += 2;
+               dout("got int token %d val %d\n", token, intval);
+       } else if (token > Opt_last_int && token < Opt_last_string) {
+               dout("got string token %d val %s\n", token,
+                    argstr[0].from);
+       } else {
+               dout("got token %d\n", token);
         }
  
-       if (i == 16)
-               err = 0;
-       dout("parse_fsid ret %d got fsid %pU", err, fsid);
-       return err;
+       switch (token) {
+       case Opt_snapdirname:
+               kfree(fsopt->snapdir_name);
+               fsopt->snapdir_name = kstrndup(argstr[0].from,
+                                              argstr[0].to-argstr[0].from,
+                                              GFP_KERNEL);
+               if (!fsopt->snapdir_name)
+                       return -ENOMEM;
+               break;
+
+               /* misc */
+       case Opt_wsize:
+               fsopt->wsize = intval;
+               break;
+       case Opt_rsize:
+               fsopt->rsize = intval;
+               break;
+       case Opt_caps_wanted_delay_min:
+               fsopt->caps_wanted_delay_min = intval;
+               break;
+       case Opt_caps_wanted_delay_max:
+               fsopt->caps_wanted_delay_max = intval;
+               break;
+       case Opt_readdir_max_entries:
+               fsopt->max_readdir = intval;
+               break;
+       case Opt_readdir_max_bytes:
+               fsopt->max_readdir_bytes = intval;
+               break;
+       case Opt_congestion_kb:
+               fsopt->congestion_kb = intval;
+               break;
+       case Opt_dirstat:
+               fsopt->flags |= CEPH_MOUNT_OPT_DIRSTAT;
+               break;
+       case Opt_nodirstat:
+               fsopt->flags &= ~CEPH_MOUNT_OPT_DIRSTAT;
+               break;
+       case Opt_rbytes:
+               fsopt->flags |= CEPH_MOUNT_OPT_RBYTES;
+               break;
+       case Opt_norbytes:
+               fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES;
+               break;
+       case Opt_noasyncreaddir:
+               fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR;
+               break;
+       default:
+               BUG_ON(token);
+       }
+       return 0;
  }
  
-static struct ceph_mount_args *parse_mount_args(int flags, char *options,
-                                               const char *dev_name,
-                                               const char **path)
+static void destroy_mount_options(struct ceph_mount_options *args)
  {
-       struct ceph_mount_args *args;
-       const char *c;
-       int err = -ENOMEM;
-       substring_t argstr[MAX_OPT_ARGS];
+       dout("destroy_mount_options %p\n", args);
+       kfree(args->snapdir_name);
+       kfree(args);
+}
  
-       args = kzalloc(sizeof(*args), GFP_KERNEL);
-       if (!args)
-               return ERR_PTR(-ENOMEM);
-       args->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*args->mon_addr),
-                                GFP_KERNEL);
-       if (!args->mon_addr)
-               goto out;
+static int strcmp_null(const char *s1, const char *s2)
+{
+       if (!s1 && !s2)
+               return 0;
+       if (s1 && !s2)
+               return -1;
+       if (!s1 && s2)
+               return 1;
+       return strcmp(s1, s2);
+}
  
-       dout("parse_mount_args %p, dev_name '%s'\n", args, dev_name);
-
-       /* start with defaults */
-       args->sb_flags = flags;
-       args->flags = CEPH_OPT_DEFAULT;
-       args->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT;
-       args->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
-       args->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
-       args->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;   /* seconds */
-       args->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
-       args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
-       args->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
-       args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
-       args->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
-       args->max_readdir = CEPH_MAX_READDIR_DEFAULT;
-       args->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
-       args->congestion_kb = default_congestion_kb();
-
-       /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
-       err = -EINVAL;
-       if (!dev_name)
-               goto out;
-       *path = strstr(dev_name, ":/");
-       if (*path == NULL) {
-               pr_err("device name is missing path (no :/ in %s)\n",
-                      dev_name);
-               goto out;
-       }
+static int compare_mount_options(struct ceph_mount_options *new_fsopt,
+                                struct ceph_options *new_opt,
+                                struct ceph_fs_client *fsc)
+{
+       struct ceph_mount_options *fsopt1 = new_fsopt;
+       struct ceph_mount_options *fsopt2 = fsc->mount_options;
+       int ofs = offsetof(struct ceph_mount_options, snapdir_name);
+       int ret;
  
-       /* get mon ip(s) */
-       err = ceph_parse_ips(dev_name, *path, args->mon_addr,
-                            CEPH_MAX_MON, &args->num_mon);
-       if (err < 0)
-               goto out;
+       ret = memcmp(fsopt1, fsopt2, ofs);
+       if (ret)
+               return ret;
+
+       ret = strcmp_null(fsopt1->snapdir_name, fsopt2->snapdir_name);
+       if (ret)
+               return ret;
+
+       return ceph_compare_options(new_opt, fsc->client);
+}
+
+static int parse_mount_options(struct ceph_mount_options **pfsopt,
+                              struct ceph_options **popt,
+                              int flags, char *options,
+                              const char *dev_name,
+                              const char **path)
+{
+       struct ceph_mount_options *fsopt;
+       const char *dev_name_end;
+       int err = -ENOMEM;
+
+       fsopt = kzalloc(sizeof(*fsopt), GFP_KERNEL);
+       if (!fsopt)
+               return -ENOMEM;
+
+       dout("parse_mount_options %p, dev_name '%s'\n", fsopt, dev_name);
+
+        fsopt->sb_flags = flags;
+        fsopt->flags = CEPH_MOUNT_OPT_DEFAULT;
+
+        fsopt->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
+        fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
+        fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
+        fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
+        fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
+        fsopt->congestion_kb = default_congestion_kb();
+       
+        /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
+        err = -EINVAL;
+        if (!dev_name)
+                goto out;
+        *path = strstr(dev_name, ":/");
+        if (*path == NULL) {
+                pr_err("device name is missing path (no :/ in %s)\n",
+                       dev_name);
+                goto out;
+        }
+       dev_name_end = *path;
+       dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name);
  
         /* path on server */
         *path += 2;
         dout("server path '%s'\n", *path);
  
-       /* parse mount options */
-       while ((c = strsep(&options, ",")) != NULL) {
-               int token, intval, ret;
-               if (!*c)
-                       continue;
-               err = -EINVAL;
-               token = match_token((char *)c, arg_tokens, argstr);
-               if (token < 0) {
-                       pr_err("bad mount option at '%s'\n", c);
-                       goto out;
-               }
-               if (token < Opt_last_int) {
-                       ret = match_int(&argstr[0], &intval);
-                       if (ret < 0) {
-                               pr_err("bad mount option arg (not int) "
-                                      "at '%s'\n", c);
-                               continue;
-                       }
-                       dout("got int token %d val %d\n", token, intval);
-               } else if (token > Opt_last_int && token < Opt_last_string) {
-                       dout("got string token %d val %s\n", token,
-                            argstr[0].from);
-               } else {
-                       dout("got token %d\n", token);
-               }
-               switch (token) {
-               case Opt_ip:
-                       err = ceph_parse_ips(argstr[0].from,
-                                            argstr[0].to,
-                                            &args->my_addr,
-                                            1, NULL);
-                       if (err < 0)
-                               goto out;
-                       args->flags |= CEPH_OPT_MYIP;
-                       break;
-
-               case Opt_fsid:
-                       err = parse_fsid(argstr[0].from, &args->fsid);
-                       if (err == 0)
-                               args->flags |= CEPH_OPT_FSID;
-                       break;
-               case Opt_snapdirname:
-                       kfree(args->snapdir_name);
-                       args->snapdir_name = kstrndup(argstr[0].from,
-                                             argstr[0].to-argstr[0].from,
-                                             GFP_KERNEL);
-                       break;
-               case Opt_name:
-                       args->name = kstrndup(argstr[0].from,
-                                             argstr[0].to-argstr[0].from,
-                                             GFP_KERNEL);
-                       break;
-               case Opt_secret:
-                       args->secret = kstrndup(argstr[0].from,
-                                               argstr[0].to-argstr[0].from,
-                                               GFP_KERNEL);
-                       break;
-
-                       /* misc */
-               case Opt_wsize:
-                       args->wsize = intval;
-                       break;
-               case Opt_rsize:
-                       args->rsize = intval;
-                       break;
-               case Opt_osdtimeout:
-                       args->osd_timeout = intval;
-                       break;
-               case Opt_osdkeepalivetimeout:
-                       args->osd_keepalive_timeout = intval;
-                       break;
-               case Opt_osd_idle_ttl:
-                       args->osd_idle_ttl = intval;
-                       break;
-               case Opt_mount_timeout:
-                       args->mount_timeout = intval;
-                       break;
-               case Opt_caps_wanted_delay_min:
-                       args->caps_wanted_delay_min = intval;
-                       break;
-               case Opt_caps_wanted_delay_max:
-                       args->caps_wanted_delay_max = intval;
-                       break;
-               case Opt_readdir_max_entries:
-                       args->max_readdir = intval;
-                       break;
-               case Opt_readdir_max_bytes:
-                       args->max_readdir_bytes = intval;
-                       break;
-               case Opt_congestion_kb:
-                       args->congestion_kb = intval;
-                       break;
-
-               case Opt_noshare:
-                       args->flags |= CEPH_OPT_NOSHARE;
-                       break;
-
-               case Opt_dirstat:
-                       args->flags |= CEPH_OPT_DIRSTAT;
-                       break;
-               case Opt_nodirstat:
-                       args->flags &= ~CEPH_OPT_DIRSTAT;
-                       break;
-               case Opt_rbytes:
-                       args->flags |= CEPH_OPT_RBYTES;
-                       break;
-               case Opt_norbytes:
-                       args->flags &= ~CEPH_OPT_RBYTES;
-                       break;
-               case Opt_nocrc:
-                       args->flags |= CEPH_OPT_NOCRC;
-                       break;
-               case Opt_noasyncreaddir:
-                       args->flags |= CEPH_OPT_NOASYNCREADDIR;
-                       break;
-
-               default:
-                       BUG_ON(token);
-               }
-       }
-       return args;
+       err = ceph_parse_options(popt, options, dev_name, dev_name_end,
+                                parse_fsopt_token, (void *)fsopt);
+       if (err)
+               goto out;
+
+       /* success */
+       *pfsopt = fsopt;
+       return 0;
  
  out:
-       kfree(args->mon_addr);
-       kfree(args);
-       return ERR_PTR(err);
+       destroy_mount_options(fsopt);
+       return err;
  }
  
-static void destroy_mount_args(struct ceph_mount_args *args)
+/**
+ * ceph_show_options - Show mount options in /proc/mounts
+ * @m: seq_file to write to
+ * @mnt: mount descriptor
+ */
+static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
  {
-       dout("destroy_mount_args %p\n", args);
-       kfree(args->snapdir_name);
-       args->snapdir_name = NULL;
-       kfree(args->name);
-       args->name = NULL;
-       kfree(args->secret);
-       args->secret = NULL;
-       kfree(args);
+       struct ceph_fs_client *fsc = ceph_sb_to_client(mnt->mnt_sb);
+       struct ceph_mount_options *fsopt = fsc->mount_options;
+       struct ceph_options *opt = fsc->client->options;
+
+       if (opt->flags & CEPH_OPT_FSID)
+               seq_printf(m, ",fsid=%pU", &opt->fsid);
+       if (opt->flags & CEPH_OPT_NOSHARE)
+               seq_puts(m, ",noshare");
+       if (opt->flags & CEPH_OPT_NOCRC)
+               seq_puts(m, ",nocrc");
+
+       if (opt->name)
+               seq_printf(m, ",name=%s", opt->name);
+       if (opt->secret)
+               seq_puts(m, ",secret=<hidden>");
+
+       if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
+               seq_printf(m, ",mount_timeout=%d", opt->mount_timeout);
+       if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
+               seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl);
+       if (opt->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
+               seq_printf(m, ",osdtimeout=%d", opt->osd_timeout);
+       if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
+               seq_printf(m, ",osdkeepalivetimeout=%d",
+                          opt->osd_keepalive_timeout);
+
+       if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT)
+               seq_puts(m, ",dirstat");
+       if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES) == 0)
+               seq_puts(m, ",norbytes");
+       if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR)
+               seq_puts(m, ",noasyncreaddir");
+
+       if (fsopt->wsize)
+               seq_printf(m, ",wsize=%d", fsopt->wsize);
+       if (fsopt->rsize != CEPH_MOUNT_RSIZE_DEFAULT)
+               seq_printf(m, ",rsize=%d", fsopt->rsize);
+       if (fsopt->congestion_kb != default_congestion_kb())
+               seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb);
+       if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
+               seq_printf(m, ",caps_wanted_delay_min=%d",
+                        fsopt->caps_wanted_delay_min);
+       if (fsopt->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
+               seq_printf(m, ",caps_wanted_delay_max=%d",
+                          fsopt->caps_wanted_delay_max);
+       if (fsopt->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
+               seq_printf(m, ",cap_release_safety=%d",
+                          fsopt->cap_release_safety);
+       if (fsopt->max_readdir != CEPH_MAX_READDIR_DEFAULT)
+               seq_printf(m, ",readdir_max_entries=%d", fsopt->max_readdir);
+       if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
+               seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes);
+       if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
+               seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name);
+       return 0;
  }
  
  /*
- * create a fresh client instance
+ * handle any mon messages the standard library doesn't understand.
+ * return error if we don't either.
   */
-static struct ceph_client *ceph_create_client(struct ceph_mount_args *args)
+static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg)
  {
-       struct ceph_client *client;
+       struct ceph_fs_client *fsc = client->private;
+       int type = le16_to_cpu(msg->hdr.type);
+
+       switch (type) {
+       case CEPH_MSG_MDS_MAP:
+               ceph_mdsc_handle_map(fsc->mdsc, msg);
+               return 0;
+
+       default:
+               return -1;
+       }
+}
+
+/*
+ * create a new fs client
+ */
+struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
+                                       struct ceph_options *opt)
+{
+       struct ceph_fs_client *fsc;
         int err = -ENOMEM;
  
-       client = kzalloc(sizeof(*client), GFP_KERNEL);
-       if (client == NULL)
+       fsc = kzalloc(sizeof(*fsc), GFP_KERNEL);
+       if (!fsc)
                 return ERR_PTR(-ENOMEM);
  
-       mutex_init(&client->mount_mutex);
-
-       init_waitqueue_head(&client->auth_wq);
+       fsc->client = ceph_create_client(opt, fsc);
+       if (IS_ERR(fsc->client)) {
+               err = PTR_ERR(fsc->client);
+               goto fail;
+       }
+       fsc->client->extra_mon_dispatch = extra_mon_dispatch;
+       fsc->client->supported_features |= CEPH_FEATURE_FLOCK;
+       fsc->client->monc.want_mdsmap = 1;
  
-       client->sb = NULL;
-       client->mount_state = CEPH_MOUNT_MOUNTING;
-       client->mount_args = args;
+       fsc->mount_options = fsopt;
  
-       client->msgr = NULL;
+       fsc->sb = NULL;
+       fsc->mount_state = CEPH_MOUNT_MOUNTING;
  
-       client->auth_err = 0;
-       atomic_long_set(&client->writeback_count, 0);
+       atomic_long_set(&fsc->writeback_count, 0);
  
-       err = bdi_init(&client->backing_dev_info);
+       err = bdi_init(&fsc->backing_dev_info);
         if (err < 0)
-               goto fail;
+               goto fail_client;
  
         err = -ENOMEM;
-       client->wb_wq = create_workqueue("ceph-writeback");
-       if (client->wb_wq == NULL)
+       fsc->wb_wq = create_workqueue("ceph-writeback");
+       if (fsc->wb_wq == NULL)
                 goto fail_bdi;
-       client->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid");
-       if (client->pg_inv_wq == NULL)
+       fsc->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid");
+       if (fsc->pg_inv_wq == NULL)
                 goto fail_wb_wq;
-       client->trunc_wq = create_singlethread_workqueue("ceph-trunc");
-       if (client->trunc_wq == NULL)
+       fsc->trunc_wq = create_singlethread_workqueue("ceph-trunc");
+       if (fsc->trunc_wq == NULL)
                 goto fail_pg_inv_wq;
  
         /* set up mempools */
         err = -ENOMEM;
-       client->wb_pagevec_pool = mempool_create_kmalloc_pool(10,
-                             client->mount_args->wsize >> PAGE_CACHE_SHIFT);
-       if (!client->wb_pagevec_pool)
+       fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10,
+                             fsc->mount_options->wsize >> PAGE_CACHE_SHIFT);
+       if (!fsc->wb_pagevec_pool)
                 goto fail_trunc_wq;
  
         /* caps */
-       client->min_caps = args->max_readdir;
+       fsc->min_caps = fsopt->max_readdir;
+
+       return fsc;
  
-       /* subsystems */
-       err = ceph_monc_init(&client->monc, client);
-       if (err < 0)
-               goto fail_mempool;
-       err = ceph_osdc_init(&client->osdc, client);
-       if (err < 0)
-               goto fail_monc;
-       err = ceph_mdsc_init(&client->mdsc, client);
-       if (err < 0)
-               goto fail_osdc;
-       return client;
-
-fail_osdc:
-       ceph_osdc_stop(&client->osdc);
-fail_monc:
-       ceph_monc_stop(&client->monc);
-fail_mempool:
-       mempool_destroy(client->wb_pagevec_pool);
  fail_trunc_wq:
-       destroy_workqueue(client->trunc_wq);
+       destroy_workqueue(fsc->trunc_wq);
  fail_pg_inv_wq:
-       destroy_workqueue(client->pg_inv_wq);
+       destroy_workqueue(fsc->pg_inv_wq);
  fail_wb_wq:
-       destroy_workqueue(client->wb_wq);
+       destroy_workqueue(fsc->wb_wq);
  fail_bdi:
-       bdi_destroy(&client->backing_dev_info);
+       bdi_destroy(&fsc->backing_dev_info);
+fail_client:
+       ceph_destroy_client(fsc->client);
  fail:
-       kfree(client);
+       kfree(fsc);
         return ERR_PTR(err);
  }
  
-static void ceph_destroy_client(struct ceph_client *client)
+void destroy_fs_client(struct ceph_fs_client *fsc)
  {
-       dout("destroy_client %p\n", client);
+       dout("destroy_fs_client %p\n", fsc);
  
-       /* unmount */
-       ceph_mdsc_stop(&client->mdsc);
-       ceph_osdc_stop(&client->osdc);
+       destroy_workqueue(fsc->wb_wq);
+       destroy_workqueue(fsc->pg_inv_wq);
+       destroy_workqueue(fsc->trunc_wq);
  
-       /*
-        * make sure mds and osd connections close out before destroying
-        * the auth module, which is needed to free those connections'
-        * ceph_authorizers.
-        */
-       ceph_msgr_flush();
-
-       ceph_monc_stop(&client->monc);
+       bdi_destroy(&fsc->backing_dev_info);
  
-       ceph_debugfs_client_cleanup(client);
-       destroy_workqueue(client->wb_wq);
-       destroy_workqueue(client->pg_inv_wq);
-       destroy_workqueue(client->trunc_wq);
+       mempool_destroy(fsc->wb_pagevec_pool);
  
-       bdi_destroy(&client->backing_dev_info);
+       destroy_mount_options(fsc->mount_options);
  
-       if (client->msgr)
-               ceph_messenger_destroy(client->msgr);
-       mempool_destroy(client->wb_pagevec_pool);
+       ceph_fs_debugfs_cleanup(fsc);
  
-       destroy_mount_args(client->mount_args);
+       ceph_destroy_client(fsc->client);
  
-       kfree(client);
-       dout("destroy_client %p done\n", client);
+       kfree(fsc);
+       dout("destroy_fs_client %p done\n", fsc);
  }
  
  /*
- * Initially learn our fsid, or verify an fsid matches.
+ * caches
   */
-int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
+struct kmem_cache *ceph_inode_cachep;
+struct kmem_cache *ceph_cap_cachep;
+struct kmem_cache *ceph_dentry_cachep;
+struct kmem_cache *ceph_file_cachep;
+
+static void ceph_inode_init_once(void *foo)
  {
-       if (client->have_fsid) {
-               if (ceph_fsid_compare(&client->fsid, fsid)) {
-                       pr_err("bad fsid, had %pU got %pU",
-                              &client->fsid, fsid);
-                       return -1;
-               }
-       } else {
-               pr_info("client%lld fsid %pU\n", client->monc.auth->global_id,
-                       fsid);
-               memcpy(&client->fsid, fsid, sizeof(*fsid));
-               ceph_debugfs_client_init(client);
-               client->have_fsid = true;
-       }
+       struct ceph_inode_info *ci = foo;
+       inode_init_once(&ci->vfs_inode);
+}
+
+static int __init init_caches(void)
+{
+       ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
+                                     sizeof(struct ceph_inode_info),
+                                     __alignof__(struct ceph_inode_info),
+                                     (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
+                                     ceph_inode_init_once);
+       if (ceph_inode_cachep == NULL)
+               return -ENOMEM;
+
+       ceph_cap_cachep = KMEM_CACHE(ceph_cap,
+                                    SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+       if (ceph_cap_cachep == NULL)
+               goto bad_cap;
+
+       ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
+                                       SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+       if (ceph_dentry_cachep == NULL)
+               goto bad_dentry;
+
+       ceph_file_cachep = KMEM_CACHE(ceph_file_info,
+                                     SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+       if (ceph_file_cachep == NULL)
+               goto bad_file;
+
         return 0;
+
+bad_file:
+       kmem_cache_destroy(ceph_dentry_cachep);
+bad_dentry:
+       kmem_cache_destroy(ceph_cap_cachep);
+bad_cap:
+       kmem_cache_destroy(ceph_inode_cachep);
+       return -ENOMEM;
  }
  
+static void destroy_caches(void)
+{
+       kmem_cache_destroy(ceph_inode_cachep);
+       kmem_cache_destroy(ceph_cap_cachep);
+       kmem_cache_destroy(ceph_dentry_cachep);
+       kmem_cache_destroy(ceph_file_cachep);
+}
+
+
  /*
- * true if we have the mon map (and have thus joined the cluster)
+ * ceph_umount_begin - initiate forced umount.  Tear down the
+ * mount, skipping steps that may hang while waiting for server(s).
   */
-static int have_mon_and_osd_map(struct ceph_client *client)
+static void ceph_umount_begin(struct super_block *sb)
  {
-       return client->monc.monmap && client->monc.monmap->epoch &&
-              client->osdc.osdmap && client->osdc.osdmap->epoch;
+       struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
+
+       dout("ceph_umount_begin - starting forced umount\n");
+       if (!fsc)
+               return;
+       fsc->mount_state = CEPH_MOUNT_SHUTDOWN;
+       return;
  }
  
+static const struct super_operations ceph_super_ops = {
+       .alloc_inode    = ceph_alloc_inode,
+       .destroy_inode  = ceph_destroy_inode,
+       .write_inode    = ceph_write_inode,
+       .sync_fs        = ceph_sync_fs,
+       .put_super      = ceph_put_super,
+       .show_options   = ceph_show_options,
+       .statfs         = ceph_statfs,
+       .umount_begin   = ceph_umount_begin,
+};
+
  /*
   * Bootstrap mount by opening the root directory.  Note the mount
   * @started time from caller, and time out if this takes too long.
   */
-static struct dentry *open_root_dentry(struct ceph_client *client,
+static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
                                        const char *path,
                                        unsigned long started)
  {
-       struct ceph_mds_client *mdsc = &client->mdsc;
+       struct ceph_mds_client *mdsc = fsc->mdsc;
         struct ceph_mds_request *req = NULL;
         int err;
         struct dentry *root;
@@ -784,14 +609,14 @@ static struct dentry *open_root_dentry(struct ceph_client *client,
         req->r_ino1.ino = CEPH_INO_ROOT;
         req->r_ino1.snap = CEPH_NOSNAP;
         req->r_started = started;
-       req->r_timeout = client->mount_args->mount_timeout * HZ;
+       req->r_timeout = fsc->client->options->mount_timeout * HZ;
         req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
         req->r_num_caps = 2;
         err = ceph_mdsc_do_request(mdsc, NULL, req);
         if (err == 0) {
                 dout("open_root_inode success\n");
                 if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT &&
-                   client->sb->s_root == NULL)
+                   fsc->sb->s_root == NULL)
                         root = d_alloc_root(req->r_target_inode);
                 else
                         root = d_obtain_alias(req->r_target_inode);
@@ -804,105 +629,86 @@ static struct dentry *open_root_dentry(struct ceph_client *client,
         return root;
  }
  
+
+
+
  /*
   * mount: join the ceph cluster, and open root directory.
   */
-static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
+static int ceph_mount(struct ceph_fs_client *fsc, struct vfsmount *mnt,
                       const char *path)
  {
-       struct ceph_entity_addr *myaddr = NULL;
         int err;
-       unsigned long timeout = client->mount_args->mount_timeout * HZ;
         unsigned long started = jiffies;  /* note the start time */
         struct dentry *root;
+       int first = 0;   /* first vfsmount for this super_block */
  
         dout("mount start\n");
-       mutex_lock(&client->mount_mutex);
-
-       /* initialize the messenger */
-       if (client->msgr == NULL) {
-               if (ceph_test_opt(client, MYIP))
-                       myaddr = &client->mount_args->my_addr;
-               client->msgr = ceph_messenger_create(myaddr);
-               if (IS_ERR(client->msgr)) {
-                       err = PTR_ERR(client->msgr);
-                       client->msgr = NULL;
-                       goto out;
-               }
-               client->msgr->nocrc = ceph_test_opt(client, NOCRC);
-       }
+       mutex_lock(&fsc->client->mount_mutex);
  
-       /* open session, and wait for mon, mds, and osd maps */
-       err = ceph_monc_open_session(&client->monc);
+       err = __ceph_open_session(fsc->client, started);
         if (err < 0)
                 goto out;
  
-       while (!have_mon_and_osd_map(client)) {
-               err = -EIO;
-               if (timeout && time_after_eq(jiffies, started + timeout))
-                       goto out;
-
-               /* wait */
-               dout("mount waiting for mon_map\n");
-               err = wait_event_interruptible_timeout(client->auth_wq,
-                      have_mon_and_osd_map(client) || (client->auth_err < 0),
-                      timeout);
-               if (err == -EINTR || err == -ERESTARTSYS)
-                       goto out;
-               if (client->auth_err < 0) {
-                       err = client->auth_err;
-                       goto out;
-               }
-       }
-
         dout("mount opening root\n");
-       root = open_root_dentry(client, "", started);
+       root = open_root_dentry(fsc, "", started);
         if (IS_ERR(root)) {
                 err = PTR_ERR(root);
                 goto out;
         }
-       if (client->sb->s_root)
+       if (fsc->sb->s_root) {
                 dput(root);
-       else
-               client->sb->s_root = root;
+       } else {
+               fsc->sb->s_root = root;
+               first = 1;
+
+               err = ceph_fs_debugfs_init(fsc);
+               if (err < 0)
+                       goto fail;
+       }
  
         if (path[0] == 0) {
                 dget(root);
         } else {
                 dout("mount opening base mountpoint\n");
-               root = open_root_dentry(client, path, started);
+               root = open_root_dentry(fsc, path, started);
                 if (IS_ERR(root)) {
                         err = PTR_ERR(root);
-                       dput(client->sb->s_root);
-                       client->sb->s_root = NULL;
-                       goto out;
+                       goto fail;
                 }
         }
  
         mnt->mnt_root = root;
-       mnt->mnt_sb = client->sb;
+       mnt->mnt_sb = fsc->sb;
  
-       client->mount_state = CEPH_MOUNT_MOUNTED;
+       fsc->mount_state = CEPH_MOUNT_MOUNTED;
         dout("mount success\n");
         err = 0;
  
  out:
-       mutex_unlock(&client->mount_mutex);
+       mutex_unlock(&fsc->client->mount_mutex);
         return err;
+
+fail:
+       if (first) {
+               dput(fsc->sb->s_root);
+               fsc->sb->s_root = NULL;
+       }
+       goto out;
  }
  
  static int ceph_set_super(struct super_block *s, void *data)
  {
-       struct ceph_client *client = data;
+       struct ceph_fs_client *fsc = data;
         int ret;
  
         dout("set_super %p data %p\n", s, data);
  
-       s->s_flags = client->mount_args->sb_flags;
+       s->s_flags = fsc->mount_options->sb_flags;
         s->s_maxbytes = 1ULL << 40;  /* temp value until we get mdsmap */
  
-       s->s_fs_info = client;
-       client->sb = s;
+       s->s_fs_info = fsc;
+       fsc->sb = s;
  
         s->s_op = &ceph_super_ops;
         s->s_export_op = &ceph_export_ops;
@@ -917,7 +723,7 @@ static int ceph_set_super(struct super_block *s, void *data)
  
  fail:
         s->s_fs_info = NULL;
-       client->sb = NULL;
+       fsc->sb = NULL;
         return ret;
  }
  
@@ -926,30 +732,23 @@ fail:
   */
  static int ceph_compare_super(struct super_block *sb, void *data)
  {
-       struct ceph_client *new = data;
-       struct ceph_mount_args *args = new->mount_args;
-       struct ceph_client *other = ceph_sb_to_client(sb);
-       int i;
+       struct ceph_fs_client *new = data;
+       struct ceph_mount_options *fsopt = new->mount_options;
+       struct ceph_options *opt = new->client->options;
+       struct ceph_fs_client *other = ceph_sb_to_client(sb);
  
         dout("ceph_compare_super %p\n", sb);
-       if (args->flags & CEPH_OPT_FSID) {
-               if (ceph_fsid_compare(&args->fsid, &other->fsid)) {
-                       dout("fsid doesn't match\n");
-                       return 0;
-               }
-       } else {
-               /* do we share (a) monitor? */
-               for (i = 0; i < new->monc.monmap->num_mon; i++)
-                       if (ceph_monmap_contains(other->monc.monmap,
-                                        &new->monc.monmap->mon_inst[i].addr))
-                               break;
-               if (i == new->monc.monmap->num_mon) {
-                       dout("mon ip not part of monmap\n");
-                       return 0;
-               }
-               dout("mon ip matches existing sb %p\n", sb);
+
+       if (compare_mount_options(fsopt, opt, other)) {
+               dout("monitor(s)/mount options don't match\n");
+               return 0;
         }
-       if (args->sb_flags != other->mount_args->sb_flags) {
+       if ((opt->flags & CEPH_OPT_FSID) &&
+           ceph_fsid_compare(&opt->fsid, &other->client->fsid)) {
+               dout("fsid doesn't match\n");
+               return 0;
+       }
+       if (fsopt->sb_flags != other->mount_options->sb_flags) {
                 dout("flags differ\n");
                 return 0;
         }
@@ -961,19 +760,20 @@ static int ceph_compare_super(struct super_block *sb, void *data)
   */
  static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
  
-static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
+static int ceph_register_bdi(struct super_block *sb,
+                            struct ceph_fs_client *fsc)
  {
         int err;
  
         /* set ra_pages based on rsize mount option? */
-       if (client->mount_args->rsize >= PAGE_CACHE_SIZE)
-               client->backing_dev_info.ra_pages =
-                       (client->mount_args->rsize + PAGE_CACHE_SIZE - 1)
+       if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE)
+               fsc->backing_dev_info.ra_pages =
+                       (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1)
                         >> PAGE_SHIFT;
-       err = bdi_register(&client->backing_dev_info, NULL, "ceph-%d",
+       err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%d",
                            atomic_long_inc_return(&bdi_seq));
         if (!err)
-               sb->s_bdi = &client->backing_dev_info;
+               sb->s_bdi = &fsc->backing_dev_info;
         return err;
  }
  
@@ -982,46 +782,52 @@ static int ceph_get_sb(struct file_system_type *fs_type,
                        struct vfsmount *mnt)
  {
         struct super_block *sb;
-       struct ceph_client *client;
+       struct ceph_fs_client *fsc;
         int err;
         int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
         const char *path = NULL;
-       struct ceph_mount_args *args;
+       struct ceph_mount_options *fsopt = NULL;
+       struct ceph_options *opt = NULL;
  
         dout("ceph_get_sb\n");
-       args = parse_mount_args(flags, data, dev_name, &path);
-       if (IS_ERR(args)) {
-               err = PTR_ERR(args);
+       err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path);
+       if (err < 0)
                 goto out_final;
-       }
  
         /* create client (which we may/may not use) */
-       client = ceph_create_client(args);
-       if (IS_ERR(client)) {
-               err = PTR_ERR(client);
+       fsc = create_fs_client(fsopt, opt);
+       if (IS_ERR(fsc)) {
+               err = PTR_ERR(fsc);
+               kfree(fsopt);
+               kfree(opt);
                 goto out_final;
         }
  
-       if (client->mount_args->flags & CEPH_OPT_NOSHARE)
+       err = ceph_mdsc_init(fsc);
+       if (err < 0)
+               goto out;
+
+       if (ceph_test_opt(fsc->client, NOSHARE))
                 compare_super = NULL;
-       sb = sget(fs_type, compare_super, ceph_set_super, client);
+       sb = sget(fs_type, compare_super, ceph_set_super, fsc);
         if (IS_ERR(sb)) {
                 err = PTR_ERR(sb);
                 goto out;
         }
  
-       if (ceph_sb_to_client(sb) != client) {
-               ceph_destroy_client(client);
-               client = ceph_sb_to_client(sb);
-               dout("get_sb got existing client %p\n", client);
+       if (ceph_sb_to_client(sb) != fsc) {
+               ceph_mdsc_destroy(fsc);
+               destroy_fs_client(fsc);
+               fsc = ceph_sb_to_client(sb);
+               dout("get_sb got existing client %p\n", fsc);
         } else {
-               dout("get_sb using new client %p\n", client);
-               err = ceph_register_bdi(sb, client);
+               dout("get_sb using new client %p\n", fsc);
+               err = ceph_register_bdi(sb, fsc);
                 if (err < 0)
                         goto out_splat;
         }
  
-       err = ceph_mount(client, mnt, path);
+       err = ceph_mount(fsc, mnt, path);
         if (err < 0)
                 goto out_splat;
         dout("root %p inode %p ino %llx.%llx\n", mnt->mnt_root,
@@ -1029,12 +835,13 @@ static int ceph_get_sb(struct file_system_type *fs_type,
         return 0;
  
  out_splat:
-       ceph_mdsc_close_sessions(&client->mdsc);
+       ceph_mdsc_close_sessions(fsc->mdsc);
         deactivate_locked_super(sb);
         goto out_final;
  
  out:
-       ceph_destroy_client(client);
+       ceph_mdsc_destroy(fsc);
+       destroy_fs_client(fsc);
  out_final:
         dout("ceph_get_sb fail %d\n", err);
         return err;
@@ -1042,11 +849,12 @@ out_final:
  
  static void ceph_kill_sb(struct super_block *s)
  {
-       struct ceph_client *client = ceph_sb_to_client(s);
+       struct ceph_fs_client *fsc = ceph_sb_to_client(s);
         dout("kill_sb %p\n", s);
-       ceph_mdsc_pre_umount(&client->mdsc);
+       ceph_mdsc_pre_umount(fsc->mdsc);
         kill_anon_super(s);    /* will call put_super after sb is r/o */
-       ceph_destroy_client(client);
+       ceph_mdsc_destroy(fsc);
+       destroy_fs_client(fsc);
  }
  
  static struct file_system_type ceph_fs_type = {
@@ -1062,36 +870,20 @@ static struct file_system_type ceph_fs_type = {
  
  static int __init init_ceph(void)
  {
-       int ret = 0;
-
-       ret = ceph_debugfs_init();
-       if (ret < 0)
-               goto out;
-
-       ret = ceph_msgr_init();
-       if (ret < 0)
-               goto out_debugfs;
-
-       ret = init_caches();
+       int ret = init_caches();
         if (ret)
-               goto out_msgr;
+               goto out;
  
         ret = register_filesystem(&ceph_fs_type);
         if (ret)
                 goto out_icache;
  
-       pr_info("loaded (mon/mds/osd proto %d/%d/%d, osdmap %d/%d %d/%d)\n",
-               CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL,
-               CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT,
-               CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT);
+       pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL);
+
         return 0;
  
  out_icache:
         destroy_caches();
-out_msgr:
-       ceph_msgr_exit();
-out_debugfs:
-       ceph_debugfs_cleanup();
  out:
         return ret;
  }
@@ -1101,8 +893,6 @@ static void __exit exit_ceph(void)
         dout("exit_ceph\n");
         unregister_filesystem(&ceph_fs_type);
         destroy_caches();
-       ceph_msgr_exit();
-       ceph_debugfs_cleanup();
  }
  
  module_init(init_ceph);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h

index b87638e..1886294 100644 (file)
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -1,7 +1,7 @@
  #ifndef _FS_CEPH_SUPER_H
  #define _FS_CEPH_SUPER_H
  
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
  
  #include <asm/unaligned.h>
  #include <linux/backing-dev.h>
@@ -14,13 +14,7 @@
  #include <linux/writeback.h>
  #include <linux/slab.h>
  
-#include "types.h"
-#include "messenger.h"
-#include "msgpool.h"
-#include "mon_client.h"
-#include "mds_client.h"
-#include "osd_client.h"
-#include "ceph_fs.h"
+#include <linux/ceph/libceph.h>
  
  /* f_type in struct statfs */
  #define CEPH_SUPER_MAGIC 0x00c36400
@@ -30,42 +24,25 @@
  #define CEPH_BLOCK_SHIFT   20  /* 1 MB */
  #define CEPH_BLOCK         (1 << CEPH_BLOCK_SHIFT)
  
-/*
- * Supported features
- */
-#define CEPH_FEATURE_SUPPORTED CEPH_FEATURE_NOSRCADDR | CEPH_FEATURE_FLOCK
-#define CEPH_FEATURE_REQUIRED  CEPH_FEATURE_NOSRCADDR
+#define CEPH_MOUNT_OPT_DIRSTAT         (1<<4) /* `cat dirname` for stats */
+#define CEPH_MOUNT_OPT_RBYTES          (1<<5) /* dir st_bytes = rbytes */
+#define CEPH_MOUNT_OPT_NOASYNCREADDIR  (1<<7) /* no dcache readdir */
  
-/*
- * mount options
- */
-#define CEPH_OPT_FSID             (1<<0)
-#define CEPH_OPT_NOSHARE          (1<<1) /* don't share client with other sbs */
-#define CEPH_OPT_MYIP             (1<<2) /* specified my ip */
-#define CEPH_OPT_DIRSTAT          (1<<4) /* funky `cat dirname` for stats */
-#define CEPH_OPT_RBYTES           (1<<5) /* dir st_bytes = rbytes */
-#define CEPH_OPT_NOCRC            (1<<6) /* no data crc on writes */
-#define CEPH_OPT_NOASYNCREADDIR   (1<<7) /* no dcache readdir */
+#define CEPH_MOUNT_OPT_DEFAULT    (CEPH_MOUNT_OPT_RBYTES)
  
-#define CEPH_OPT_DEFAULT   (CEPH_OPT_RBYTES)
+#define ceph_set_mount_opt(fsc, opt) \
+       (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt;
+#define ceph_test_mount_opt(fsc, opt) \
+       (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt))
  
-#define ceph_set_opt(client, opt) \
-       (client)->mount_args->flags |= CEPH_OPT_##opt;
-#define ceph_test_opt(client, opt) \
-       (!!((client)->mount_args->flags & CEPH_OPT_##opt))
+#define CEPH_MAX_READDIR_DEFAULT        1024
+#define CEPH_MAX_READDIR_BYTES_DEFAULT  (512*1024)
+#define CEPH_SNAPDIRNAME_DEFAULT        ".snap"
  
-
-struct ceph_mount_args {
-       int sb_flags;
+struct ceph_mount_options {
         int flags;
-       struct ceph_fsid fsid;
-       struct ceph_entity_addr my_addr;
-       int num_mon;
-       struct ceph_entity_addr *mon_addr;
-       int mount_timeout;
-       int osd_idle_ttl;
-       int osd_timeout;
-       int osd_keepalive_timeout;
+       int sb_flags;
+
         int wsize;
         int rsize;            /* max readahead */
         int congestion_kb;    /* max writeback in flight */
@@ -73,82 +50,25 @@ struct ceph_mount_args {
         int cap_release_safety;
         int max_readdir;       /* max readdir result (entires) */
         int max_readdir_bytes; /* max readdir result (bytes) */
-       char *snapdir_name;   /* default ".snap" */
-       char *name;
-       char *secret;
-};
  
-/*
- * defaults
- */
-#define CEPH_MOUNT_TIMEOUT_DEFAULT  60
-#define CEPH_OSD_TIMEOUT_DEFAULT    60  /* seconds */
-#define CEPH_OSD_KEEPALIVE_DEFAULT  5
-#define CEPH_OSD_IDLE_TTL_DEFAULT    60
-#define CEPH_MOUNT_RSIZE_DEFAULT    (512*1024) /* readahead */
-#define CEPH_MAX_READDIR_DEFAULT    1024
-#define CEPH_MAX_READDIR_BYTES_DEFAULT    (512*1024)
-
-#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
-#define CEPH_MSG_MAX_DATA_LEN  (16*1024*1024)
-
-#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
-#define CEPH_AUTH_NAME_DEFAULT   "guest"
-/*
- * Delay telling the MDS we no longer want caps, in case we reopen
- * the file.  Delay a minimum amount of time, even if we send a cap
- * message for some other reason.  Otherwise, take the oppotunity to
- * update the mds to avoid sending another message later.
- */
-#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT      5  /* cap release delay */
-#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT     60  /* cap release delay */
-
-#define CEPH_CAP_RELEASE_SAFETY_DEFAULT        (CEPH_CAPS_PER_RELEASE * 4)
-
-/* mount state */
-enum {
-       CEPH_MOUNT_MOUNTING,
-       CEPH_MOUNT_MOUNTED,
-       CEPH_MOUNT_UNMOUNTING,
-       CEPH_MOUNT_UNMOUNTED,
-       CEPH_MOUNT_SHUTDOWN,
-};
-
-/*
- * subtract jiffies
- */
-static inline unsigned long time_sub(unsigned long a, unsigned long b)
-{
-       BUG_ON(time_after(b, a));
-       return (long)a - (long)b;
-}
-
-/*
- * per-filesystem client state
- *
- * possibly shared by multiple mount points, if they are
- * mounting the same ceph filesystem/cluster.
- */
-struct ceph_client {
-       struct ceph_fsid fsid;
-       bool have_fsid;
+       /*
+        * everything above this point can be memcmp'd; everything below
+        * is handled in compare_mount_options()
+        */
  
-       struct mutex mount_mutex;       /* serialize mount attempts */
-       struct ceph_mount_args *mount_args;
+       char *snapdir_name;   /* default ".snap" */
+};
  
+struct ceph_fs_client {
         struct super_block *sb;
  
-       unsigned long mount_state;
-       wait_queue_head_t auth_wq;
-
-       int auth_err;
+       struct ceph_mount_options *mount_options;
+       struct ceph_client *client;
  
+       unsigned long mount_state;
         int min_caps;                  /* min caps i added */
  
-       struct ceph_messenger *msgr;   /* messenger instance */
-       struct ceph_mon_client monc;
-       struct ceph_mds_client mdsc;
-       struct ceph_osd_client osdc;
+       struct ceph_mds_client *mdsc;
  
         /* writeback */
         mempool_t *wb_pagevec_pool;
@@ -160,14 +80,14 @@ struct ceph_client {
         struct backing_dev_info backing_dev_info;
  
  #ifdef CONFIG_DEBUG_FS
-       struct dentry *debugfs_monmap;
-       struct dentry *debugfs_mdsmap, *debugfs_osdmap;
-       struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps;
+       struct dentry *debugfs_dentry_lru, *debugfs_caps;
         struct dentry *debugfs_congestion_kb;
         struct dentry *debugfs_bdi;
+       struct dentry *debugfs_mdsc, *debugfs_mdsmap;
  #endif
  };
  
+
  /*
   * File i/o capability.  This tracks shared state with the metadata
   * server that allows us to cache or writeback attributes or to read
@@ -275,6 +195,20 @@ struct ceph_inode_xattr {
         int should_free_val;
  };
  
+/*
+ * Ceph dentry state
+ */
+struct ceph_dentry_info {
+       struct ceph_mds_session *lease_session;
+       u32 lease_gen, lease_shared_gen;
+       u32 lease_seq;
+       unsigned long lease_renew_after, lease_renew_from;
+       struct list_head lru;
+       struct dentry *dentry;
+       u64 time;
+       u64 offset;
+};
+
  struct ceph_inode_xattrs_info {
         /*
          * (still encoded) xattr blob. we avoid the overhead of parsing
@@ -296,11 +230,6 @@ struct ceph_inode_xattrs_info {
  /*
   * Ceph inode.
   */
-#define CEPH_I_COMPLETE  1  /* we have complete directory cached */
-#define CEPH_I_NODELAY   4  /* do not delay cap release */
-#define CEPH_I_FLUSH     8  /* do not delay flush of dirty metadata */
-#define CEPH_I_NOFLUSH  16  /* do not flush dirty caps */
-
  struct ceph_inode_info {
         struct ceph_vino i_vino;   /* ceph ino + snap */
  
@@ -391,6 +320,63 @@ static inline struct ceph_inode_info *ceph_inode(struct inode *inode)
         return container_of(inode, struct ceph_inode_info, vfs_inode);
  }
  
+static inline struct ceph_vino ceph_vino(struct inode *inode)
+{
+       return ceph_inode(inode)->i_vino;
+}
+
+/*
+ * ino_t is <64 bits on many architectures, blech.
+ *
+ * don't include snap in ino hash, at least for now.
+ */
+static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
+{
+       ino_t ino = (ino_t)vino.ino;  /* ^ (vino.snap << 20); */
+#if BITS_PER_LONG == 32
+       ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8;
+       if (!ino)
+               ino = 1;
+#endif
+       return ino;
+}
+
+/* for printf-style formatting */
+#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
+
+static inline u64 ceph_ino(struct inode *inode)
+{
+       return ceph_inode(inode)->i_vino.ino;
+}
+static inline u64 ceph_snap(struct inode *inode)
+{
+       return ceph_inode(inode)->i_vino.snap;
+}
+
+static inline int ceph_ino_compare(struct inode *inode, void *data)
+{
+       struct ceph_vino *pvino = (struct ceph_vino *)data;
+       struct ceph_inode_info *ci = ceph_inode(inode);
+       return ci->i_vino.ino == pvino->ino &&
+               ci->i_vino.snap == pvino->snap;
+}
+
+static inline struct inode *ceph_find_inode(struct super_block *sb,
+                                           struct ceph_vino vino)
+{
+       ino_t t = ceph_vino_to_ino(vino);
+       return ilookup5(sb, t, ceph_ino_compare, &vino);
+}
+
+
+/*
+ * Ceph inode.
+ */
+#define CEPH_I_COMPLETE  1  /* we have complete directory cached */
+#define CEPH_I_NODELAY   4  /* do not delay cap release */
+#define CEPH_I_FLUSH     8  /* do not delay flush of dirty metadata */
+#define CEPH_I_NOFLUSH  16  /* do not flush dirty caps */
+
  static inline void ceph_i_clear(struct inode *inode, unsigned mask)
  {
         struct ceph_inode_info *ci = ceph_inode(inode);
@@ -414,8 +400,9 @@ static inline bool ceph_i_test(struct inode *inode, unsigned mask)
         struct ceph_inode_info *ci = ceph_inode(inode);
         bool r;
  
-       smp_mb();
+       spin_lock(&inode->i_lock);
         r = (ci->i_ceph_flags & mask) == mask;
+       spin_unlock(&inode->i_lock);
         return r;
  }
  
@@ -432,20 +419,6 @@ extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
                             struct ceph_inode_frag *pfrag,
                             int *found);
  
-/*
- * Ceph dentry state
- */
-struct ceph_dentry_info {
-       struct ceph_mds_session *lease_session;
-       u32 lease_gen, lease_shared_gen;
-       u32 lease_seq;
-       unsigned long lease_renew_after, lease_renew_from;
-       struct list_head lru;
-       struct dentry *dentry;
-       u64 time;
-       u64 offset;
-};
-
  static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
  {
         return (struct ceph_dentry_info *)dentry->d_fsdata;
@@ -456,22 +429,6 @@ static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
         return ((loff_t)frag << 32) | (loff_t)off;
  }
  
-/*
- * ino_t is <64 bits on many architectures, blech.
- *
- * don't include snap in ino hash, at least for now.
- */
-static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
-{
-       ino_t ino = (ino_t)vino.ino;  /* ^ (vino.snap << 20); */
-#if BITS_PER_LONG == 32
-       ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8;
-       if (!ino)
-               ino = 1;
-#endif
-       return ino;
-}
-
  static inline int ceph_set_ino_cb(struct inode *inode, void *data)
  {
         ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
@@ -479,39 +436,6 @@ static inline int ceph_set_ino_cb(struct inode *inode, void *data)
         return 0;
  }
  
-static inline struct ceph_vino ceph_vino(struct inode *inode)
-{
-       return ceph_inode(inode)->i_vino;
-}
-
-/* for printf-style formatting */
-#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
-
-static inline u64 ceph_ino(struct inode *inode)
-{
-       return ceph_inode(inode)->i_vino.ino;
-}
-static inline u64 ceph_snap(struct inode *inode)
-{
-       return ceph_inode(inode)->i_vino.snap;
-}
-
-static inline int ceph_ino_compare(struct inode *inode, void *data)
-{
-       struct ceph_vino *pvino = (struct ceph_vino *)data;
-       struct ceph_inode_info *ci = ceph_inode(inode);
-       return ci->i_vino.ino == pvino->ino &&
-               ci->i_vino.snap == pvino->snap;
-}
-
-static inline struct inode *ceph_find_inode(struct super_block *sb,
-                                           struct ceph_vino vino)
-{
-       ino_t t = ceph_vino_to_ino(vino);
-       return ilookup5(sb, t, ceph_ino_compare, &vino);
-}
-
-
  /*
   * caps helpers
   */
@@ -576,18 +500,18 @@ extern int ceph_reserve_caps(struct ceph_mds_client *mdsc,
                              struct ceph_cap_reservation *ctx, int need);
  extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
                                struct ceph_cap_reservation *ctx);
-extern void ceph_reservation_status(struct ceph_client *client,
+extern void ceph_reservation_status(struct ceph_fs_client *client,
                                     int *total, int *avail, int *used,
                                     int *reserved, int *min);
  
-static inline struct ceph_client *ceph_inode_to_client(struct inode *inode)
+static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode)
  {
-       return (struct ceph_client *)inode->i_sb->s_fs_info;
+       return (struct ceph_fs_client *)inode->i_sb->s_fs_info;
  }
  
-static inline struct ceph_client *ceph_sb_to_client(struct super_block *sb)
+static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb)
  {
-       return (struct ceph_client *)sb->s_fs_info;
+       return (struct ceph_fs_client *)sb->s_fs_info;
  }
  
  
@@ -616,51 +540,6 @@ struct ceph_file_info {
  
  
  
-/*
- * snapshots
- */
-
-/*
- * A "snap context" is the set of existing snapshots when we
- * write data.  It is used by the OSD to guide its COW behavior.
- *
- * The ceph_snap_context is refcounted, and attached to each dirty
- * page, indicating which context the dirty data belonged when it was
- * dirtied.
- */
-struct ceph_snap_context {
-       atomic_t nref;
-       u64 seq;
-       int num_snaps;
-       u64 snaps[];
-};
-
-static inline struct ceph_snap_context *
-ceph_get_snap_context(struct ceph_snap_context *sc)
-{
-       /*
-       printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
-              atomic_read(&sc->nref)+1);
-       */
-       if (sc)
-               atomic_inc(&sc->nref);
-       return sc;
-}
-
-static inline void ceph_put_snap_context(struct ceph_snap_context *sc)
-{
-       if (!sc)
-               return;
-       /*
-       printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
-              atomic_read(&sc->nref)-1);
-       */
-       if (atomic_dec_and_test(&sc->nref)) {
-               /*printk(" deleting snap_context %p\n", sc);*/
-               kfree(sc);
-       }
-}
-
  /*
   * A "snap realm" describes a subset of the file hierarchy sharing
   * the same set of snapshots that apply to it.  The realms themselves
@@ -699,16 +578,33 @@ struct ceph_snap_realm {
         spinlock_t inodes_with_caps_lock;
  };
  
-
-
-/*
- * calculate the number of pages a given length and offset map onto,
- * if we align the data.
- */
-static inline int calc_pages_for(u64 off, u64 len)
+static inline int default_congestion_kb(void)
  {
-       return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) -
-               (off >> PAGE_CACHE_SHIFT);
+       int congestion_kb;
+
+       /*
+        * Copied from NFS
+        *
+        * congestion size, scale with available memory.
+        *
+        *  64MB:    8192k
+        * 128MB:   11585k
+        * 256MB:   16384k
+        * 512MB:   23170k
+        *   1GB:   32768k
+        *   2GB:   46340k
+        *   4GB:   65536k
+        *   8GB:   92681k
+        *  16GB:  131072k
+        *
+        * This allows larger machines to have larger/more transfers.
+        * Limit the default to 256M
+        */
+       congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
+       if (congestion_kb > 256*1024)
+               congestion_kb = 256*1024;
+
+       return congestion_kb;
  }
  
  
@@ -741,16 +637,6 @@ static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
                            ci_item)->writing;
  }
  
-
-/* super.c */
-extern struct kmem_cache *ceph_inode_cachep;
-extern struct kmem_cache *ceph_cap_cachep;
-extern struct kmem_cache *ceph_dentry_cachep;
-extern struct kmem_cache *ceph_file_cachep;
-
-extern const char *ceph_msg_type_name(int type);
-extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
-
  /* inode.c */
  extern const struct inode_operations ceph_file_iops;
  
@@ -857,12 +743,18 @@ extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
  /* file.c */
  extern const struct file_operations ceph_file_fops;
  extern const struct address_space_operations ceph_aops;
+extern int ceph_copy_to_page_vector(struct page **pages,
+                                   const char *data,
+                                   loff_t off, size_t len);
+extern int ceph_copy_from_page_vector(struct page **pages,
+                                   char *data,
+                                   loff_t off, size_t len);
+extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags);
  extern int ceph_open(struct inode *inode, struct file *file);
  extern struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
                                        struct nameidata *nd, int mode,
                                        int locked_dir);
  extern int ceph_release(struct inode *inode, struct file *filp);
-extern void ceph_release_page_vector(struct page **pages, int num_pages);
  
  /* dir.c */
  extern const struct file_operations ceph_dir_fops;
@@ -892,12 +784,6 @@ extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
  /* export.c */
  extern const struct export_operations ceph_export_ops;
  
-/* debugfs.c */
-extern int ceph_debugfs_init(void);
-extern void ceph_debugfs_cleanup(void);
-extern int ceph_debugfs_client_init(struct ceph_client *client);
-extern void ceph_debugfs_client_cleanup(struct ceph_client *client);
-
  /* locks.c */
  extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl);
  extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl);
@@ -914,4 +800,8 @@ static inline struct inode *get_dentry_parent_inode(struct dentry *dentry)
         return NULL;
  }
  
+/* debugfs.c */
+extern int ceph_fs_debugfs_init(struct ceph_fs_client *client);
+extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client);
+
  #endif /* _FS_CEPH_SUPER_H */
diff --git a/fs/ceph/types.h b/fs/ceph/types.h

deleted file mode 100644 (file)

index 28b35a0..0000000
--- a/fs/ceph/types.h
+++ /dev/null
@@ -1,29 +0,0 @@
-#ifndef _FS_CEPH_TYPES_H
-#define _FS_CEPH_TYPES_H
-
-/* needed before including ceph_fs.h */
-#include <linux/in.h>
-#include <linux/types.h>
-#include <linux/fcntl.h>
-#include <linux/string.h>
-
-#include "ceph_fs.h"
-#include "ceph_frag.h"
-#include "ceph_hash.h"
-
-/*
- * Identify inodes by both their ino AND snapshot id (a u64).
- */
-struct ceph_vino {
-       u64 ino;
-       u64 snap;
-};
-
-
-/* context for the caps reservation mechanism */
-struct ceph_cap_reservation {
-       int count;
-};
-
-
-#endif
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c

index 9578af6..6e12a6b 100644 (file)
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -1,6 +1,9 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
+
  #include "super.h"
-#include "decode.h"
+#include "mds_client.h"
+
+#include <linux/ceph/decode.h>
  
  #include <linux/xattr.h>
  #include <linux/slab.h>
@@ -620,12 +623,12 @@ out:
  static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
                               const char *value, size_t size, int flags)
  {
-       struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
+       struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
         struct inode *inode = dentry->d_inode;
         struct ceph_inode_info *ci = ceph_inode(inode);
         struct inode *parent_inode = dentry->d_parent->d_inode;
         struct ceph_mds_request *req;
-       struct ceph_mds_client *mdsc = &client->mdsc;
+       struct ceph_mds_client *mdsc = fsc->mdsc;
         int err;
         int i, nr_pages;
         struct page **pages = NULL;
@@ -713,10 +716,9 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
  
         /* preallocate memory for xattr name, value, index node */
         err = -ENOMEM;
-       newname = kmalloc(name_len + 1, GFP_NOFS);
+       newname = kmemdup(name, name_len + 1, GFP_NOFS);
         if (!newname)
                 goto out;
-       memcpy(newname, name, name_len + 1);
  
         if (val_len) {
                 newval = kmalloc(val_len + 1, GFP_NOFS);
@@ -777,8 +779,8 @@ out:
  
  static int ceph_send_removexattr(struct dentry *dentry, const char *name)
  {
-       struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
-       struct ceph_mds_client *mdsc = &client->mdsc;
+       struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
+       struct ceph_mds_client *mdsc = fsc->mdsc;
         struct inode *inode = dentry->d_inode;
         struct inode *parent_inode = dentry->d_parent->d_inode;
         struct ceph_mds_request *req;
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig

index cc96655..c465ae0 100644 (file)
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -1,6 +1,6 @@
  config GFS2_FS
         tristate "GFS2 file system support"
-       depends on EXPERIMENTAL && (64BIT || LBDAF)
+       depends on (64BIT || LBDAF)
         select DLM if GFS2_FS_LOCKING_DLM
         select CONFIGFS_FS if GFS2_FS_LOCKING_DLM
         select SYSFS if GFS2_FS_LOCKING_DLM
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c

index 194fe16..6b24afb 100644 (file)
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -36,8 +36,8 @@
  #include "glops.h"
  
  
-static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
-                                  unsigned int from, unsigned int to)
+void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
+                           unsigned int from, unsigned int to)
  {
         struct buffer_head *head = page_buffers(page);
         unsigned int bsize = head->b_size;
@@ -615,7 +615,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
         unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
         int alloc_required;
         int error = 0;
-       struct gfs2_alloc *al;
+       struct gfs2_alloc *al = NULL;
         pgoff_t index = pos >> PAGE_CACHE_SHIFT;
         unsigned from = pos & (PAGE_CACHE_SIZE - 1);
         unsigned to = from + len;
@@ -663,6 +663,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
                 rblocks += RES_STATFS + RES_QUOTA;
         if (&ip->i_inode == sdp->sd_rindex)
                 rblocks += 2 * RES_STATFS;
+       if (alloc_required)
+               rblocks += gfs2_rg_blocks(al);
  
         error = gfs2_trans_begin(sdp, rblocks,
                                  PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
@@ -696,13 +698,11 @@ out:
  
         page_cache_release(page);
  
-       /*
-        * XXX(truncate): the call below should probably be replaced with
-        * a call to the gfs2-specific truncate blocks helper to actually
-        * release disk blocks..
-        */
+       gfs2_trans_end(sdp);
         if (pos + len > ip->i_inode.i_size)
-               truncate_setsize(&ip->i_inode, ip->i_inode.i_size);
+               gfs2_trim_blocks(&ip->i_inode);
+       goto out_trans_fail;
+
  out_endtrans:
         gfs2_trans_end(sdp);
  out_trans_fail:
@@ -802,10 +802,8 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
         page_cache_release(page);
  
         if (copied) {
-               if (inode->i_size < to) {
+               if (inode->i_size < to)
                         i_size_write(inode, to);
-                       ip->i_disksize = inode->i_size;
-               }
                 gfs2_dinode_out(ip, di);
                 mark_inode_dirty(inode);
         }
@@ -876,8 +874,6 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
  
         ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
         if (ret > 0) {
-               if (inode->i_size > ip->i_disksize)
-                       ip->i_disksize = inode->i_size;
                 gfs2_dinode_out(ip, dibh->b_data);
                 mark_inode_dirty(inode);
         }
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c

index 6f48280..5476c06 100644 (file)
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -50,7 +50,7 @@ struct strip_mine {
   * @ip: the inode
   * @dibh: the dinode buffer
   * @block: the block number that was allocated
- * @private: any locked page held by the caller process
+ * @page: The (optional) page. This is looked up if @page is NULL
   *
   * Returns: errno
   */
@@ -109,8 +109,7 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
  /**
   * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
   * @ip: The GFS2 inode to unstuff
- * @unstuffer: the routine that handles unstuffing a non-zero length file
- * @private: private data for the unstuffer
+ * @page: The (optional) page. This is looked up if the @page is NULL
   *
   * This routine unstuffs a dinode and returns it to a "normal" state such
   * that the height can be grown in the traditional way.
@@ -132,7 +131,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
         if (error)
                 goto out;
  
-       if (ip->i_disksize) {
+       if (i_size_read(&ip->i_inode)) {
                 /* Get a free block, fill it with the stuffed data,
                    and write it out to disk */
  
@@ -161,7 +160,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
         di = (struct gfs2_dinode *)dibh->b_data;
         gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
  
-       if (ip->i_disksize) {
+       if (i_size_read(&ip->i_inode)) {
                 *(__be64 *)(di + 1) = cpu_to_be64(block);
                 gfs2_add_inode_blocks(&ip->i_inode, 1);
                 di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
@@ -884,84 +883,15 @@ out:
         return error;
  }
  
-/**
- * do_grow - Make a file look bigger than it is
- * @ip: the inode
- * @size: the size to set the file to
- *
- * Called with an exclusive lock on @ip.
- *
- * Returns: errno
- */
-
-static int do_grow(struct gfs2_inode *ip, u64 size)
-{
-       struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-       struct gfs2_alloc *al;
-       struct buffer_head *dibh;
-       int error;
-
-       al = gfs2_alloc_get(ip);
-       if (!al)
-               return -ENOMEM;
-
-       error = gfs2_quota_lock_check(ip);
-       if (error)
-               goto out;
-
-       al->al_requested = sdp->sd_max_height + RES_DATA;
-
-       error = gfs2_inplace_reserve(ip);
-       if (error)
-               goto out_gunlock_q;
-
-       error = gfs2_trans_begin(sdp,
-                       sdp->sd_max_height + al->al_rgd->rd_length +
-                       RES_JDATA + RES_DINODE + RES_STATFS + RES_QUOTA, 0);
-       if (error)
-               goto out_ipres;
-
-       error = gfs2_meta_inode_buffer(ip, &dibh);
-       if (error)
-               goto out_end_trans;
-
-       if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
-               if (gfs2_is_stuffed(ip)) {
-                       error = gfs2_unstuff_dinode(ip, NULL);
-                       if (error)
-                               goto out_brelse;
-               }
-       }
-
-       ip->i_disksize = size;
-       ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
-       gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-       gfs2_dinode_out(ip, dibh->b_data);
-
-out_brelse:
-       brelse(dibh);
-out_end_trans:
-       gfs2_trans_end(sdp);
-out_ipres:
-       gfs2_inplace_release(ip);
-out_gunlock_q:
-       gfs2_quota_unlock(ip);
-out:
-       gfs2_alloc_put(ip);
-       return error;
-}
-
-
  /**
   * gfs2_block_truncate_page - Deal with zeroing out data for truncate
   *
   * This is partly borrowed from ext3.
   */
-static int gfs2_block_truncate_page(struct address_space *mapping)
+static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from)
  {
         struct inode *inode = mapping->host;
         struct gfs2_inode *ip = GFS2_I(inode);
-       loff_t from = inode->i_size;
         unsigned long index = from >> PAGE_CACHE_SHIFT;
         unsigned offset = from & (PAGE_CACHE_SIZE-1);
         unsigned blocksize, iblock, length, pos;
@@ -1023,9 +953,11 @@ unlock:
         return err;
  }
  
-static int trunc_start(struct gfs2_inode *ip, u64 size)
+static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize)
  {
-       struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+       struct gfs2_inode *ip = GFS2_I(inode);
+       struct gfs2_sbd *sdp = GFS2_SB(inode);
+       struct address_space *mapping = inode->i_mapping;
         struct buffer_head *dibh;
         int journaled = gfs2_is_jdata(ip);
         int error;
@@ -1039,31 +971,26 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
         if (error)
                 goto out;
  
+       gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+
         if (gfs2_is_stuffed(ip)) {
-               u64 dsize = size + sizeof(struct gfs2_dinode);
-               ip->i_disksize = size;
-               ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
-               gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-               gfs2_dinode_out(ip, dibh->b_data);
-               if (dsize > dibh->b_size)
-                       dsize = dibh->b_size;
-               gfs2_buffer_clear_tail(dibh, dsize);
-               error = 1;
+               gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize);
         } else {
-               if (size & (u64)(sdp->sd_sb.sb_bsize - 1))
-                       error = gfs2_block_truncate_page(ip->i_inode.i_mapping);
-
-               if (!error) {
-                       ip->i_disksize = size;
-                       ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
-                       ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
-                       gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-                       gfs2_dinode_out(ip, dibh->b_data);
+               if (newsize & (u64)(sdp->sd_sb.sb_bsize - 1)) {
+                       error = gfs2_block_truncate_page(mapping, newsize);
+                       if (error)
+                               goto out_brelse;
                 }
+               ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
         }
  
-       brelse(dibh);
+       i_size_write(inode, newsize);
+       ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
+       gfs2_dinode_out(ip, dibh->b_data);
  
+       truncate_pagecache(inode, oldsize, newsize);
+out_brelse:
+       brelse(dibh);
  out:
         gfs2_trans_end(sdp);
         return error;
@@ -1123,7 +1050,7 @@ static int trunc_end(struct gfs2_inode *ip)
         if (error)
                 goto out;
  
-       if (!ip->i_disksize) {
+       if (!i_size_read(&ip->i_inode)) {
                 ip->i_height = 0;
                 ip->i_goal = ip->i_no_addr;
                 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
@@ -1143,92 +1070,154 @@ out:
  
  /**
   * do_shrink - make a file smaller
- * @ip: the inode
- * @size: the size to make the file
- * @truncator: function to truncate the last partial block
+ * @inode: the inode
+ * @oldsize: the current inode size
+ * @newsize: the size to make the file
   *
- * Called with an exclusive lock on @ip.
+ * Called with an exclusive lock on @inode. The @newsize must
+ * be equal to or smaller than the current inode size.
   *
   * Returns: errno
   */
  
-static int do_shrink(struct gfs2_inode *ip, u64 size)
+static int do_shrink(struct inode *inode, u64 oldsize, u64 newsize)
  {
+       struct gfs2_inode *ip = GFS2_I(inode);
         int error;
  
-       error = trunc_start(ip, size);
+       error = trunc_start(inode, oldsize, newsize);
         if (error < 0)
                 return error;
-       if (error > 0)
+       if (gfs2_is_stuffed(ip))
                 return 0;
  
-       error = trunc_dealloc(ip, size);
-       if (!error)
+       error = trunc_dealloc(ip, newsize);
+       if (error == 0)
                 error = trunc_end(ip);
  
         return error;
  }
  
-static int do_touch(struct gfs2_inode *ip, u64 size)
+void gfs2_trim_blocks(struct inode *inode)
  {
-       struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+       u64 size = inode->i_size;
+       int ret;
+
+       ret = do_shrink(inode, size, size);
+       WARN_ON(ret != 0);
+}
+
+/**
+ * do_grow - Touch and update inode size
+ * @inode: The inode
+ * @size: The new size
+ *
+ * This function updates the timestamps on the inode and
+ * may also increase the size of the inode. This function
+ * must not be called with @size any smaller than the current
+ * inode size.
+ *
+ * Although it is not strictly required to unstuff files here,
+ * earlier versions of GFS2 have a bug in the stuffed file reading
+ * code which will result in a buffer overrun if the size is larger
+ * than the max stuffed file size. In order to prevent this from
+ * occurring, such files are unstuffed, but in other cases we can
+ * just update the inode size directly.
+ *
+ * Returns: 0 on success, or -ve on error
+ */
+
+static int do_grow(struct inode *inode, u64 size)
+{
+       struct gfs2_inode *ip = GFS2_I(inode);
+       struct gfs2_sbd *sdp = GFS2_SB(inode);
         struct buffer_head *dibh;
+       struct gfs2_alloc *al = NULL;
         int error;
  
-       error = gfs2_trans_begin(sdp, RES_DINODE, 0);
+       if (gfs2_is_stuffed(ip) &&
+           (size > (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)))) {
+               al = gfs2_alloc_get(ip);
+               if (al == NULL)
+                       return -ENOMEM;
+
+               error = gfs2_quota_lock_check(ip);
+               if (error)
+                       goto do_grow_alloc_put;
+
+               al->al_requested = 1;
+               error = gfs2_inplace_reserve(ip);
+               if (error)
+                       goto do_grow_qunlock;
+       }
+
+       error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT, 0);
         if (error)
-               return error;
+               goto do_grow_release;
  
-       down_write(&ip->i_rw_mutex);
+       if (al) {
+               error = gfs2_unstuff_dinode(ip, NULL);
+               if (error)
+                       goto do_end_trans;
+       }
  
         error = gfs2_meta_inode_buffer(ip, &dibh);
         if (error)
-               goto do_touch_out;
+               goto do_end_trans;
  
+       i_size_write(inode, size);
         ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
         gfs2_trans_add_bh(ip->i_gl, dibh, 1);
         gfs2_dinode_out(ip, dibh->b_data);
         brelse(dibh);
  
-do_touch_out:
-       up_write(&ip->i_rw_mutex);
+do_end_trans:
         gfs2_trans_end(sdp);
+do_grow_release:
+       if (al) {
+               gfs2_inplace_release(ip);
+do_grow_qunlock:
+               gfs2_quota_unlock(ip);
+do_grow_alloc_put:
+               gfs2_alloc_put(ip);
+       }
         return error;
  }
  
  /**
- * gfs2_truncatei - make a file a given size
- * @ip: the inode
- * @size: the size to make the file
- * @truncator: function to truncate the last partial block
+ * gfs2_setattr_size - make a file a given size
+ * @inode: the inode
+ * @newsize: the size to make the file
   *
- * The file size can grow, shrink, or stay the same size.
+ * The file size can grow, shrink, or stay the same size. This
+ * is called holding i_mutex and an exclusive glock on the inode
+ * in question.
   *
   * Returns: errno
   */
  
-int gfs2_truncatei(struct gfs2_inode *ip, u64 size)
+int gfs2_setattr_size(struct inode *inode, u64 newsize)
  {
-       int error;
+       int ret;
+       u64 oldsize;
  
-       if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), S_ISREG(ip->i_inode.i_mode)))
-               return -EINVAL;
+       BUG_ON(!S_ISREG(inode->i_mode));
  
-       if (size > ip->i_disksize)
-               error = do_grow(ip, size);
-       else if (size < ip->i_disksize)
-               error = do_shrink(ip, size);
-       else
-               /* update time stamps */
-               error = do_touch(ip, size);
+       ret = inode_newsize_ok(inode, newsize);
+       if (ret)
+               return ret;
  
-       return error;
+       oldsize = inode->i_size;
+       if (newsize >= oldsize)
+               return do_grow(inode, newsize);
+
+       return do_shrink(inode, oldsize, newsize);
  }
  
  int gfs2_truncatei_resume(struct gfs2_inode *ip)
  {
         int error;
-       error = trunc_dealloc(ip, ip->i_disksize);
+       error = trunc_dealloc(ip, i_size_read(&ip->i_inode));
         if (!error)
                 error = trunc_end(ip);
         return error;
@@ -1269,7 +1258,7 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
  
         shift = sdp->sd_sb.sb_bsize_shift;
         BUG_ON(gfs2_is_dir(ip));
-       end_of_file = (ip->i_disksize + sdp->sd_sb.sb_bsize - 1) >> shift;
+       end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift;
         lblock = offset >> shift;
         lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
         if (lblock_stop > end_of_file)
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h

index a20a521..42fea03 100644 (file)
--- a/fs/gfs2/bmap.h
+++ b/fs/gfs2/bmap.h
@@ -44,14 +44,16 @@ static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip,
         }
  }
  
-int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page);
-int gfs2_block_map(struct inode *inode, sector_t lblock, struct buffer_head *bh, int create);
-int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen);
-
-int gfs2_truncatei(struct gfs2_inode *ip, u64 size);
-int gfs2_truncatei_resume(struct gfs2_inode *ip);
-int gfs2_file_dealloc(struct gfs2_inode *ip);
-int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
-                             unsigned int len);
+extern int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page);
+extern int gfs2_block_map(struct inode *inode, sector_t lblock,
+                         struct buffer_head *bh, int create);
+extern int gfs2_extent_map(struct inode *inode, u64 lblock, int *new,
+                          u64 *dblock, unsigned *extlen);
+extern int gfs2_setattr_size(struct inode *inode, u64 size);
+extern void gfs2_trim_blocks(struct inode *inode);
+extern int gfs2_truncatei_resume(struct gfs2_inode *ip);
+extern int gfs2_file_dealloc(struct gfs2_inode *ip);
+extern int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
+                                    unsigned int len);
  
  #endif /* __BMAP_DOT_H__ */
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c

index bb7907b..6798755 100644 (file)
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -49,7 +49,7 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
                 ip = GFS2_I(inode);
         }
  
-       if (sdp->sd_args.ar_localcaching)
+       if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
                 goto valid;
  
         had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL);
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c

index b9dd88a..5c356d0 100644 (file)
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -79,6 +79,9 @@
  #define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1)
  #define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1))
  
+struct qstr gfs2_qdot __read_mostly;
+struct qstr gfs2_qdotdot __read_mostly;
+
  typedef int (*leaf_call_t) (struct gfs2_inode *dip, u32 index, u32 len,
                             u64 leaf_no, void *data);
  typedef int (*gfs2_dscan_t)(const struct gfs2_dirent *dent,
@@ -127,8 +130,8 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf,
  
         gfs2_trans_add_bh(ip->i_gl, dibh, 1);
         memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size);
-       if (ip->i_disksize < offset + size)
-               ip->i_disksize = offset + size;
+       if (ip->i_inode.i_size < offset + size)
+               i_size_write(&ip->i_inode, offset + size);
         ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
         gfs2_dinode_out(ip, dibh->b_data);
  
@@ -225,8 +228,8 @@ out:
         if (error)
                 return error;
  
-       if (ip->i_disksize < offset + copied)
-               ip->i_disksize = offset + copied;
+       if (ip->i_inode.i_size < offset + copied)
+               i_size_write(&ip->i_inode, offset + copied);
         ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
  
         gfs2_trans_add_bh(ip->i_gl, dibh, 1);
@@ -275,12 +278,13 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset,
         unsigned int o;
         int copied = 0;
         int error = 0;
+       u64 disksize = i_size_read(&ip->i_inode);
  
-       if (offset >= ip->i_disksize)
+       if (offset >= disksize)
                 return 0;
  
-       if (offset + size > ip->i_disksize)
-               size = ip->i_disksize - offset;
+       if (offset + size > disksize)
+               size = disksize - offset;
  
         if (!size)
                 return 0;
@@ -727,7 +731,7 @@ static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode,
                 unsigned hsize = 1 << ip->i_depth;
                 unsigned index;
                 u64 ln;
-               if (hsize * sizeof(u64) != ip->i_disksize) {
+               if (hsize * sizeof(u64) != i_size_read(inode)) {
                         gfs2_consist_inode(ip);
                         return ERR_PTR(-EIO);
                 }
@@ -879,7 +883,7 @@ static int dir_make_exhash(struct inode *inode)
         for (x = sdp->sd_hash_ptrs; x--; lp++)
                 *lp = cpu_to_be64(bn);
  
-       dip->i_disksize = sdp->sd_sb.sb_bsize / 2;
+       i_size_write(inode, sdp->sd_sb.sb_bsize / 2);
         gfs2_add_inode_blocks(&dip->i_inode, 1);
         dip->i_diskflags |= GFS2_DIF_EXHASH;
  
@@ -1057,11 +1061,12 @@ static int dir_double_exhash(struct gfs2_inode *dip)
         u64 *buf;
         u64 *from, *to;
         u64 block;
+       u64 disksize = i_size_read(&dip->i_inode);
         int x;
         int error = 0;
  
         hsize = 1 << dip->i_depth;
-       if (hsize * sizeof(u64) != dip->i_disksize) {
+       if (hsize * sizeof(u64) != disksize) {
                 gfs2_consist_inode(dip);
                 return -EIO;
         }
@@ -1072,7 +1077,7 @@ static int dir_double_exhash(struct gfs2_inode *dip)
         if (!buf)
                 return -ENOMEM;
  
-       for (block = dip->i_disksize >> sdp->sd_hash_bsize_shift; block--;) {
+       for (block = disksize >> sdp->sd_hash_bsize_shift; block--;) {
                 error = gfs2_dir_read_data(dip, (char *)buf,
                                             block * sdp->sd_hash_bsize,
                                             sdp->sd_hash_bsize, 1);
@@ -1370,7 +1375,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
         unsigned depth = 0;
  
         hsize = 1 << dip->i_depth;
-       if (hsize * sizeof(u64) != dip->i_disksize) {
+       if (hsize * sizeof(u64) != i_size_read(inode)) {
                 gfs2_consist_inode(dip);
                 return -EIO;
         }
@@ -1784,7 +1789,7 @@ static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data)
         int error = 0;
  
         hsize = 1 << dip->i_depth;
-       if (hsize * sizeof(u64) != dip->i_disksize) {
+       if (hsize * sizeof(u64) != i_size_read(&dip->i_inode)) {
                 gfs2_consist_inode(dip);
                 return -EIO;
         }
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h

index 4f91944..a98f644 100644 (file)
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -17,23 +17,24 @@ struct inode;
  struct gfs2_inode;
  struct gfs2_inum;
  
-struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *filename);
-int gfs2_dir_check(struct inode *dir, const struct qstr *filename,
-                  const struct gfs2_inode *ip);
-int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
-                const struct gfs2_inode *ip, unsigned int type);
-int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename);
-int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
-                 filldir_t filldir);
-int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
-                  const struct gfs2_inode *nip, unsigned int new_type);
+extern struct inode *gfs2_dir_search(struct inode *dir,
+                                    const struct qstr *filename);
+extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename,
+                         const struct gfs2_inode *ip);
+extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
+                       const struct gfs2_inode *ip, unsigned int type);
+extern int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename);
+extern int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
+                        filldir_t filldir);
+extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
+                         const struct gfs2_inode *nip, unsigned int new_type);
  
-int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip);
+extern int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip);
  
-int gfs2_diradd_alloc_required(struct inode *dir,
-                              const struct qstr *filename);
-int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
-                           struct buffer_head **bhp);
+extern int gfs2_diradd_alloc_required(struct inode *dir,
+                                     const struct qstr *filename);
+extern int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
+                                  struct buffer_head **bhp);
  
  static inline u32 gfs2_disk_hash(const char *data, int len)
  {
@@ -61,4 +62,7 @@ static inline void gfs2_qstr2dirent(const struct qstr *name, u16 reclen, struct
         memcpy(dent + 1, name->name, name->len);
  }
  
+extern struct qstr gfs2_qdot;
+extern struct qstr gfs2_qdotdot;
+
  #endif /* __DIR_DOT_H__ */
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c

index dfe237a..06d5827 100644 (file)
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -126,16 +126,9 @@ static int gfs2_get_name(struct dentry *parent, char *name,
  
  static struct dentry *gfs2_get_parent(struct dentry *child)
  {
-       struct qstr dotdot;
         struct dentry *dentry;
  
-       /*
-        * XXX(hch): it would be a good idea to keep this around as a
-        *           static variable.
-        */
-       gfs2_str2qstr(&dotdot, "..");
-
-       dentry = d_obtain_alias(gfs2_lookupi(child->d_inode, &dotdot, 1));
+       dentry = d_obtain_alias(gfs2_lookupi(child->d_inode, &gfs2_qdotdot, 1));
         if (!IS_ERR(dentry))
                 dentry->d_op = &gfs2_dops;
         return dentry;
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c

index 4edd662..237ee6a 100644 (file)
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -382,8 +382,10 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
         rblocks = RES_DINODE + ind_blocks;
         if (gfs2_is_jdata(ip))
                 rblocks += data_blocks ? data_blocks : 1;
-       if (ind_blocks || data_blocks)
+       if (ind_blocks || data_blocks) {
                 rblocks += RES_STATFS + RES_QUOTA;
+               rblocks += gfs2_rg_blocks(al);
+       }
         ret = gfs2_trans_begin(sdp, rblocks, 0);
         if (ret)
                 goto out_trans_fail;
@@ -491,7 +493,7 @@ static int gfs2_open(struct inode *inode, struct file *file)
                         goto fail;
  
                 if (!(file->f_flags & O_LARGEFILE) &&
-                   ip->i_disksize > MAX_NON_LFS) {
+                   i_size_read(inode) > MAX_NON_LFS) {
                         error = -EOVERFLOW;
                         goto fail_gunlock;
                 }
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c

index 9adf8f9..8777885 100644 (file)
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -441,6 +441,8 @@ static void state_change(struct gfs2_glock *gl, unsigned int new_state)
                 else
                         gfs2_glock_put_nolock(gl);
         }
+       if (held1 && held2 && list_empty(&gl->gl_holders))
+               clear_bit(GLF_QUEUED, &gl->gl_flags);
  
         gl->gl_state = new_state;
         gl->gl_tchange = jiffies;
@@ -1012,6 +1014,7 @@ fail:
                 if (unlikely((gh->gh_flags & LM_FLAG_PRIORITY) && !insert_pt))
                         insert_pt = &gh2->gh_list;
         }
+       set_bit(GLF_QUEUED, &gl->gl_flags);
         if (likely(insert_pt == NULL)) {
                 list_add_tail(&gh->gh_list, &gl->gl_holders);
                 if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY))
@@ -1310,10 +1313,12 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
  
         gfs2_glock_hold(gl);
         holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time;
-       if (time_before(now, holdtime))
-               delay = holdtime - now;
-       if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags))
-               delay = gl->gl_ops->go_min_hold_time;
+       if (test_bit(GLF_QUEUED, &gl->gl_flags)) {
+               if (time_before(now, holdtime))
+                       delay = holdtime - now;
+               if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags))
+                       delay = gl->gl_ops->go_min_hold_time;
+       }
  
         spin_lock(&gl->gl_spin);
         handle_callback(gl, state, delay);
@@ -1512,7 +1517,7 @@ static void clear_glock(struct gfs2_glock *gl)
         spin_unlock(&lru_lock);
  
         spin_lock(&gl->gl_spin);
-       if (find_first_holder(gl) == NULL && gl->gl_state != LM_ST_UNLOCKED)
+       if (gl->gl_state != LM_ST_UNLOCKED)
                 handle_callback(gl, LM_ST_UNLOCKED, 0);
         spin_unlock(&gl->gl_spin);
         gfs2_glock_hold(gl);
@@ -1660,6 +1665,8 @@ static const char *gflags2str(char *buf, const unsigned long *gflags)
                 *p++ = 'I';
         if (test_bit(GLF_FROZEN, gflags))
                 *p++ = 'F';
+       if (test_bit(GLF_QUEUED, gflags))
+               *p++ = 'q';
         *p = 0;
         return buf;
  }
@@ -1776,10 +1783,12 @@ int __init gfs2_glock_init(void)
         }
  #endif
  
-       glock_workqueue = create_workqueue("glock_workqueue");
+       glock_workqueue = alloc_workqueue("glock_workqueue", WQ_RESCUER |
+                                         WQ_HIGHPRI | WQ_FREEZEABLE, 0);
         if (IS_ERR(glock_workqueue))
                 return PTR_ERR(glock_workqueue);
-       gfs2_delete_workqueue = create_workqueue("delete_workqueue");
+       gfs2_delete_workqueue = alloc_workqueue("delete_workqueue", WQ_RESCUER |
+                                               WQ_FREEZEABLE, 0);
         if (IS_ERR(gfs2_delete_workqueue)) {
                 destroy_workqueue(glock_workqueue);
                 return PTR_ERR(gfs2_delete_workqueue);
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h

index 2bda191..db1c26d 100644 (file)
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -215,7 +215,7 @@ void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
  void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
  
  /**
- * gfs2_glock_nq_init - intialize a holder and enqueue it on a glock
+ * gfs2_glock_nq_init - initialize a holder and enqueue it on a glock
   * @gl: the glock
   * @state: the state we're requesting
   * @flags: the modifier flags
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c

index 49f97d3..0d149dc 100644 (file)
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -262,13 +262,12 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
         const struct gfs2_inode *ip = gl->gl_object;
         if (ip == NULL)
                 return 0;
-       gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu/%llu\n",
+       gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu\n",
                   (unsigned long long)ip->i_no_formal_ino,
                   (unsigned long long)ip->i_no_addr,
                   IF2DT(ip->i_inode.i_mode), ip->i_flags,
                   (unsigned int)ip->i_diskflags,
-                 (unsigned long long)ip->i_inode.i_size,
-                 (unsigned long long)ip->i_disksize);
+                 (unsigned long long)i_size_read(&ip->i_inode));
         return 0;
  }
  
@@ -453,7 +452,6 @@ const struct gfs2_glock_operations *gfs2_glops_list[] = {
         [LM_TYPE_META] = &gfs2_meta_glops,
         [LM_TYPE_INODE] = &gfs2_inode_glops,
         [LM_TYPE_RGRP] = &gfs2_rgrp_glops,
-       [LM_TYPE_NONDISK] = &gfs2_trans_glops,
         [LM_TYPE_IOPEN] = &gfs2_iopen_glops,
         [LM_TYPE_FLOCK] = &gfs2_flock_glops,
         [LM_TYPE_NONDISK] = &gfs2_nondisk_glops,
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h

index fdbf4b3..764fbb4 100644 (file)
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -196,6 +196,7 @@ enum {
         GLF_REPLY_PENDING               = 9,
         GLF_INITIAL                     = 10,
         GLF_FROZEN                      = 11,
+       GLF_QUEUED                      = 12,
  };
  
  struct gfs2_glock {
@@ -267,7 +268,6 @@ struct gfs2_inode {
         u64 i_no_formal_ino;
         u64 i_generation;
         u64 i_eattr;
-       loff_t i_disksize;
         unsigned long i_flags;          /* GIF_... */
         struct gfs2_glock *i_gl; /* Move into i_gh? */
         struct gfs2_holder i_iopen_gh;
@@ -416,11 +416,8 @@ struct gfs2_args {
         char ar_locktable[GFS2_LOCKNAME_LEN];   /* Name of the Lock Table */
         char ar_hostdata[GFS2_LOCKNAME_LEN];    /* Host specific data */
         unsigned int ar_spectator:1;            /* Don't get a journal */
-       unsigned int ar_ignore_local_fs:1;      /* Ignore optimisations */
         unsigned int ar_localflocks:1;          /* Let the VFS do flock|fcntl */
-       unsigned int ar_localcaching:1;         /* Local caching */
         unsigned int ar_debug:1;                /* Oops on errors */
-       unsigned int ar_upgrade:1;              /* Upgrade ondisk format */
         unsigned int ar_posix_acl:1;            /* Enable posix acls */
         unsigned int ar_quota:2;                /* off/account/on */
         unsigned int ar_suiddir:1;              /* suiddir support */
@@ -497,7 +494,7 @@ struct gfs2_sb_host {
   */
  
  struct lm_lockstruct {
-       unsigned int ls_jid;
+       int ls_jid;
         unsigned int ls_first;
         unsigned int ls_first_done;
         unsigned int ls_nodir;
@@ -572,6 +569,7 @@ struct gfs2_sbd {
         struct list_head sd_rindex_mru_list;
         struct gfs2_rgrpd *sd_rindex_forward;
         unsigned int sd_rgrps;
+       unsigned int sd_max_rg_data;
  
         /* Journal index stuff */
  
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c

index 08140f1..06370f8 100644 (file)
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -359,8 +359,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
          * to do that.
          */
         ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink);
-       ip->i_disksize = be64_to_cpu(str->di_size);
-       i_size_write(&ip->i_inode, ip->i_disksize);
+       i_size_write(&ip->i_inode, be64_to_cpu(str->di_size));
         gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks));
         atime.tv_sec = be64_to_cpu(str->di_atime);
         atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
@@ -1055,7 +1054,7 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
         str->di_uid = cpu_to_be32(ip->i_inode.i_uid);
         str->di_gid = cpu_to_be32(ip->i_inode.i_gid);
         str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink);
-       str->di_size = cpu_to_be64(ip->i_disksize);
+       str->di_size = cpu_to_be64(i_size_read(&ip->i_inode));
         str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
         str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec);
         str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec);
@@ -1085,8 +1084,8 @@ void gfs2_dinode_print(const struct gfs2_inode *ip)
                (unsigned long long)ip->i_no_formal_ino);
         printk(KERN_INFO "  no_addr = %llu\n",
                (unsigned long long)ip->i_no_addr);
-       printk(KERN_INFO "  i_disksize = %llu\n",
-              (unsigned long long)ip->i_disksize);
+       printk(KERN_INFO "  i_size = %llu\n",
+              (unsigned long long)i_size_read(&ip->i_inode));
         printk(KERN_INFO "  blocks = %llu\n",
                (unsigned long long)gfs2_get_inode_blocks(&ip->i_inode));
         printk(KERN_INFO "  i_goal = %llu\n",
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h

index 300ada3..6720d7d 100644 (file)
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -19,6 +19,8 @@ extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask);
  extern int gfs2_internal_read(struct gfs2_inode *ip,
                               struct file_ra_state *ra_state,
                               char *buf, loff_t *pos, unsigned size);
+extern void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
+                                  unsigned int from, unsigned int to);
  extern void gfs2_set_aops(struct inode *inode);
  
  static inline int gfs2_is_stuffed(const struct gfs2_inode *ip)
@@ -80,6 +82,19 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip,
         dent->de_inum.no_addr = cpu_to_be64(ip->i_no_addr);
  }
  
+/* Validate an internal file's size: within [minsize, maxsize] and a multiple of 1 << i_blkbits. */
+static inline int gfs2_check_internal_file_size(struct inode *inode,
+                                               u64 minsize, u64 maxsize)
+{
+       const u64 size = i_size_read(inode);
+       const int in_range = (size >= minsize) && (size <= maxsize);
+
+       if (in_range && !(size & ((1 << inode->i_blkbits) - 1)))
+               return 0;
+       /* Out of range or unaligned: report through gfs2_consist_inode() and fail */
+       gfs2_consist_inode(GFS2_I(inode));
+       return -EIO;
+}
  
  extern void gfs2_set_iop(struct inode *inode);
  extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c

index 0e0470e..1c09425 100644 (file)
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -42,9 +42,9 @@ static void gdlm_ast(void *arg)
                 ret |= LM_OUT_CANCELED;
                 goto out;
         case -EAGAIN: /* Try lock fails */
+       case -EDEADLK: /* Deadlock detected */
                 goto out;
-       case -EINVAL: /* Invalid */
-       case -ENOMEM: /* Out of memory */
+       case -ETIMEDOUT: /* Canceled due to timeout */
                 ret |= LM_OUT_ERROR;
                 goto out;
         case 0: /* Success */
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c

index b1e9630..d7eb1e2 100644 (file)
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -24,6 +24,7 @@
  #include "glock.h"
  #include "quota.h"
  #include "recovery.h"
+#include "dir.h"
  
  static struct shrinker qd_shrinker = {
         .shrink = gfs2_shrink_qd_memory,
@@ -78,6 +79,9 @@ static int __init init_gfs2_fs(void)
  {
         int error;
  
+       gfs2_str2qstr(&gfs2_qdot, ".");
+       gfs2_str2qstr(&gfs2_qdotdot, "..");
+
         error = gfs2_sys_init();
         if (error)
                 return error;
@@ -140,7 +144,7 @@ static int __init init_gfs2_fs(void)
  
         error = -ENOMEM;
         gfs_recovery_wq = alloc_workqueue("gfs_recovery",
-                                         WQ_NON_REENTRANT | WQ_RESCUER, 0);
+                                         WQ_RESCUER | WQ_FREEZEABLE, 0);
         if (!gfs_recovery_wq)
                 goto fail_wq;
  
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c

index 4d4b1e8..aeafc23 100644 (file)
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -38,14 +38,6 @@
  #define DO 0
  #define UNDO 1
  
-static const u32 gfs2_old_fs_formats[] = {
-        0
-};
-
-static const u32 gfs2_old_multihost_formats[] = {
-        0
-};
-
  /**
   * gfs2_tune_init - Fill a gfs2_tune structure with default values
   * @gt: tune
@@ -135,8 +127,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
  
  static int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent)
  {
-       unsigned int x;
-
         if (sb->sb_magic != GFS2_MAGIC ||
             sb->sb_type != GFS2_METATYPE_SB) {
                 if (!silent)
@@ -150,55 +140,9 @@ static int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int sile
             sb->sb_multihost_format == GFS2_FORMAT_MULTI)
                 return 0;
  
-       if (sb->sb_fs_format != GFS2_FORMAT_FS) {
-               for (x = 0; gfs2_old_fs_formats[x]; x++)
-                       if (gfs2_old_fs_formats[x] == sb->sb_fs_format)
-                               break;
+       fs_warn(sdp, "Unknown on-disk format, unable to mount\n");
  
-               if (!gfs2_old_fs_formats[x]) {
-                       printk(KERN_WARNING
-                              "GFS2: code version (%u, %u) is incompatible "
-                              "with ondisk format (%u, %u)\n",
-                              GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
-                              sb->sb_fs_format, sb->sb_multihost_format);
-                       printk(KERN_WARNING
-                              "GFS2: I don't know how to upgrade this FS\n");
-                       return -EINVAL;
-               }
-       }
-
-       if (sb->sb_multihost_format != GFS2_FORMAT_MULTI) {
-               for (x = 0; gfs2_old_multihost_formats[x]; x++)
-                       if (gfs2_old_multihost_formats[x] ==
-                           sb->sb_multihost_format)
-                               break;
-
-               if (!gfs2_old_multihost_formats[x]) {
-                       printk(KERN_WARNING
-                              "GFS2: code version (%u, %u) is incompatible "
-                              "with ondisk format (%u, %u)\n",
-                              GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
-                              sb->sb_fs_format, sb->sb_multihost_format);
-                       printk(KERN_WARNING
-                              "GFS2: I don't know how to upgrade this FS\n");
-                       return -EINVAL;
-               }
-       }
-
-       if (!sdp->sd_args.ar_upgrade) {
-               printk(KERN_WARNING
-                      "GFS2: code version (%u, %u) is incompatible "
-                      "with ondisk format (%u, %u)\n",
-                      GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
-                      sb->sb_fs_format, sb->sb_multihost_format);
-               printk(KERN_INFO
-                      "GFS2: Use the \"upgrade\" mount option to upgrade "
-                      "the FS\n");
-               printk(KERN_INFO "GFS2: See the manual for more details\n");
-               return -EINVAL;
-       }
-
-       return 0;
+       return -EINVAL;
  }
  
  static void end_bio_io_page(struct bio *bio, int error)
@@ -586,7 +530,7 @@ static int map_journal_extents(struct gfs2_sbd *sdp)
  
         prev_db = 0;
  
-       for (lb = 0; lb < ip->i_disksize >> sdp->sd_sb.sb_bsize_shift; lb++) {
+       for (lb = 0; lb < i_size_read(jd->jd_inode) >> sdp->sd_sb.sb_bsize_shift; lb++) {
                 bh.b_state = 0;
                 bh.b_blocknr = 0;
                 bh.b_size = 1 << ip->i_inode.i_blkbits;
@@ -1022,7 +966,6 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
         if (!strcmp("lock_nolock", proto)) {
                 lm = &nolock_ops;
                 sdp->sd_args.ar_localflocks = 1;
-               sdp->sd_args.ar_localcaching = 1;
  #ifdef CONFIG_GFS2_FS_LOCKING_DLM
         } else if (!strcmp("lock_dlm", proto)) {
                 lm = &gfs2_dlm_ops;
@@ -1113,8 +1056,6 @@ static int gfs2_journalid_wait(void *word)
  
  static int wait_on_journal(struct gfs2_sbd *sdp)
  {
-       if (sdp->sd_args.ar_spectator)
-               return 0;
         if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
                 return 0;
  
@@ -1217,6 +1158,20 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
         if (error)
                 goto fail_sb;
  
+       /*
+        * If user space has failed to join the cluster or some similar
+        * failure has occurred, then the journal id will contain a
+        * negative (error) number. This will then be returned to the
+        * caller (of the mount syscall). We do this even for spectator
+        * mounts (which just write a jid of 0 to indicate "ok" even though
+        * the jid is unused in the spectator case)
+        */
+       if (sdp->sd_lockstruct.ls_jid < 0) {
+               error = sdp->sd_lockstruct.ls_jid;
+               sdp->sd_lockstruct.ls_jid = 0;
+               goto fail_sb;
+       }
+
         error = init_inodes(sdp, DO);
         if (error)
                 goto fail_sb;
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c

index 1009be2..0534510 100644 (file)
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -18,6 +18,8 @@
  #include <linux/gfs2_ondisk.h>
  #include <linux/crc32.h>
  #include <linux/fiemap.h>
+#include <linux/swap.h>
+#include <linux/falloc.h>
  #include <asm/uaccess.h>
  
  #include "gfs2.h"
@@ -217,7 +219,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
                         goto out_gunlock_q;
  
                 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
-                                        al->al_rgd->rd_length +
+                                        gfs2_rg_blocks(al) +
                                          2 * RES_DINODE + RES_STATFS +
                                          RES_QUOTA, 0);
                 if (error)
@@ -406,7 +408,6 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
  
         ip = ghs[1].gh_gl->gl_object;
  
-       ip->i_disksize = size;
         i_size_write(inode, size);
  
         error = gfs2_meta_inode_buffer(ip, &dibh);
@@ -461,7 +462,7 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
         ip = ghs[1].gh_gl->gl_object;
  
         ip->i_inode.i_nlink = 2;
-       ip->i_disksize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode);
+       i_size_write(inode, sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode));
         ip->i_diskflags |= GFS2_DIF_JDATA;
         ip->i_entries = 2;
  
@@ -470,18 +471,15 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
         if (!gfs2_assert_withdraw(sdp, !error)) {
                 struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data;
                 struct gfs2_dirent *dent = (struct gfs2_dirent *)(di+1);
-               struct qstr str;
  
-               gfs2_str2qstr(&str, ".");
                 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-               gfs2_qstr2dirent(&str, GFS2_DIRENT_SIZE(str.len), dent);
+               gfs2_qstr2dirent(&gfs2_qdot, GFS2_DIRENT_SIZE(gfs2_qdot.len), dent);
                 dent->de_inum = di->di_num; /* already GFS2 endian */
                 dent->de_type = cpu_to_be16(DT_DIR);
                 di->di_entries = cpu_to_be32(1);
  
-               gfs2_str2qstr(&str, "..");
                 dent = (struct gfs2_dirent *)((char*)dent + GFS2_DIRENT_SIZE(1));
-               gfs2_qstr2dirent(&str, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent);
+               gfs2_qstr2dirent(&gfs2_qdotdot, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent);
  
                 gfs2_inum_out(dip, dent);
                 dent->de_type = cpu_to_be16(DT_DIR);
@@ -522,7 +520,6 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
  static int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
                        struct gfs2_inode *ip)
  {
-       struct qstr dotname;
         int error;
  
         if (ip->i_entries != 2) {
@@ -539,13 +536,11 @@ static int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
         if (error)
                 return error;
  
-       gfs2_str2qstr(&dotname, ".");
-       error = gfs2_dir_del(ip, &dotname);
+       error = gfs2_dir_del(ip, &gfs2_qdot);
         if (error)
                 return error;
  
-       gfs2_str2qstr(&dotname, "..");
-       error = gfs2_dir_del(ip, &dotname);
+       error = gfs2_dir_del(ip, &gfs2_qdotdot);
         if (error)
                 return error;
  
@@ -694,11 +689,8 @@ static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
         struct inode *dir = &to->i_inode;
         struct super_block *sb = dir->i_sb;
         struct inode *tmp;
-       struct qstr dotdot;
         int error = 0;
  
-       gfs2_str2qstr(&dotdot, "..");
-
         igrab(dir);
  
         for (;;) {
@@ -711,7 +703,7 @@ static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
                         break;
                 }
  
-               tmp = gfs2_lookupi(dir, &dotdot, 1);
+               tmp = gfs2_lookupi(dir, &gfs2_qdotdot, 1);
                 if (IS_ERR(tmp)) {
                         error = PTR_ERR(tmp);
                         break;
@@ -744,7 +736,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
         struct gfs2_inode *ip = GFS2_I(odentry->d_inode);
         struct gfs2_inode *nip = NULL;
         struct gfs2_sbd *sdp = GFS2_SB(odir);
-       struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, };
+       struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }, ri_gh;
         struct gfs2_rgrpd *nrgd;
         unsigned int num_gh;
         int dir_rename = 0;
@@ -758,6 +750,9 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
                         return 0;
         }
  
+       error = gfs2_rindex_hold(sdp, &ri_gh);
+       if (error)
+               return error;
  
         if (odip != ndip) {
                 error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE,
@@ -887,12 +882,12 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
  
                 al->al_requested = sdp->sd_max_dirres;
  
-               error = gfs2_inplace_reserve(ndip);
+               error = gfs2_inplace_reserve_ri(ndip);
                 if (error)
                         goto out_gunlock_q;
  
                 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
-                                        al->al_rgd->rd_length +
+                                        gfs2_rg_blocks(al) +
                                          4 * RES_DINODE + 4 * RES_LEAF +
                                          RES_STATFS + RES_QUOTA + 4, 0);
                 if (error)
@@ -920,9 +915,6 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
         }
  
         if (dir_rename) {
-               struct qstr name;
-               gfs2_str2qstr(&name, "..");
-
                 error = gfs2_change_nlink(ndip, +1);
                 if (error)
                         goto out_end_trans;
@@ -930,7 +922,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
                 if (error)
                         goto out_end_trans;
  
-               error = gfs2_dir_mvino(ip, &name, ndip, DT_DIR);
+               error = gfs2_dir_mvino(ip, &gfs2_qdotdot, ndip, DT_DIR);
                 if (error)
                         goto out_end_trans;
         } else {
@@ -972,6 +964,7 @@ out_gunlock_r:
         if (r_gh.gh_gl)
                 gfs2_glock_dq_uninit(&r_gh);
  out:
+       gfs2_glock_dq_uninit(&ri_gh);
         return error;
  }
  
@@ -990,7 +983,7 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
         struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
         struct gfs2_holder i_gh;
         struct buffer_head *dibh;
-       unsigned int x;
+       unsigned int x, size;
         char *buf;
         int error;
  
@@ -1002,7 +995,8 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
                 return NULL;
         }
  
-       if (!ip->i_disksize) {
+       size = (unsigned int)i_size_read(&ip->i_inode);
+       if (size == 0) {
                 gfs2_consist_inode(ip);
                 buf = ERR_PTR(-EIO);
                 goto out;
@@ -1014,7 +1008,7 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
                 goto out;
         }
  
-       x = ip->i_disksize + 1;
+       x = size + 1;
         buf = kmalloc(x, GFP_NOFS);
         if (!buf)
                 buf = ERR_PTR(-ENOMEM);
@@ -1071,30 +1065,6 @@ int gfs2_permission(struct inode *inode, int mask)
         return error;
  }
  
-/*
- * XXX(truncate): the truncate_setsize calls should be moved to the end.
- */
-static int setattr_size(struct inode *inode, struct iattr *attr)
-{
-       struct gfs2_inode *ip = GFS2_I(inode);
-       struct gfs2_sbd *sdp = GFS2_SB(inode);
-       int error;
-
-       if (attr->ia_size != ip->i_disksize) {
-               error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
-               if (error)
-                       return error;
-               truncate_setsize(inode, attr->ia_size);
-               gfs2_trans_end(sdp);
-       }
-
-       error = gfs2_truncatei(ip, attr->ia_size);
-       if (error && (inode->i_size != ip->i_disksize))
-               i_size_write(inode, ip->i_disksize);
-
-       return error;
-}
-
  static int setattr_chown(struct inode *inode, struct iattr *attr)
  {
         struct gfs2_inode *ip = GFS2_I(inode);
@@ -1195,7 +1165,7 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
                 goto out;
  
         if (attr->ia_valid & ATTR_SIZE)
-               error = setattr_size(inode, attr);
+               error = gfs2_setattr_size(inode, attr->ia_size);
         else if (attr->ia_valid & (ATTR_UID | ATTR_GID))
                 error = setattr_chown(inode, attr);
         else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode))
@@ -1301,6 +1271,257 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name)
         return ret;
  }
  
+/* Finish a zero-fill "write" of [from, to) on @page; callers run block_prepare_write() first. */
+static void empty_write_end(struct page *page, unsigned from, unsigned to)
+{
+       struct inode *host = page->mapping->host;
+       struct gfs2_inode *gip = GFS2_I(host);
+
+       page_zero_new_buffers(page, from, to);
+       flush_dcache_page(page);
+       mark_page_accessed(page);
+       /* Inodes that are not gfs2_is_writeback() also go through gfs2_page_add_databufs() */
+       if (!gfs2_is_writeback(gip))
+               gfs2_page_add_databufs(gip, page, from, to);
+       block_commit_write(page, from, to);
+}
+
+
+/* Walk @page's buffers and run block_prepare_write() + empty_write_end() on every
+ * run of unmapped buffers inside [from, to). Returns 0 or block_prepare_write()'s error. */
+static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
+{
+       unsigned start, end, next;
+       struct buffer_head *bh;
+       int error;
+
+       if (!page_has_buffers(page)) {  /* nothing attached: do the whole range at once */
+               error = block_prepare_write(page, from, to, gfs2_block_map);
+               if (unlikely(error))
+                       return error;
+               empty_write_end(page, from, to);
+               return 0;
+       }
+
+       /* Advance to the first buffer that starts at or after 'from' */
+       bh = page_buffers(page);
+       next = end = 0;
+       while (next < from) {
+               next += bh->b_size;
+               bh = bh->b_this_page;
+       }
+       start = next;
+       do {
+               next += bh->b_size;
+               if (buffer_mapped(bh)) {
+                       if (end) {      /* a pending unmapped run [start, end) stops here */
+                               error = block_prepare_write(page, start, end,
+                                                           gfs2_block_map);
+                               if (unlikely(error))
+                                       return error;
+                               empty_write_end(page, start, end);
+                               end = 0;
+                       }
+                       start = next;
+               } else
+                       end = next;
+               bh = bh->b_this_page;
+       } while (next < to);
+       /* Flush a run that was still open when the range ended */
+       if (end) {
+               error = block_prepare_write(page, start, end, gfs2_block_map);
+               if (unlikely(error))
+                       return error;
+               empty_write_end(page, start, end);
+       }
+
+       return 0;
+}
+
+/*
+ * fallocate_chunk - run write_empty_blocks() over every page cache page
+ * covering [offset, offset + len), growing i_size unless @mode has
+ * FALLOC_FL_KEEP_SIZE. Returns 0 or a negative errno.
+ */
+static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
+                          int mode)
+{
+       struct gfs2_inode *ip = GFS2_I(inode);
+       struct buffer_head *dibh;
+       int error;
+       u64 start = offset >> PAGE_CACHE_SHIFT;
+       unsigned int start_offset = offset & ~PAGE_CACHE_MASK;
+       u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT;
+       pgoff_t curr;
+       struct page *page;
+       unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK;
+       unsigned int from, to;
+
+       /* A range ending on a page boundary uses the whole of its last page */
+       if (!end_offset)
+               end_offset = PAGE_CACHE_SIZE;
+
+       error = gfs2_meta_inode_buffer(ip, &dibh);
+       if (unlikely(error))
+               return error;
+       gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+       if (gfs2_is_stuffed(ip)) {
+               error = gfs2_unstuff_dinode(ip, NULL);
+               if (unlikely(error))
+                       goto out;
+       }
+
+       curr = start;
+       offset = start << PAGE_CACHE_SHIFT;
+       from = start_offset;
+       to = PAGE_CACHE_SIZE;
+       while (curr <= end) {
+               page = grab_cache_page_write_begin(inode->i_mapping, curr,
+                                                  AOP_FLAG_NOFS);
+               if (unlikely(!page)) {
+                       error = -ENOMEM;
+                       goto out;
+               }
+               if (curr == end)
+                       to = end_offset;
+               error = write_empty_blocks(page, from, to);
+               if (!error && offset + to > inode->i_size &&
+                   !(mode & FALLOC_FL_KEEP_SIZE)) {
+                       i_size_write(inode, offset + to);
+               }
+               unlock_page(page);
+               page_cache_release(page);
+               if (error)
+                       goto out;
+               curr++;
+               offset += PAGE_CACHE_SIZE;
+               from = 0;       /* pages after the first start at byte 0 */
+       }
+       gfs2_dinode_out(ip, dibh->b_data);
+       mark_inode_dirty(inode);
+out:    /* reached on success and on error: dibh is always released */
+       brelse(dibh);
+       return error;
+}
+
+/* Enlarge *data_blocks, *ind_blocks and *len to what al_rgd->rd_free_clone permits, capped at @max. */
+static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len,
+                           unsigned int *data_blocks, unsigned int *ind_blocks)
+{
+       const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+       unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone;
+       unsigned int level, max_data = max_blocks - 3 * (sdp->sd_max_height - 1);
+
+       /* Per pass: shrink 'level' by a factor of sd_inptrs and deduct it from max_data */
+       for (level = max_data; level > sdp->sd_diptrs; max_data -= level)
+               level = DIV_ROUND_UP(level, sdp->sd_inptrs);
+       /* This is not the exact inverse of gfs2_write_calc_reserv, so it may
+          yield fewer data blocks; only act when it yields more */
+       if (max_data > *data_blocks) {
+               *data_blocks = max_data;
+               *ind_blocks = max_blocks - max_data;
+               *len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift;
+               if (*len > max) {
+                       *len = max;
+                       gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks);
+               }
+       }
+}
+
+/*
+ * gfs2_fallocate - round [offset, offset + len) out to block boundaries,
+ * then reserve space and call fallocate_chunk() piece by piece while
+ * holding the inode glock in LM_ST_EXCLUSIVE. Returns 0 or a negative errno.
+ */
+static long gfs2_fallocate(struct inode *inode, int mode, loff_t offset,
+                          loff_t len)
+{
+       struct gfs2_sbd *sdp = GFS2_SB(inode);
+       struct gfs2_inode *ip = GFS2_I(inode);
+       unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
+       loff_t bytes, max_bytes;
+       struct gfs2_alloc *al;
+       int error;
+       loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift;
+       next = (next + 1) << sdp->sd_sb.sb_bsize_shift;
+       /* next is now the range end rounded up; round the start down to match */
+       offset = (offset >> sdp->sd_sb.sb_bsize_shift) <<
+                sdp->sd_sb.sb_bsize_shift;
+       len = next - offset;
+       /* NOTE(review): sd_max_rg_data is unsigned int; confirm the product below cannot wrap */
+       bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2;
+       if (!bytes)
+               bytes = UINT_MAX;
+       /* Hold the inode glock exclusively until out_unlock */
+       gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
+       error = gfs2_glock_nq(&ip->i_gh);
+       if (unlikely(error))
+               goto out_uninit;
+
+       if (!gfs2_write_alloc_required(ip, offset, len))
+               goto out_unlock;        /* error is still 0 here */
+
+       while (len > 0) {
+               if (len < bytes)
+                       bytes = len;
+               al = gfs2_alloc_get(ip);
+               if (!al) {
+                       error = -ENOMEM;
+                       goto out_unlock;
+               }
+
+               error = gfs2_quota_lock_check(ip);
+               if (error)
+                       goto out_alloc_put;
+               /* On -ENOSPC the chunk is halved and retried while it is still larger than sb_bsize */
+retry:
+               gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks);
+               al->al_requested = data_blocks + ind_blocks;
+               error = gfs2_inplace_reserve(ip);
+               if (error) {
+                       if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) {
+                               bytes >>= 1;
+                               goto retry;
+                       }
+                       goto out_qunlock;
+               }
+               max_bytes = bytes;
+               calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks);
+               al->al_requested = data_blocks + ind_blocks;
+               /* Size the transaction; gfs2_is_jdata() inodes add data_blocks (at least 1) on top */
+               rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA +
+                         RES_RG_HDR + gfs2_rg_blocks(al);
+               if (gfs2_is_jdata(ip))
+                       rblocks += data_blocks ? data_blocks : 1;
+               error = gfs2_trans_begin(sdp, rblocks,
+                                        PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
+               if (error)
+                       goto out_trans_fail;
+               error = fallocate_chunk(inode, offset, max_bytes, mode);
+               gfs2_trans_end(sdp);
+               if (error)
+                       goto out_trans_fail;
+               len -= max_bytes;
+               offset += max_bytes;
+               gfs2_inplace_release(ip);
+               gfs2_quota_unlock(ip);
+               gfs2_alloc_put(ip);
+       }
+       goto out_unlock;
+
+out_trans_fail:
+       gfs2_inplace_release(ip);
+out_qunlock:
+       gfs2_quota_unlock(ip);
+out_alloc_put:
+       gfs2_alloc_put(ip);
+out_unlock:
+       gfs2_glock_dq(&ip->i_gh);
+out_uninit:
+       gfs2_holder_uninit(&ip->i_gh);
+       return error;
+}
+
+
  static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                        u64 start, u64 len)
  {
@@ -1351,6 +1572,7 @@ const struct inode_operations gfs2_file_iops = {
         .getxattr = gfs2_getxattr,
         .listxattr = gfs2_listxattr,
         .removexattr = gfs2_removexattr,
+       .fallocate = gfs2_fallocate,
         .fiemap = gfs2_fiemap,
  };
  
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c

index 1bc6b56..58a9b99 100644 (file)
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -735,10 +735,8 @@ get_a_page:
                 goto out;
  
         size = loc + sizeof(struct gfs2_quota);
-       if (size > inode->i_size) {
-               ip->i_disksize = size;
+       if (size > inode->i_size)
                 i_size_write(inode, size);
-       }
         inode->i_mtime = inode->i_atime = CURRENT_TIME;
         gfs2_trans_add_bh(ip->i_gl, dibh, 1);
         gfs2_dinode_out(ip, dibh->b_data);
@@ -817,7 +815,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
                 goto out_alloc;
  
         if (nalloc)
-               blocks += al->al_rgd->rd_length + nalloc * ind_blocks + RES_STATFS;
+               blocks += gfs2_rg_blocks(al) + nalloc * ind_blocks + RES_STATFS;
  
         error = gfs2_trans_begin(sdp, blocks, 0);
         if (error)
@@ -1190,18 +1188,17 @@ static void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void *
  int gfs2_quota_init(struct gfs2_sbd *sdp)
  {
         struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
-       unsigned int blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift;
+       u64 size = i_size_read(sdp->sd_qc_inode);
+       unsigned int blocks = size >> sdp->sd_sb.sb_bsize_shift;
         unsigned int x, slot = 0;
         unsigned int found = 0;
         u64 dblock;
         u32 extlen = 0;
         int error;
  
-       if (!ip->i_disksize || ip->i_disksize > (64 << 20) ||
-           ip->i_disksize & (sdp->sd_sb.sb_bsize - 1)) {
-               gfs2_consist_inode(ip);
+       if (gfs2_check_internal_file_size(sdp->sd_qc_inode, 1, 64 << 20))
                 return -EIO;
-       }
+
         sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block;
         sdp->sd_quota_chunks = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * PAGE_SIZE);
  
@@ -1589,6 +1586,7 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
                 error = gfs2_inplace_reserve(ip);
                 if (error)
                         goto out_alloc;
+               blocks += gfs2_rg_blocks(al);
         }
  
         error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 1, 0);
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c

index f7f89a9..f2a02ed 100644 (file)
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -455,11 +455,13 @@ void gfs2_recover_func(struct work_struct *work)
         int ro = 0;
         unsigned int pass;
         int error;
+       int jlocked = 0;
  
-       if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) {
+       if (sdp->sd_args.ar_spectator ||
+           (jd->jd_jid != sdp->sd_lockstruct.ls_jid)) {
                 fs_info(sdp, "jid=%u: Trying to acquire journal lock...\n",
                         jd->jd_jid);
-
+               jlocked = 1;
                 /* Acquire the journal lock so we can do recovery */
  
                 error = gfs2_glock_nq_num(sdp, jd->jd_jid, &gfs2_journal_glops,
@@ -554,13 +556,12 @@ void gfs2_recover_func(struct work_struct *work)
                         jd->jd_jid, t);
         }
  
-       if (jd->jd_jid != sdp->sd_lockstruct.ls_jid)
-               gfs2_glock_dq_uninit(&ji_gh);
-
         gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS);
  
-       if (jd->jd_jid != sdp->sd_lockstruct.ls_jid)
+       if (jlocked) {
+               gfs2_glock_dq_uninit(&ji_gh);
                 gfs2_glock_dq_uninit(&j_gh);
+       }
  
         fs_info(sdp, "jid=%u: Done\n", jd->jd_jid);
         goto done;
@@ -568,7 +569,7 @@ void gfs2_recover_func(struct work_struct *work)
  fail_gunlock_tr:
         gfs2_glock_dq_uninit(&t_gh);
  fail_gunlock_ji:
-       if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) {
+       if (jlocked) {
                 gfs2_glock_dq_uninit(&ji_gh);
  fail_gunlock_j:
                 gfs2_glock_dq_uninit(&j_gh);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c

index 171a744..fb67f59 100644 (file)
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -500,7 +500,7 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp)
         for (rgrps = 0;; rgrps++) {
                 loff_t pos = rgrps * sizeof(struct gfs2_rindex);
  
-               if (pos + sizeof(struct gfs2_rindex) >= ip->i_disksize)
+               if (pos + sizeof(struct gfs2_rindex) >= i_size_read(inode))
                         break;
                 error = gfs2_internal_read(ip, &ra_state, buf, &pos,
                                            sizeof(struct gfs2_rindex));
@@ -588,7 +588,9 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
         struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
         struct inode *inode = &ip->i_inode;
         struct file_ra_state ra_state;
-       u64 rgrp_count = ip->i_disksize;
+       u64 rgrp_count = i_size_read(inode);
+       struct gfs2_rgrpd *rgd;
+       unsigned int max_data = 0;
         int error;
  
         do_div(rgrp_count, sizeof(struct gfs2_rindex));
@@ -603,6 +605,10 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
                 }
         }
  
+       list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list)
+               if (rgd->rd_data > max_data)
+                       max_data = rgd->rd_data;
+       sdp->sd_max_rg_data = max_data;
         sdp->sd_rindex_uptodate = 1;
         return 0;
  }
@@ -622,13 +628,15 @@ static int gfs2_ri_update_special(struct gfs2_inode *ip)
         struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
         struct inode *inode = &ip->i_inode;
         struct file_ra_state ra_state;
+       struct gfs2_rgrpd *rgd;
+       unsigned int max_data = 0;
         int error;
  
         file_ra_state_init(&ra_state, inode->i_mapping);
         for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) {
                 /* Ignore partials */
                 if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) >
-                   ip->i_disksize)
+                   i_size_read(inode))
                         break;
                 error = read_rindex_entry(ip, &ra_state);
                 if (error) {
@@ -636,6 +644,10 @@ static int gfs2_ri_update_special(struct gfs2_inode *ip)
                         return error;
                 }
         }
+       list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list)
+               if (rgd->rd_data > max_data)
+                       max_data = rgd->rd_data;
+       sdp->sd_max_rg_data = max_data;
  
         sdp->sd_rindex_uptodate = 1;
         return 0;
@@ -1188,7 +1200,8 @@ out:
   * Returns: errno
   */
  
-int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
+int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
+                          char *file, unsigned int line)
  {
         struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
         struct gfs2_alloc *al = ip->i_alloc;
@@ -1199,12 +1212,15 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
                 return -EINVAL;
  
  try_again:
-       /* We need to hold the rindex unless the inode we're using is
-          the rindex itself, in which case it's already held. */
-       if (ip != GFS2_I(sdp->sd_rindex))
-               error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
-       else if (!sdp->sd_rgrps) /* We may not have the rindex read in, so: */
-               error = gfs2_ri_update_special(ip);
+       if (hold_rindex) {
+               /* We need to hold the rindex unless the inode we're using is
+                  the rindex itself, in which case it's already held. */
+               if (ip != GFS2_I(sdp->sd_rindex))
+                       error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
+               else if (!sdp->sd_rgrps) /* We may not have the rindex read
+                                           in, so: */
+                       error = gfs2_ri_update_special(ip);
+       }
  
         if (error)
                 return error;
@@ -1215,7 +1231,7 @@ try_again:
            try to free it, and try the allocation again. */
         error = get_local_rgrp(ip, &unlinked, &last_unlinked);
         if (error) {
-               if (ip != GFS2_I(sdp->sd_rindex))
+               if (hold_rindex && ip != GFS2_I(sdp->sd_rindex))
                         gfs2_glock_dq_uninit(&al->al_ri_gh);
                 if (error != -EAGAIN)
                         return error;
@@ -1257,7 +1273,7 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
         al->al_rgd = NULL;
         if (al->al_rgd_gh.gh_gl)
                 gfs2_glock_dq_uninit(&al->al_rgd_gh);
-       if (ip != GFS2_I(sdp->sd_rindex))
+       if (ip != GFS2_I(sdp->sd_rindex) && al->al_ri_gh.gh_gl)
                 gfs2_glock_dq_uninit(&al->al_ri_gh);
  }
  
@@ -1496,11 +1512,19 @@ int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n)
         struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
         struct buffer_head *dibh;
         struct gfs2_alloc *al = ip->i_alloc;
-       struct gfs2_rgrpd *rgd = al->al_rgd;
+       struct gfs2_rgrpd *rgd;
         u32 goal, blk;
         u64 block;
         int error;
  
+       /* Only happens if there is a bug in gfs2, return something distinctive
+        * to ensure that it is noticed.
+        */
+       if (al == NULL)
+               return -ECANCELED;
+
+       rgd = al->al_rgd;
+
         if (rgrp_contains_block(rgd, ip->i_goal))
                 goal = ip->i_goal - rgd->rd_data0;
         else
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h

index f07119d..0e35c04 100644 (file)
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -39,10 +39,12 @@ static inline void gfs2_alloc_put(struct gfs2_inode *ip)
         ip->i_alloc = NULL;
  }
  
-extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file,
-                                 unsigned int line);
+extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
+                                 char *file, unsigned int line);
  #define gfs2_inplace_reserve(ip) \
-gfs2_inplace_reserve_i((ip), __FILE__, __LINE__)
+       gfs2_inplace_reserve_i((ip), 1, __FILE__, __LINE__)
+#define gfs2_inplace_reserve_ri(ip) \
+       gfs2_inplace_reserve_i((ip), 0, __FILE__, __LINE__)
  
  extern void gfs2_inplace_release(struct gfs2_inode *ip);
  
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c

index 77cb9f8..047d117 100644 (file)
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -85,6 +85,7 @@ static const match_table_t tokens = {
         {Opt_locktable, "locktable=%s"},
         {Opt_hostdata, "hostdata=%s"},
         {Opt_spectator, "spectator"},
+       {Opt_spectator, "norecovery"},
         {Opt_ignore_local_fs, "ignore_local_fs"},
         {Opt_localflocks, "localflocks"},
         {Opt_localcaching, "localcaching"},
@@ -159,13 +160,13 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
                         args->ar_spectator = 1;
                         break;
                 case Opt_ignore_local_fs:
-                       args->ar_ignore_local_fs = 1;
+                       /* Retained for backwards compat only */
                         break;
                 case Opt_localflocks:
                         args->ar_localflocks = 1;
                         break;
                 case Opt_localcaching:
-                       args->ar_localcaching = 1;
+                       /* Retained for backwards compat only */
                         break;
                 case Opt_debug:
                         if (args->ar_errors == GFS2_ERRORS_PANIC) {
@@ -179,7 +180,7 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
                         args->ar_debug = 0;
                         break;
                 case Opt_upgrade:
-                       args->ar_upgrade = 1;
+                       /* Retained for backwards compat only */
                         break;
                 case Opt_acl:
                         args->ar_posix_acl = 1;
@@ -342,15 +343,14 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd)
  {
         struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
         struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+       u64 size = i_size_read(jd->jd_inode);
  
-       if (ip->i_disksize < (8 << 20) || ip->i_disksize > (1 << 30) ||
-           (ip->i_disksize & (sdp->sd_sb.sb_bsize - 1))) {
-               gfs2_consist_inode(ip);
+       if (gfs2_check_internal_file_size(jd->jd_inode, 8 << 20, 1 << 30))
                 return -EIO;
-       }
-       jd->jd_blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift;
  
-       if (gfs2_write_alloc_required(ip, 0, ip->i_disksize)) {
+       jd->jd_blocks = size >> sdp->sd_sb.sb_bsize_shift;
+
+       if (gfs2_write_alloc_required(ip, 0, size)) {
                 gfs2_consist_inode(ip);
                 return -EIO;
         }
@@ -1129,9 +1129,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
  
         /* Some flags must not be changed */
         if (args_neq(&args, &sdp->sd_args, spectator) ||
-           args_neq(&args, &sdp->sd_args, ignore_local_fs) ||
             args_neq(&args, &sdp->sd_args, localflocks) ||
-           args_neq(&args, &sdp->sd_args, localcaching) ||
             args_neq(&args, &sdp->sd_args, meta))
                 return -EINVAL;
  
@@ -1234,16 +1232,10 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
                 seq_printf(s, ",hostdata=%s", args->ar_hostdata);
         if (args->ar_spectator)
                 seq_printf(s, ",spectator");
-       if (args->ar_ignore_local_fs)
-               seq_printf(s, ",ignore_local_fs");
         if (args->ar_localflocks)
                 seq_printf(s, ",localflocks");
-       if (args->ar_localcaching)
-               seq_printf(s, ",localcaching");
         if (args->ar_debug)
                 seq_printf(s, ",debug");
-       if (args->ar_upgrade)
-               seq_printf(s, ",upgrade");
         if (args->ar_posix_acl)
                 seq_printf(s, ",acl");
         if (args->ar_quota != GFS2_QUOTA_DEFAULT) {
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c

index ccacffd..748ccb5 100644 (file)
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -230,7 +230,10 @@ static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len
  
         if (gltype > LM_TYPE_JOURNAL)
                 return -EINVAL;
-       glops = gfs2_glops_list[gltype];
+       if (gltype == LM_TYPE_NONDISK && glnum == GFS2_TRANS_LOCK)
+               glops = &gfs2_trans_glops;
+       else
+               glops = gfs2_glops_list[gltype];
         if (glops == NULL)
                 return -EINVAL;
         if (!test_and_set_bit(SDF_DEMOTE, &sdp->sd_flags))
@@ -399,31 +402,32 @@ static ssize_t recover_status_show(struct gfs2_sbd *sdp, char *buf)
  
  static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf)
  {
-       return sprintf(buf, "%u\n", sdp->sd_lockstruct.ls_jid);
+       return sprintf(buf, "%d\n", sdp->sd_lockstruct.ls_jid);
  }
  
  static ssize_t jid_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
  {
-        unsigned jid;
+        int jid;
         int rv;
  
-       rv = sscanf(buf, "%u", &jid);
+       rv = sscanf(buf, "%d", &jid);
         if (rv != 1)
                 return -EINVAL;
  
         spin_lock(&sdp->sd_jindex_spin);
         rv = -EINVAL;
-       if (sdp->sd_args.ar_spectator)
-               goto out;
         if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
                 goto out;
         rv = -EBUSY;
-       if (test_and_clear_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0)
+       if (test_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0)
                 goto out;
+       rv = 0;
+       if (sdp->sd_args.ar_spectator && jid > 0)
+               rv = jid = -EINVAL;
         sdp->sd_lockstruct.ls_jid = jid;
+       clear_bit(SDF_NOJOURNALID, &sdp->sd_flags);
         smp_mb__after_clear_bit();
         wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID);
-       rv = 0;
  out:
         spin_unlock(&sdp->sd_jindex_spin);
         return rv ? rv : len;
@@ -617,7 +621,7 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
         add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name);
         add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name);
         if (!test_bit(SDF_NOJOURNALID, &sdp->sd_flags))
-               add_uevent_var(env, "JOURNALID=%u", sdp->sd_lockstruct.ls_jid);
+               add_uevent_var(env, "JOURNALID=%d", sdp->sd_lockstruct.ls_jid);
         if (gfs2_uuid_valid(uuid))
                 add_uevent_var(env, "UUID=%pUB", uuid);
         return 0;
diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h

index 148d55c..cedb0bb 100644 (file)
--- a/fs/gfs2/trace_gfs2.h
+++ b/fs/gfs2/trace_gfs2.h
@@ -39,7 +39,8 @@
         {(1UL << GLF_INVALIDATE_IN_PROGRESS),   "i" },          \
         {(1UL << GLF_REPLY_PENDING),            "r" },          \
         {(1UL << GLF_INITIAL),                  "I" },          \
-       {(1UL << GLF_FROZEN),                   "F" })
+       {(1UL << GLF_FROZEN),                   "F" },          \
+       {(1UL << GLF_QUEUED),                   "q" })
  
  #ifndef NUMPTY
  #define NUMPTY
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h

index edf9d4b..fb56b78 100644 (file)
--- a/fs/gfs2/trans.h
+++ b/fs/gfs2/trans.h
@@ -20,11 +20,20 @@ struct gfs2_glock;
  #define RES_JDATA      1
  #define RES_DATA       1
  #define RES_LEAF       1
+#define RES_RG_HDR     1
  #define RES_RG_BIT     2
  #define RES_EATTR      1
  #define RES_STATFS     1
  #define RES_QUOTA      2
  
+/* reserve either the number of blocks to be allocated plus the rg header
+ * block, or all of the blocks in the rg, whichever is smaller */
+static inline unsigned int gfs2_rg_blocks(const struct gfs2_alloc *al)
+{
+       return (al->al_requested < al->al_rgd->rd_length)?
+              al->al_requested + 1 : al->al_rgd->rd_length;
+}
+
  int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
                      unsigned int revokes);
  
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c

index 776af6e..30b58f0 100644 (file)
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -734,7 +734,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
                 goto out_gunlock_q;
  
         error = gfs2_trans_begin(GFS2_SB(&ip->i_inode),
-                                blks + al->al_rgd->rd_length +
+                                blks + gfs2_rg_blocks(al) +
                                  RES_DINODE + RES_STATFS + RES_QUOTA, 0);
         if (error)
                 goto out_ipres;
diff --git a/fs/hfs/bfind.c b/fs/hfs/bfind.c

index 4129cdb..571abe9 100644 (file)
--- a/fs/hfs/bfind.c
+++ b/fs/hfs/bfind.c
@@ -23,7 +23,7 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd)
         fd->search_key = ptr;
         fd->key = ptr + tree->max_key_len + 2;
         dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0));
-       down(&tree->tree_lock);
+       mutex_lock(&tree->tree_lock);
         return 0;
  }
  
@@ -32,7 +32,7 @@ void hfs_find_exit(struct hfs_find_data *fd)
         hfs_bnode_put(fd->bnode);
         kfree(fd->search_key);
         dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0));
-       up(&fd->tree->tree_lock);
+       mutex_unlock(&fd->tree->tree_lock);
         fd->tree = NULL;
  }
  
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c

index 38a0a99..3ebc437 100644 (file)
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -27,7 +27,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
         if (!tree)
                 return NULL;
  
-       init_MUTEX(&tree->tree_lock);
+       mutex_init(&tree->tree_lock);
         spin_lock_init(&tree->hash_lock);
         /* Set the correct compare function */
         tree->sb = sb;
diff --git a/fs/hfs/btree.h b/fs/hfs/btree.h

index cc51905..2a1d712 100644 (file)
--- a/fs/hfs/btree.h
+++ b/fs/hfs/btree.h
@@ -33,7 +33,7 @@ struct hfs_btree {
         unsigned int depth;
  
         //unsigned int map1_size, map_size;
-       struct semaphore tree_lock;
+       struct mutex tree_lock;
  
         unsigned int pages_per_bnode;
         spinlock_t hash_lock;
diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c

index 5007a41..d182438 100644 (file)
--- a/fs/hfsplus/bfind.c
+++ b/fs/hfsplus/bfind.c
@@ -23,7 +23,7 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd)
         fd->search_key = ptr;
         fd->key = ptr + tree->max_key_len + 2;
         dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0));
-       down(&tree->tree_lock);
+       mutex_lock(&tree->tree_lock);
         return 0;
  }
  
@@ -32,7 +32,7 @@ void hfs_find_exit(struct hfs_find_data *fd)
         hfs_bnode_put(fd->bnode);
         kfree(fd->search_key);
         dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0));
-       up(&fd->tree->tree_lock);
+       mutex_unlock(&fd->tree->tree_lock);
         fd->tree = NULL;
  }
  
@@ -52,6 +52,10 @@ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd)
                 rec = (e + b) / 2;
                 len = hfs_brec_lenoff(bnode, rec, &off);
                 keylen = hfs_brec_keylen(bnode, rec);
+               if (keylen == 0) {
+                       res = -EINVAL;
+                       goto fail;
+               }
                 hfs_bnode_read(bnode, fd->key, off, keylen);
                 cmpval = bnode->tree->keycmp(fd->key, fd->search_key);
                 if (!cmpval) {
@@ -67,6 +71,10 @@ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd)
         if (rec != e && e >= 0) {
                 len = hfs_brec_lenoff(bnode, e, &off);
                 keylen = hfs_brec_keylen(bnode, e);
+               if (keylen == 0) {
+                       res = -EINVAL;
+                       goto fail;
+               }
                 hfs_bnode_read(bnode, fd->key, off, keylen);
         }
  done:
@@ -75,6 +83,7 @@ done:
         fd->keylength = keylen;
         fd->entryoffset = off + keylen;
         fd->entrylength = len - keylen;
+fail:
         return res;
  }
  
@@ -198,6 +207,10 @@ int hfs_brec_goto(struct hfs_find_data *fd, int cnt)
  
         len = hfs_brec_lenoff(bnode, fd->record, &off);
         keylen = hfs_brec_keylen(bnode, fd->record);
+       if (keylen == 0) {
+               res = -EINVAL;
+               goto out;
+       }
         fd->keyoffset = off;
         fd->keylength = keylen;
         fd->entryoffset = off + keylen;
diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c

index ea30afc..ad57f59 100644 (file)
--- a/fs/hfsplus/bitmap.c
+++ b/fs/hfsplus/bitmap.c
@@ -17,6 +17,7 @@
  
  int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *max)
  {
+       struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
         struct page *page;
         struct address_space *mapping;
         __be32 *pptr, *curr, *end;
@@ -29,8 +30,8 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *ma
                 return size;
  
         dprint(DBG_BITMAP, "block_allocate: %u,%u,%u\n", size, offset, len);
-       mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
-       mapping = HFSPLUS_SB(sb).alloc_file->i_mapping;
+       mutex_lock(&sbi->alloc_mutex);
+       mapping = sbi->alloc_file->i_mapping;
         page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL);
         if (IS_ERR(page)) {
                 start = size;
@@ -150,16 +151,17 @@ done:
         set_page_dirty(page);
         kunmap(page);
         *max = offset + (curr - pptr) * 32 + i - start;
-       HFSPLUS_SB(sb).free_blocks -= *max;
+       sbi->free_blocks -= *max;
         sb->s_dirt = 1;
         dprint(DBG_BITMAP, "-> %u,%u\n", start, *max);
  out:
-       mutex_unlock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
+       mutex_unlock(&sbi->alloc_mutex);
         return start;
  }
  
  int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count)
  {
+       struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
         struct page *page;
         struct address_space *mapping;
         __be32 *pptr, *curr, *end;
@@ -172,11 +174,11 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count)
  
         dprint(DBG_BITMAP, "block_free: %u,%u\n", offset, count);
         /* are all of the bits in range? */
-       if ((offset + count) > HFSPLUS_SB(sb).total_blocks)
+       if ((offset + count) > sbi->total_blocks)
                 return -2;
  
-       mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
-       mapping = HFSPLUS_SB(sb).alloc_file->i_mapping;
+       mutex_lock(&sbi->alloc_mutex);
+       mapping = sbi->alloc_file->i_mapping;
         pnr = offset / PAGE_CACHE_BITS;
         page = read_mapping_page(mapping, pnr, NULL);
         pptr = kmap(page);
@@ -224,9 +226,9 @@ done:
  out:
         set_page_dirty(page);
         kunmap(page);
-       HFSPLUS_SB(sb).free_blocks += len;
+       sbi->free_blocks += len;
         sb->s_dirt = 1;
-       mutex_unlock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
+       mutex_unlock(&sbi->alloc_mutex);
  
         return 0;
  }
diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c

index c88e5d7..2f39d05 100644 (file)
--- a/fs/hfsplus/brec.c
+++ b/fs/hfsplus/brec.c
@@ -42,10 +42,13 @@ u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec)
                 recoff = hfs_bnode_read_u16(node, node->tree->node_size - (rec + 1) * 2);
                 if (!recoff)
                         return 0;
-               if (node->tree->attributes & HFS_TREE_BIGKEYS)
-                       retval = hfs_bnode_read_u16(node, recoff) + 2;
-               else
-                       retval = (hfs_bnode_read_u8(node, recoff) | 1) + 1;
+
+               retval = hfs_bnode_read_u16(node, recoff) + 2;
+               if (retval > node->tree->max_key_len + 2) {
+                       printk(KERN_ERR "hfs: keylen %d too large\n",
+                               retval);
+                       retval = 0;
+               }
         }
         return retval;
  }
@@ -216,7 +219,7 @@ skip:
  static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
  {
         struct hfs_btree *tree;
-       struct hfs_bnode *node, *new_node;
+       struct hfs_bnode *node, *new_node, *next_node;
         struct hfs_bnode_desc node_desc;
         int num_recs, new_rec_off, new_off, old_rec_off;
         int data_start, data_end, size;
@@ -235,6 +238,17 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
         new_node->type = node->type;
         new_node->height = node->height;
  
+       if (node->next)
+               next_node = hfs_bnode_find(tree, node->next);
+       else
+               next_node = NULL;
+
+       if (IS_ERR(next_node)) {
+               hfs_bnode_put(node);
+               hfs_bnode_put(new_node);
+               return next_node;
+       }
+
         size = tree->node_size / 2 - node->num_recs * 2 - 14;
         old_rec_off = tree->node_size - 4;
         num_recs = 1;
@@ -248,6 +262,8 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
                 /* panic? */
                 hfs_bnode_put(node);
                 hfs_bnode_put(new_node);
+               if (next_node)
+                       hfs_bnode_put(next_node);
                 return ERR_PTR(-ENOSPC);
         }
  
@@ -302,8 +318,7 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
         hfs_bnode_write(node, &node_desc, 0, sizeof(node_desc));
  
         /* update next bnode header */
-       if (new_node->next) {
-               struct hfs_bnode *next_node = hfs_bnode_find(tree, new_node->next);
+       if (next_node) {
                 next_node->prev = new_node->this;
                 hfs_bnode_read(next_node, &node_desc, 0, sizeof(node_desc));
                 node_desc.prev = cpu_to_be32(next_node->prev);
diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c

index e49fcee..22e4d4e 100644 (file)
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -30,7 +30,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
         if (!tree)
                 return NULL;
  
-       init_MUTEX(&tree->tree_lock);
+       mutex_init(&tree->tree_lock);
         spin_lock_init(&tree->hash_lock);
         tree->sb = sb;
         tree->cnid = id;
@@ -39,10 +39,16 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
                 goto free_tree;
         tree->inode = inode;
  
+       if (!HFSPLUS_I(tree->inode)->first_blocks) {
+               printk(KERN_ERR
+                      "hfs: invalid btree extent records (0 size).\n");
+               goto free_inode;
+       }
+
         mapping = tree->inode->i_mapping;
         page = read_mapping_page(mapping, 0, NULL);
         if (IS_ERR(page))
-               goto free_tree;
+               goto free_inode;
  
         /* Load the header */
         head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc));
@@ -57,27 +63,56 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
         tree->max_key_len = be16_to_cpu(head->max_key_len);
         tree->depth = be16_to_cpu(head->depth);
  
-       /* Set the correct compare function */
-       if (id == HFSPLUS_EXT_CNID) {
+       /* Verify the tree and set the correct compare function */
+       switch (id) {
+       case HFSPLUS_EXT_CNID:
+               if (tree->max_key_len != HFSPLUS_EXT_KEYLEN - sizeof(u16)) {
+                       printk(KERN_ERR "hfs: invalid extent max_key_len %d\n",
+                               tree->max_key_len);
+                       goto fail_page;
+               }
+               if (tree->attributes & HFS_TREE_VARIDXKEYS) {
+                       printk(KERN_ERR "hfs: invalid extent btree flag\n");
+                       goto fail_page;
+               }
+
                 tree->keycmp = hfsplus_ext_cmp_key;
-       } else if (id == HFSPLUS_CAT_CNID) {
-               if ((HFSPLUS_SB(sb).flags & HFSPLUS_SB_HFSX) &&
+               break;
+       case HFSPLUS_CAT_CNID:
+               if (tree->max_key_len != HFSPLUS_CAT_KEYLEN - sizeof(u16)) {
+                       printk(KERN_ERR "hfs: invalid catalog max_key_len %d\n",
+                               tree->max_key_len);
+                       goto fail_page;
+               }
+               if (!(tree->attributes & HFS_TREE_VARIDXKEYS)) {
+                       printk(KERN_ERR "hfs: invalid catalog btree flag\n");
+                       goto fail_page;
+               }
+
+               if (test_bit(HFSPLUS_SB_HFSX, &HFSPLUS_SB(sb)->flags) &&
                     (head->key_type == HFSPLUS_KEY_BINARY))
                         tree->keycmp = hfsplus_cat_bin_cmp_key;
                 else {
                         tree->keycmp = hfsplus_cat_case_cmp_key;
-                       HFSPLUS_SB(sb).flags |= HFSPLUS_SB_CASEFOLD;
+                       set_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
                 }
-       } else {
+               break;
+       default:
                 printk(KERN_ERR "hfs: unknown B*Tree requested\n");
                 goto fail_page;
         }
  
+       if (!(tree->attributes & HFS_TREE_BIGKEYS)) {
+               printk(KERN_ERR "hfs: invalid btree flag\n");
+               goto fail_page;
+       }
+
         size = tree->node_size;
         if (!is_power_of_2(size))
                 goto fail_page;
         if (!tree->node_count)
                 goto fail_page;
+
         tree->node_size_shift = ffs(size) - 1;
  
         tree->pages_per_bnode = (tree->node_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
@@ -87,10 +122,11 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
         return tree;
  
   fail_page:
-       tree->inode->i_mapping->a_ops = &hfsplus_aops;
         page_cache_release(page);
- free_tree:
+ free_inode:
+       tree->inode->i_mapping->a_ops = &hfsplus_aops;
         iput(tree->inode);
+ free_tree:
         kfree(tree);
         return NULL;
  }
@@ -192,17 +228,18 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
  
         while (!tree->free_nodes) {
                 struct inode *inode = tree->inode;
+               struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
                 u32 count;
                 int res;
  
                 res = hfsplus_file_extend(inode);
                 if (res)
                         return ERR_PTR(res);
-               HFSPLUS_I(inode).phys_size = inode->i_size =
-                               (loff_t)HFSPLUS_I(inode).alloc_blocks <<
-                               HFSPLUS_SB(tree->sb).alloc_blksz_shift;
-               HFSPLUS_I(inode).fs_blocks = HFSPLUS_I(inode).alloc_blocks <<
-                                            HFSPLUS_SB(tree->sb).fs_shift;
+               hip->phys_size = inode->i_size =
+                       (loff_t)hip->alloc_blocks <<
+                               HFSPLUS_SB(tree->sb)->alloc_blksz_shift;
+               hip->fs_blocks =
+                       hip->alloc_blocks << HFSPLUS_SB(tree->sb)->fs_shift;
                 inode_set_bytes(inode, inode->i_size);
                 count = inode->i_size >> tree->node_size_shift;
                 tree->free_nodes = count - tree->node_count;
diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c

index f6874ac..8af45fc 100644 (file)
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -67,7 +67,7 @@ static void hfsplus_cat_build_key_uni(hfsplus_btree_key *key, u32 parent,
         key->key_len = cpu_to_be16(6 + ustrlen);
  }
  
-static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms)
+void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms)
  {
         if (inode->i_flags & S_IMMUTABLE)
                 perms->rootflags |= HFSPLUS_FLG_IMMUTABLE;
@@ -77,15 +77,24 @@ static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms)
                 perms->rootflags |= HFSPLUS_FLG_APPEND;
         else
                 perms->rootflags &= ~HFSPLUS_FLG_APPEND;
-       HFSPLUS_I(inode).rootflags = perms->rootflags;
-       HFSPLUS_I(inode).userflags = perms->userflags;
+
+       perms->userflags = HFSPLUS_I(inode)->userflags;
         perms->mode = cpu_to_be16(inode->i_mode);
         perms->owner = cpu_to_be32(inode->i_uid);
         perms->group = cpu_to_be32(inode->i_gid);
+
+       if (S_ISREG(inode->i_mode))
+               perms->dev = cpu_to_be32(inode->i_nlink);
+       else if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode))
+               perms->dev = cpu_to_be32(inode->i_rdev);
+       else
+               perms->dev = 0;
  }
  
  static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct inode *inode)
  {
+       struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
+
         if (S_ISDIR(inode->i_mode)) {
                 struct hfsplus_cat_folder *folder;
  
@@ -93,13 +102,13 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i
                 memset(folder, 0, sizeof(*folder));
                 folder->type = cpu_to_be16(HFSPLUS_FOLDER);
                 folder->id = cpu_to_be32(inode->i_ino);
-               HFSPLUS_I(inode).create_date =
+               HFSPLUS_I(inode)->create_date =
                         folder->create_date =
                         folder->content_mod_date =
                         folder->attribute_mod_date =
                         folder->access_date = hfsp_now2mt();
-               hfsplus_set_perms(inode, &folder->permissions);
-               if (inode == HFSPLUS_SB(inode->i_sb).hidden_dir)
+               hfsplus_cat_set_perms(inode, &folder->permissions);
+               if (inode == sbi->hidden_dir)
                         /* invisible and namelocked */
                         folder->user_info.frFlags = cpu_to_be16(0x5000);
                 return sizeof(*folder);
@@ -111,19 +120,19 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i
                 file->type = cpu_to_be16(HFSPLUS_FILE);
                 file->flags = cpu_to_be16(HFSPLUS_FILE_THREAD_EXISTS);
                 file->id = cpu_to_be32(cnid);
-               HFSPLUS_I(inode).create_date =
+               HFSPLUS_I(inode)->create_date =
                         file->create_date =
                         file->content_mod_date =
                         file->attribute_mod_date =
                         file->access_date = hfsp_now2mt();
                 if (cnid == inode->i_ino) {
-                       hfsplus_set_perms(inode, &file->permissions);
+                       hfsplus_cat_set_perms(inode, &file->permissions);
                         if (S_ISLNK(inode->i_mode)) {
                                 file->user_info.fdType = cpu_to_be32(HFSP_SYMLINK_TYPE);
                                 file->user_info.fdCreator = cpu_to_be32(HFSP_SYMLINK_CREATOR);
                         } else {
-                               file->user_info.fdType = cpu_to_be32(HFSPLUS_SB(inode->i_sb).type);
-                               file->user_info.fdCreator = cpu_to_be32(HFSPLUS_SB(inode->i_sb).creator);
+                               file->user_info.fdType = cpu_to_be32(sbi->type);
+                               file->user_info.fdCreator = cpu_to_be32(sbi->creator);
                         }
                         if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE)
                                 file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED);
@@ -131,8 +140,8 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i
                         file->user_info.fdType = cpu_to_be32(HFSP_HARDLINK_TYPE);
                         file->user_info.fdCreator = cpu_to_be32(HFSP_HFSPLUS_CREATOR);
                         file->user_info.fdFlags = cpu_to_be16(0x100);
-                       file->create_date = HFSPLUS_I(HFSPLUS_SB(inode->i_sb).hidden_dir).create_date;
-                       file->permissions.dev = cpu_to_be32(HFSPLUS_I(inode).dev);
+                       file->create_date = HFSPLUS_I(sbi->hidden_dir)->create_date;
+                       file->permissions.dev = cpu_to_be32(HFSPLUS_I(inode)->linkid);
                 }
                 return sizeof(*file);
         }
@@ -180,15 +189,14 @@ int hfsplus_find_cat(struct super_block *sb, u32 cnid,
  
  int hfsplus_create_cat(u32 cnid, struct inode *dir, struct qstr *str, struct inode *inode)
  {
+       struct super_block *sb = dir->i_sb;
         struct hfs_find_data fd;
-       struct super_block *sb;
         hfsplus_cat_entry entry;
         int entry_size;
         int err;
  
         dprint(DBG_CAT_MOD, "create_cat: %s,%u(%d)\n", str->name, cnid, inode->i_nlink);
-       sb = dir->i_sb;
-       hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
+       hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
  
         hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL);
         entry_size = hfsplus_fill_cat_thread(sb, &entry, S_ISDIR(inode->i_mode) ?
@@ -234,7 +242,7 @@ err2:
  
  int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
  {
-       struct super_block *sb;
+       struct super_block *sb = dir->i_sb;
         struct hfs_find_data fd;
         struct hfsplus_fork_raw fork;
         struct list_head *pos;
@@ -242,8 +250,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
         u16 type;
  
         dprint(DBG_CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid);
-       sb = dir->i_sb;
-       hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
+       hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
  
         if (!str) {
                 int len;
@@ -279,7 +286,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
                 hfsplus_free_fork(sb, cnid, &fork, HFSPLUS_TYPE_RSRC);
         }
  
-       list_for_each(pos, &HFSPLUS_I(dir).open_dir_list) {
+       list_for_each(pos, &HFSPLUS_I(dir)->open_dir_list) {
                 struct hfsplus_readdir_data *rd =
                         list_entry(pos, struct hfsplus_readdir_data, list);
                 if (fd.tree->keycmp(fd.search_key, (void *)&rd->key) < 0)
@@ -312,7 +319,7 @@ int hfsplus_rename_cat(u32 cnid,
                        struct inode *src_dir, struct qstr *src_name,
                        struct inode *dst_dir, struct qstr *dst_name)
  {
-       struct super_block *sb;
+       struct super_block *sb = src_dir->i_sb;
         struct hfs_find_data src_fd, dst_fd;
         hfsplus_cat_entry entry;
         int entry_size, type;
@@ -320,8 +327,7 @@ int hfsplus_rename_cat(u32 cnid,
  
         dprint(DBG_CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", cnid, src_dir->i_ino, src_name->name,
                 dst_dir->i_ino, dst_name->name);
-       sb = src_dir->i_sb;
-       hfs_find_init(HFSPLUS_SB(sb).cat_tree, &src_fd);
+       hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &src_fd);
         dst_fd = src_fd;
  
         /* find the old dir entry and read the data */
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c

index 764fd1b..d236d85 100644 (file)
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -39,7 +39,7 @@ static struct dentry *hfsplus_lookup(struct inode *dir, struct dentry *dentry,
  
         dentry->d_op = &hfsplus_dentry_operations;
         dentry->d_fsdata = NULL;
-       hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
+       hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
         hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name);
  again:
         err = hfs_brec_read(&fd, &entry, sizeof(entry));
@@ -68,9 +68,9 @@ again:
                 cnid = be32_to_cpu(entry.file.id);
                 if (entry.file.user_info.fdType == cpu_to_be32(HFSP_HARDLINK_TYPE) &&
                     entry.file.user_info.fdCreator == cpu_to_be32(HFSP_HFSPLUS_CREATOR) &&
-                   (entry.file.create_date == HFSPLUS_I(HFSPLUS_SB(sb).hidden_dir).create_date ||
-                    entry.file.create_date == HFSPLUS_I(sb->s_root->d_inode).create_date) &&
-                   HFSPLUS_SB(sb).hidden_dir) {
+                   (entry.file.create_date == HFSPLUS_I(HFSPLUS_SB(sb)->hidden_dir)->create_date ||
+                    entry.file.create_date == HFSPLUS_I(sb->s_root->d_inode)->create_date) &&
+                   HFSPLUS_SB(sb)->hidden_dir) {
                         struct qstr str;
                         char name[32];
  
@@ -86,7 +86,8 @@ again:
                                 linkid = be32_to_cpu(entry.file.permissions.dev);
                                 str.len = sprintf(name, "iNode%d", linkid);
                                 str.name = name;
-                               hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_SB(sb).hidden_dir->i_ino, &str);
+                               hfsplus_cat_build_key(sb, fd.search_key,
+                                       HFSPLUS_SB(sb)->hidden_dir->i_ino, &str);
                                 goto again;
                         }
                 } else if (!dentry->d_fsdata)
@@ -101,7 +102,7 @@ again:
         if (IS_ERR(inode))
                 return ERR_CAST(inode);
         if (S_ISREG(inode->i_mode))
-               HFSPLUS_I(inode).dev = linkid;
+               HFSPLUS_I(inode)->linkid = linkid;
  out:
         d_add(dentry, inode);
         return NULL;
@@ -124,7 +125,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
         if (filp->f_pos >= inode->i_size)
                 return 0;
  
-       hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
+       hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
         hfsplus_cat_build_key(sb, fd.search_key, inode->i_ino, NULL);
         err = hfs_brec_find(&fd);
         if (err)
@@ -180,8 +181,9 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
                                 err = -EIO;
                                 goto out;
                         }
-                       if (HFSPLUS_SB(sb).hidden_dir &&
-                           HFSPLUS_SB(sb).hidden_dir->i_ino == be32_to_cpu(entry.folder.id))
+                       if (HFSPLUS_SB(sb)->hidden_dir &&
+                           HFSPLUS_SB(sb)->hidden_dir->i_ino ==
+                                       be32_to_cpu(entry.folder.id))
                                 goto next;
                         if (filldir(dirent, strbuf, len, filp->f_pos,
                                     be32_to_cpu(entry.folder.id), DT_DIR))
@@ -217,7 +219,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
                 }
                 filp->private_data = rd;
                 rd->file = filp;
-               list_add(&rd->list, &HFSPLUS_I(inode).open_dir_list);
+               list_add(&rd->list, &HFSPLUS_I(inode)->open_dir_list);
         }
         memcpy(&rd->key, fd.key, sizeof(struct hfsplus_cat_key));
  out:
@@ -229,38 +231,18 @@ static int hfsplus_dir_release(struct inode *inode, struct file *file)
  {
         struct hfsplus_readdir_data *rd = file->private_data;
         if (rd) {
+               mutex_lock(&inode->i_mutex);
                 list_del(&rd->list);
+               mutex_unlock(&inode->i_mutex);
                 kfree(rd);
         }
         return 0;
  }
  
-static int hfsplus_create(struct inode *dir, struct dentry *dentry, int mode,
-                         struct nameidata *nd)
-{
-       struct inode *inode;
-       int res;
-
-       inode = hfsplus_new_inode(dir->i_sb, mode);
-       if (!inode)
-               return -ENOSPC;
-
-       res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
-       if (res) {
-               inode->i_nlink = 0;
-               hfsplus_delete_inode(inode);
-               iput(inode);
-               return res;
-       }
-       hfsplus_instantiate(dentry, inode, inode->i_ino);
-       mark_inode_dirty(inode);
-       return 0;
-}
-
  static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
                         struct dentry *dst_dentry)
  {
-       struct super_block *sb = dst_dir->i_sb;
+       struct hfsplus_sb_info *sbi = HFSPLUS_SB(dst_dir->i_sb);
         struct inode *inode = src_dentry->d_inode;
         struct inode *src_dir = src_dentry->d_parent->d_inode;
         struct qstr str;
@@ -270,7 +252,10 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
  
         if (HFSPLUS_IS_RSRC(inode))
                 return -EPERM;
+       if (!S_ISREG(inode->i_mode))
+               return -EPERM;
  
+       mutex_lock(&sbi->vh_mutex);
         if (inode->i_ino == (u32)(unsigned long)src_dentry->d_fsdata) {
                 for (;;) {
                         get_random_bytes(&id, sizeof(cnid));
@@ -279,40 +264,41 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
                         str.len = sprintf(name, "iNode%d", id);
                         res = hfsplus_rename_cat(inode->i_ino,
                                                  src_dir, &src_dentry->d_name,
-                                                HFSPLUS_SB(sb).hidden_dir, &str);
+                                                sbi->hidden_dir, &str);
                         if (!res)
                                 break;
                         if (res != -EEXIST)
-                               return res;
+                               goto out;
                 }
-               HFSPLUS_I(inode).dev = id;
-               cnid = HFSPLUS_SB(sb).next_cnid++;
+               HFSPLUS_I(inode)->linkid = id;
+               cnid = sbi->next_cnid++;
                 src_dentry->d_fsdata = (void *)(unsigned long)cnid;
                 res = hfsplus_create_cat(cnid, src_dir, &src_dentry->d_name, inode);
                 if (res)
                         /* panic? */
-                       return res;
-               HFSPLUS_SB(sb).file_count++;
+                       goto out;
+               sbi->file_count++;
         }
-       cnid = HFSPLUS_SB(sb).next_cnid++;
+       cnid = sbi->next_cnid++;
         res = hfsplus_create_cat(cnid, dst_dir, &dst_dentry->d_name, inode);
         if (res)
-               return res;
+               goto out;
  
         inc_nlink(inode);
         hfsplus_instantiate(dst_dentry, inode, cnid);
         atomic_inc(&inode->i_count);
         inode->i_ctime = CURRENT_TIME_SEC;
         mark_inode_dirty(inode);
-       HFSPLUS_SB(sb).file_count++;
-       sb->s_dirt = 1;
-
-       return 0;
+       sbi->file_count++;
+       dst_dir->i_sb->s_dirt = 1;
+out:
+       mutex_unlock(&sbi->vh_mutex);
+       return res;
  }
  
  static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
  {
-       struct super_block *sb = dir->i_sb;
+       struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
         struct inode *inode = dentry->d_inode;
         struct qstr str;
         char name[32];
@@ -322,21 +308,22 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
         if (HFSPLUS_IS_RSRC(inode))
                 return -EPERM;
  
+       mutex_lock(&sbi->vh_mutex);
         cnid = (u32)(unsigned long)dentry->d_fsdata;
         if (inode->i_ino == cnid &&
-           atomic_read(&HFSPLUS_I(inode).opencnt)) {
+           atomic_read(&HFSPLUS_I(inode)->opencnt)) {
                 str.name = name;
                 str.len = sprintf(name, "temp%lu", inode->i_ino);
                 res = hfsplus_rename_cat(inode->i_ino,
                                          dir, &dentry->d_name,
-                                        HFSPLUS_SB(sb).hidden_dir, &str);
+                                        sbi->hidden_dir, &str);
                 if (!res)
                         inode->i_flags |= S_DEAD;
-               return res;
+               goto out;
         }
         res = hfsplus_delete_cat(cnid, dir, &dentry->d_name);
         if (res)
-               return res;
+               goto out;
  
         if (inode->i_nlink > 0)
                 drop_nlink(inode);
@@ -344,10 +331,10 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
                 clear_nlink(inode);
         if (!inode->i_nlink) {
                 if (inode->i_ino != cnid) {
-                       HFSPLUS_SB(sb).file_count--;
-                       if (!atomic_read(&HFSPLUS_I(inode).opencnt)) {
+                       sbi->file_count--;
+                       if (!atomic_read(&HFSPLUS_I(inode)->opencnt)) {
                                 res = hfsplus_delete_cat(inode->i_ino,
-                                                        HFSPLUS_SB(sb).hidden_dir,
+                                                        sbi->hidden_dir,
                                                          NULL);
                                 if (!res)
                                         hfsplus_delete_inode(inode);
@@ -356,107 +343,108 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
                 } else
                         hfsplus_delete_inode(inode);
         } else
-               HFSPLUS_SB(sb).file_count--;
+               sbi->file_count--;
         inode->i_ctime = CURRENT_TIME_SEC;
         mark_inode_dirty(inode);
-
+out:
+       mutex_unlock(&sbi->vh_mutex);
         return res;
  }
  
-static int hfsplus_mkdir(struct inode *dir, struct dentry *dentry, int mode)
-{
-       struct inode *inode;
-       int res;
-
-       inode = hfsplus_new_inode(dir->i_sb, S_IFDIR | mode);
-       if (!inode)
-               return -ENOSPC;
-
-       res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
-       if (res) {
-               inode->i_nlink = 0;
-               hfsplus_delete_inode(inode);
-               iput(inode);
-               return res;
-       }
-       hfsplus_instantiate(dentry, inode, inode->i_ino);
-       mark_inode_dirty(inode);
-       return 0;
-}
-
  static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry)
  {
-       struct inode *inode;
+       struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
+       struct inode *inode = dentry->d_inode;
         int res;
  
-       inode = dentry->d_inode;
         if (inode->i_size != 2)
                 return -ENOTEMPTY;
+
+       mutex_lock(&sbi->vh_mutex);
         res = hfsplus_delete_cat(inode->i_ino, dir, &dentry->d_name);
         if (res)
-               return res;
+               goto out;
         clear_nlink(inode);
         inode->i_ctime = CURRENT_TIME_SEC;
         hfsplus_delete_inode(inode);
         mark_inode_dirty(inode);
-       return 0;
+out:
+       mutex_unlock(&sbi->vh_mutex);
+       return res;
  }
  
  static int hfsplus_symlink(struct inode *dir, struct dentry *dentry,
                            const char *symname)
  {
-       struct super_block *sb;
+       struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
         struct inode *inode;
-       int res;
+       int res = -ENOSPC;
  
-       sb = dir->i_sb;
-       inode = hfsplus_new_inode(sb, S_IFLNK | S_IRWXUGO);
+       mutex_lock(&sbi->vh_mutex);
+       inode = hfsplus_new_inode(dir->i_sb, S_IFLNK | S_IRWXUGO);
         if (!inode)
-               return -ENOSPC;
+               goto out;
  
         res = page_symlink(inode, symname, strlen(symname) + 1);
-       if (res) {
-               inode->i_nlink = 0;
-               hfsplus_delete_inode(inode);
-               iput(inode);
-               return res;
-       }
+       if (res)
+               goto out_err;
  
-       mark_inode_dirty(inode);
         res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
+       if (res)
+               goto out_err;
  
-       if (!res) {
-               hfsplus_instantiate(dentry, inode, inode->i_ino);
-               mark_inode_dirty(inode);
-       }
+       hfsplus_instantiate(dentry, inode, inode->i_ino);
+       mark_inode_dirty(inode);
+       goto out;
  
+out_err:
+       inode->i_nlink = 0;
+       hfsplus_delete_inode(inode);
+       iput(inode);
+out:
+       mutex_unlock(&sbi->vh_mutex);
         return res;
  }
  
  static int hfsplus_mknod(struct inode *dir, struct dentry *dentry,
                          int mode, dev_t rdev)
  {
-       struct super_block *sb;
+       struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
         struct inode *inode;
-       int res;
+       int res = -ENOSPC;
  
-       sb = dir->i_sb;
-       inode = hfsplus_new_inode(sb, mode);
+       mutex_lock(&sbi->vh_mutex);
+       inode = hfsplus_new_inode(dir->i_sb, mode);
         if (!inode)
-               return -ENOSPC;
+               goto out;
+
+       if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISFIFO(mode) || S_ISSOCK(mode))
+               init_special_inode(inode, mode, rdev);
  
         res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
         if (res) {
                 inode->i_nlink = 0;
                 hfsplus_delete_inode(inode);
                 iput(inode);
-               return res;
+               goto out;
         }
-       init_special_inode(inode, mode, rdev);
+
         hfsplus_instantiate(dentry, inode, inode->i_ino);
         mark_inode_dirty(inode);
+out:
+       mutex_unlock(&sbi->vh_mutex);
+       return res;
+}
  
-       return 0;
+static int hfsplus_create(struct inode *dir, struct dentry *dentry, int mode,
+                         struct nameidata *nd)
+{
+       return hfsplus_mknod(dir, dentry, mode, 0);
+}
+
+static int hfsplus_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+       return hfsplus_mknod(dir, dentry, mode | S_IFDIR, 0);
  }
  
  static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry,
@@ -466,7 +454,10 @@ static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry,
  
         /* Unlink destination if it already exists */
         if (new_dentry->d_inode) {
-               res = hfsplus_unlink(new_dir, new_dentry);
+               if (S_ISDIR(new_dentry->d_inode->i_mode))
+                       res = hfsplus_rmdir(new_dir, new_dentry);
+               else
+                       res = hfsplus_unlink(new_dir, new_dentry);
                 if (res)
                         return res;
         }
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c

index 0022eec..0c9cb18 100644 (file)
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -85,35 +85,49 @@ static u32 hfsplus_ext_lastblock(struct hfsplus_extent *ext)
  
  static void __hfsplus_ext_write_extent(struct inode *inode, struct hfs_find_data *fd)
  {
+       struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
         int res;
  
-       hfsplus_ext_build_key(fd->search_key, inode->i_ino, HFSPLUS_I(inode).cached_start,
-                             HFSPLUS_IS_RSRC(inode) ?  HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA);
+       WARN_ON(!mutex_is_locked(&hip->extents_lock));
+
+       hfsplus_ext_build_key(fd->search_key, inode->i_ino, hip->cached_start,
+                             HFSPLUS_IS_RSRC(inode) ?
+                               HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA);
+
         res = hfs_brec_find(fd);
-       if (HFSPLUS_I(inode).flags & HFSPLUS_FLG_EXT_NEW) {
+       if (hip->flags & HFSPLUS_FLG_EXT_NEW) {
                 if (res != -ENOENT)
                         return;
-               hfs_brec_insert(fd, HFSPLUS_I(inode).cached_extents, sizeof(hfsplus_extent_rec));
-               HFSPLUS_I(inode).flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
+               hfs_brec_insert(fd, hip->cached_extents,
+                               sizeof(hfsplus_extent_rec));
+               hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
         } else {
                 if (res)
                         return;
-               hfs_bnode_write(fd->bnode, HFSPLUS_I(inode).cached_extents, fd->entryoffset, fd->entrylength);
-               HFSPLUS_I(inode).flags &= ~HFSPLUS_FLG_EXT_DIRTY;
+               hfs_bnode_write(fd->bnode, hip->cached_extents,
+                               fd->entryoffset, fd->entrylength);
+               hip->flags &= ~HFSPLUS_FLG_EXT_DIRTY;
         }
  }
  
-void hfsplus_ext_write_extent(struct inode *inode)
+static void hfsplus_ext_write_extent_locked(struct inode *inode)
  {
-       if (HFSPLUS_I(inode).flags & HFSPLUS_FLG_EXT_DIRTY) {
+       if (HFSPLUS_I(inode)->flags & HFSPLUS_FLG_EXT_DIRTY) {
                 struct hfs_find_data fd;
  
-               hfs_find_init(HFSPLUS_SB(inode->i_sb).ext_tree, &fd);
+               hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd);
                 __hfsplus_ext_write_extent(inode, &fd);
                 hfs_find_exit(&fd);
         }
  }
  
+void hfsplus_ext_write_extent(struct inode *inode)
+{
+       mutex_lock(&HFSPLUS_I(inode)->extents_lock);
+       hfsplus_ext_write_extent_locked(inode);
+       mutex_unlock(&HFSPLUS_I(inode)->extents_lock);
+}
+
  static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd,
                                             struct hfsplus_extent *extent,
                                             u32 cnid, u32 block, u8 type)
@@ -136,33 +150,39 @@ static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd,
  
  static inline int __hfsplus_ext_cache_extent(struct hfs_find_data *fd, struct inode *inode, u32 block)
  {
+       struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
         int res;
  
-       if (HFSPLUS_I(inode).flags & HFSPLUS_FLG_EXT_DIRTY)
+       WARN_ON(!mutex_is_locked(&hip->extents_lock));
+
+       if (hip->flags & HFSPLUS_FLG_EXT_DIRTY)
                 __hfsplus_ext_write_extent(inode, fd);
  
-       res = __hfsplus_ext_read_extent(fd, HFSPLUS_I(inode).cached_extents, inode->i_ino,
-                                       block, HFSPLUS_IS_RSRC(inode) ? HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA);
+       res = __hfsplus_ext_read_extent(fd, hip->cached_extents, inode->i_ino,
+                                       block, HFSPLUS_IS_RSRC(inode) ?
+                                               HFSPLUS_TYPE_RSRC :
+                                               HFSPLUS_TYPE_DATA);
         if (!res) {
-               HFSPLUS_I(inode).cached_start = be32_to_cpu(fd->key->ext.start_block);
-               HFSPLUS_I(inode).cached_blocks = hfsplus_ext_block_count(HFSPLUS_I(inode).cached_extents);
+               hip->cached_start = be32_to_cpu(fd->key->ext.start_block);
+               hip->cached_blocks = hfsplus_ext_block_count(hip->cached_extents);
         } else {
-               HFSPLUS_I(inode).cached_start = HFSPLUS_I(inode).cached_blocks = 0;
-               HFSPLUS_I(inode).flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
+               hip->cached_start = hip->cached_blocks = 0;
+               hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
         }
         return res;
  }
  
  static int hfsplus_ext_read_extent(struct inode *inode, u32 block)
  {
+       struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
         struct hfs_find_data fd;
         int res;
  
-       if (block >= HFSPLUS_I(inode).cached_start &&
-           block < HFSPLUS_I(inode).cached_start + HFSPLUS_I(inode).cached_blocks)
+       if (block >= hip->cached_start &&
+           block < hip->cached_start + hip->cached_blocks)
                 return 0;
  
-       hfs_find_init(HFSPLUS_SB(inode->i_sb).ext_tree, &fd);
+       hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd);
         res = __hfsplus_ext_cache_extent(&fd, inode, block);
         hfs_find_exit(&fd);
         return res;
@@ -172,21 +192,21 @@ static int hfsplus_ext_read_extent(struct inode *inode, u32 block)
  int hfsplus_get_block(struct inode *inode, sector_t iblock,
                       struct buffer_head *bh_result, int create)
  {
-       struct super_block *sb;
+       struct super_block *sb = inode->i_sb;
+       struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+       struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
         int res = -EIO;
         u32 ablock, dblock, mask;
         int shift;
  
-       sb = inode->i_sb;
-
         /* Convert inode block to disk allocation block */
-       shift = HFSPLUS_SB(sb).alloc_blksz_shift - sb->s_blocksize_bits;
-       ablock = iblock >> HFSPLUS_SB(sb).fs_shift;
+       shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits;
+       ablock = iblock >> sbi->fs_shift;
  
-       if (iblock >= HFSPLUS_I(inode).fs_blocks) {
-               if (iblock > HFSPLUS_I(inode).fs_blocks || !create)
+       if (iblock >= hip->fs_blocks) {
+               if (iblock > hip->fs_blocks || !create)
                         return -EIO;
-               if (ablock >= HFSPLUS_I(inode).alloc_blocks) {
+               if (ablock >= hip->alloc_blocks) {
                         res = hfsplus_file_extend(inode);
                         if (res)
                                 return res;
@@ -194,33 +214,33 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock,
         } else
                 create = 0;
  
-       if (ablock < HFSPLUS_I(inode).first_blocks) {
-               dblock = hfsplus_ext_find_block(HFSPLUS_I(inode).first_extents, ablock);
+       if (ablock < hip->first_blocks) {
+               dblock = hfsplus_ext_find_block(hip->first_extents, ablock);
                 goto done;
         }
  
         if (inode->i_ino == HFSPLUS_EXT_CNID)
                 return -EIO;
  
-       mutex_lock(&HFSPLUS_I(inode).extents_lock);
+       mutex_lock(&hip->extents_lock);
         res = hfsplus_ext_read_extent(inode, ablock);
         if (!res) {
-               dblock = hfsplus_ext_find_block(HFSPLUS_I(inode).cached_extents, ablock -
-                                            HFSPLUS_I(inode).cached_start);
+               dblock = hfsplus_ext_find_block(hip->cached_extents,
+                                               ablock - hip->cached_start);
         } else {
-               mutex_unlock(&HFSPLUS_I(inode).extents_lock);
+               mutex_unlock(&hip->extents_lock);
                 return -EIO;
         }
-       mutex_unlock(&HFSPLUS_I(inode).extents_lock);
+       mutex_unlock(&hip->extents_lock);
  
  done:
         dprint(DBG_EXTENT, "get_block(%lu): %llu - %u\n", inode->i_ino, (long long)iblock, dblock);
-       mask = (1 << HFSPLUS_SB(sb).fs_shift) - 1;
-       map_bh(bh_result, sb, (dblock << HFSPLUS_SB(sb).fs_shift) + HFSPLUS_SB(sb).blockoffset + (iblock & mask));
+       mask = (1 << sbi->fs_shift) - 1;
+       map_bh(bh_result, sb, (dblock << sbi->fs_shift) + sbi->blockoffset + (iblock & mask));
         if (create) {
                 set_buffer_new(bh_result);
-               HFSPLUS_I(inode).phys_size += sb->s_blocksize;
-               HFSPLUS_I(inode).fs_blocks++;
+               hip->phys_size += sb->s_blocksize;
+               hip->fs_blocks++;
                 inode_add_bytes(inode, sb->s_blocksize);
                 mark_inode_dirty(inode);
         }
@@ -327,7 +347,7 @@ int hfsplus_free_fork(struct super_block *sb, u32 cnid, struct hfsplus_fork_raw
         if (total_blocks == blocks)
                 return 0;
  
-       hfs_find_init(HFSPLUS_SB(sb).ext_tree, &fd);
+       hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd);
         do {
                 res = __hfsplus_ext_read_extent(&fd, ext_entry, cnid,
                                                 total_blocks, type);
@@ -348,29 +368,33 @@ int hfsplus_free_fork(struct super_block *sb, u32 cnid, struct hfsplus_fork_raw
  int hfsplus_file_extend(struct inode *inode)
  {
         struct super_block *sb = inode->i_sb;
+       struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+       struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
         u32 start, len, goal;
         int res;
  
-       if (HFSPLUS_SB(sb).alloc_file->i_size * 8 < HFSPLUS_SB(sb).total_blocks - HFSPLUS_SB(sb).free_blocks + 8) {
+       if (sbi->alloc_file->i_size * 8 <
+           sbi->total_blocks - sbi->free_blocks + 8) {
                 // extend alloc file
-               printk(KERN_ERR "hfs: extend alloc file! (%Lu,%u,%u)\n", HFSPLUS_SB(sb).alloc_file->i_size * 8,
-                       HFSPLUS_SB(sb).total_blocks, HFSPLUS_SB(sb).free_blocks);
+               printk(KERN_ERR "hfs: extend alloc file! (%Lu,%u,%u)\n",
+                               sbi->alloc_file->i_size * 8,
+                               sbi->total_blocks, sbi->free_blocks);
                 return -ENOSPC;
         }
  
-       mutex_lock(&HFSPLUS_I(inode).extents_lock);
-       if (HFSPLUS_I(inode).alloc_blocks == HFSPLUS_I(inode).first_blocks)
-               goal = hfsplus_ext_lastblock(HFSPLUS_I(inode).first_extents);
+       mutex_lock(&hip->extents_lock);
+       if (hip->alloc_blocks == hip->first_blocks)
+               goal = hfsplus_ext_lastblock(hip->first_extents);
         else {
-               res = hfsplus_ext_read_extent(inode, HFSPLUS_I(inode).alloc_blocks);
+               res = hfsplus_ext_read_extent(inode, hip->alloc_blocks);
                 if (res)
                         goto out;
-               goal = hfsplus_ext_lastblock(HFSPLUS_I(inode).cached_extents);
+               goal = hfsplus_ext_lastblock(hip->cached_extents);
         }
  
-       len = HFSPLUS_I(inode).clump_blocks;
-       start = hfsplus_block_allocate(sb, HFSPLUS_SB(sb).total_blocks, goal, &len);
-       if (start >= HFSPLUS_SB(sb).total_blocks) {
+       len = hip->clump_blocks;
+       start = hfsplus_block_allocate(sb, sbi->total_blocks, goal, &len);
+       if (start >= sbi->total_blocks) {
                 start = hfsplus_block_allocate(sb, goal, 0, &len);
                 if (start >= goal) {
                         res = -ENOSPC;
@@ -379,56 +403,56 @@ int hfsplus_file_extend(struct inode *inode)
         }
  
         dprint(DBG_EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len);
-       if (HFSPLUS_I(inode).alloc_blocks <= HFSPLUS_I(inode).first_blocks) {
-               if (!HFSPLUS_I(inode).first_blocks) {
+
+       if (hip->alloc_blocks <= hip->first_blocks) {
+               if (!hip->first_blocks) {
                         dprint(DBG_EXTENT, "first extents\n");
                         /* no extents yet */
-                       HFSPLUS_I(inode).first_extents[0].start_block = cpu_to_be32(start);
-                       HFSPLUS_I(inode).first_extents[0].block_count = cpu_to_be32(len);
+                       hip->first_extents[0].start_block = cpu_to_be32(start);
+                       hip->first_extents[0].block_count = cpu_to_be32(len);
                         res = 0;
                 } else {
                         /* try to append to extents in inode */
-                       res = hfsplus_add_extent(HFSPLUS_I(inode).first_extents,
-                                                HFSPLUS_I(inode).alloc_blocks,
+                       res = hfsplus_add_extent(hip->first_extents,
+                                                hip->alloc_blocks,
                                                  start, len);
                         if (res == -ENOSPC)
                                 goto insert_extent;
                 }
                 if (!res) {
-                       hfsplus_dump_extent(HFSPLUS_I(inode).first_extents);
-                       HFSPLUS_I(inode).first_blocks += len;
+                       hfsplus_dump_extent(hip->first_extents);
+                       hip->first_blocks += len;
                 }
         } else {
-               res = hfsplus_add_extent(HFSPLUS_I(inode).cached_extents,
-                                        HFSPLUS_I(inode).alloc_blocks -
-                                        HFSPLUS_I(inode).cached_start,
+               res = hfsplus_add_extent(hip->cached_extents,
+                                        hip->alloc_blocks - hip->cached_start,
                                          start, len);
                 if (!res) {
-                       hfsplus_dump_extent(HFSPLUS_I(inode).cached_extents);
-                       HFSPLUS_I(inode).flags |= HFSPLUS_FLG_EXT_DIRTY;
-                       HFSPLUS_I(inode).cached_blocks += len;
+                       hfsplus_dump_extent(hip->cached_extents);
+                       hip->flags |= HFSPLUS_FLG_EXT_DIRTY;
+                       hip->cached_blocks += len;
                 } else if (res == -ENOSPC)
                         goto insert_extent;
         }
  out:
-       mutex_unlock(&HFSPLUS_I(inode).extents_lock);
+       mutex_unlock(&hip->extents_lock);
         if (!res) {
-               HFSPLUS_I(inode).alloc_blocks += len;
+               hip->alloc_blocks += len;
                 mark_inode_dirty(inode);
         }
         return res;
  
  insert_extent:
         dprint(DBG_EXTENT, "insert new extent\n");
-       hfsplus_ext_write_extent(inode);
+       hfsplus_ext_write_extent_locked(inode);
  
-       memset(HFSPLUS_I(inode).cached_extents, 0, sizeof(hfsplus_extent_rec));
-       HFSPLUS_I(inode).cached_extents[0].start_block = cpu_to_be32(start);
-       HFSPLUS_I(inode).cached_extents[0].block_count = cpu_to_be32(len);
-       hfsplus_dump_extent(HFSPLUS_I(inode).cached_extents);
-       HFSPLUS_I(inode).flags |= HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW;
-       HFSPLUS_I(inode).cached_start = HFSPLUS_I(inode).alloc_blocks;
-       HFSPLUS_I(inode).cached_blocks = len;
+       memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
+       hip->cached_extents[0].start_block = cpu_to_be32(start);
+       hip->cached_extents[0].block_count = cpu_to_be32(len);
+       hfsplus_dump_extent(hip->cached_extents);
+       hip->flags |= HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW;
+       hip->cached_start = hip->alloc_blocks;
+       hip->cached_blocks = len;
  
         res = 0;
         goto out;
@@ -437,13 +461,15 @@ insert_extent:
  void hfsplus_file_truncate(struct inode *inode)
  {
         struct super_block *sb = inode->i_sb;
+       struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
         struct hfs_find_data fd;
         u32 alloc_cnt, blk_cnt, start;
         int res;
  
-       dprint(DBG_INODE, "truncate: %lu, %Lu -> %Lu\n", inode->i_ino,
-              (long long)HFSPLUS_I(inode).phys_size, inode->i_size);
-       if (inode->i_size > HFSPLUS_I(inode).phys_size) {
+       dprint(DBG_INODE, "truncate: %lu, %Lu -> %Lu\n",
+               inode->i_ino, (long long)hip->phys_size, inode->i_size);
+
+       if (inode->i_size > hip->phys_size) {
                 struct address_space *mapping = inode->i_mapping;
                 struct page *page;
                 void *fsdata;
@@ -460,47 +486,48 @@ void hfsplus_file_truncate(struct inode *inode)
                         return;
                 mark_inode_dirty(inode);
                 return;
-       } else if (inode->i_size == HFSPLUS_I(inode).phys_size)
+       } else if (inode->i_size == hip->phys_size)
                 return;
  
-       blk_cnt = (inode->i_size + HFSPLUS_SB(sb).alloc_blksz - 1) >> HFSPLUS_SB(sb).alloc_blksz_shift;
-       alloc_cnt = HFSPLUS_I(inode).alloc_blocks;
+       blk_cnt = (inode->i_size + HFSPLUS_SB(sb)->alloc_blksz - 1) >>
+                       HFSPLUS_SB(sb)->alloc_blksz_shift;
+       alloc_cnt = hip->alloc_blocks;
         if (blk_cnt == alloc_cnt)
                 goto out;
  
-       mutex_lock(&HFSPLUS_I(inode).extents_lock);
-       hfs_find_init(HFSPLUS_SB(sb).ext_tree, &fd);
+       mutex_lock(&hip->extents_lock);
+       hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd);
         while (1) {
-               if (alloc_cnt == HFSPLUS_I(inode).first_blocks) {
-                       hfsplus_free_extents(sb, HFSPLUS_I(inode).first_extents,
+               if (alloc_cnt == hip->first_blocks) {
+                       hfsplus_free_extents(sb, hip->first_extents,
                                              alloc_cnt, alloc_cnt - blk_cnt);
-                       hfsplus_dump_extent(HFSPLUS_I(inode).first_extents);
-                       HFSPLUS_I(inode).first_blocks = blk_cnt;
+                       hfsplus_dump_extent(hip->first_extents);
+                       hip->first_blocks = blk_cnt;
                         break;
                 }
                 res = __hfsplus_ext_cache_extent(&fd, inode, alloc_cnt);
                 if (res)
                         break;
-               start = HFSPLUS_I(inode).cached_start;
-               hfsplus_free_extents(sb, HFSPLUS_I(inode).cached_extents,
+               start = hip->cached_start;
+               hfsplus_free_extents(sb, hip->cached_extents,
                                      alloc_cnt - start, alloc_cnt - blk_cnt);
-               hfsplus_dump_extent(HFSPLUS_I(inode).cached_extents);
+               hfsplus_dump_extent(hip->cached_extents);
                 if (blk_cnt > start) {
-                       HFSPLUS_I(inode).flags |= HFSPLUS_FLG_EXT_DIRTY;
+                       hip->flags |= HFSPLUS_FLG_EXT_DIRTY;
                         break;
                 }
                 alloc_cnt = start;
-               HFSPLUS_I(inode).cached_start = HFSPLUS_I(inode).cached_blocks = 0;
-               HFSPLUS_I(inode).flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
+               hip->cached_start = hip->cached_blocks = 0;
+               hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
                 hfs_brec_remove(&fd);
         }
         hfs_find_exit(&fd);
-       mutex_unlock(&HFSPLUS_I(inode).extents_lock);
+       mutex_unlock(&hip->extents_lock);
  
-       HFSPLUS_I(inode).alloc_blocks = blk_cnt;
+       hip->alloc_blocks = blk_cnt;
  out:
-       HFSPLUS_I(inode).phys_size = inode->i_size;
-       HFSPLUS_I(inode).fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
-       inode_set_bytes(inode, HFSPLUS_I(inode).fs_blocks << sb->s_blocksize_bits);
+       hip->phys_size = inode->i_size;
+       hip->fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
+       inode_set_bytes(inode, hip->fs_blocks << sb->s_blocksize_bits);
         mark_inode_dirty(inode);
  }
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h

index dc856be..cb3653e 100644 (file)
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -62,7 +62,7 @@ struct hfs_btree {
         unsigned int depth;
  
         //unsigned int map1_size, map_size;
-       struct semaphore tree_lock;
+       struct mutex tree_lock;
  
         unsigned int pages_per_bnode;
         spinlock_t hash_lock;
@@ -121,16 +121,21 @@ struct hfsplus_sb_info {
         u32 sect_count;
         int fs_shift;
  
-       /* Stuff in host order from Vol Header */
+       /* immutable data from the volume header */
         u32 alloc_blksz;
         int alloc_blksz_shift;
         u32 total_blocks;
+       u32 data_clump_blocks, rsrc_clump_blocks;
+
+       /* mutable data from the volume header, protected by alloc_mutex */
         u32 free_blocks;
-       u32 next_alloc;
+       struct mutex alloc_mutex;
+
+       /* mutable data from the volume header, protected by vh_mutex */
         u32 next_cnid;
         u32 file_count;
         u32 folder_count;
-       u32 data_clump_blocks, rsrc_clump_blocks;
+       struct mutex vh_mutex;
  
         /* Config options */
         u32 creator;
@@ -143,40 +148,50 @@ struct hfsplus_sb_info {
         int part, session;
  
         unsigned long flags;
-
-       struct hlist_head rsrc_inodes;
  };
  
-#define HFSPLUS_SB_WRITEBACKUP 0x0001
-#define HFSPLUS_SB_NODECOMPOSE 0x0002
-#define HFSPLUS_SB_FORCE       0x0004
-#define HFSPLUS_SB_HFSX                0x0008
-#define HFSPLUS_SB_CASEFOLD    0x0010
+#define HFSPLUS_SB_WRITEBACKUP 0
+#define HFSPLUS_SB_NODECOMPOSE 1
+#define HFSPLUS_SB_FORCE       2
+#define HFSPLUS_SB_HFSX                3
+#define HFSPLUS_SB_CASEFOLD    4
  
  
  struct hfsplus_inode_info {
-       struct mutex extents_lock;
-       u32 clump_blocks, alloc_blocks;
-       sector_t fs_blocks;
-       /* Allocation extents from catalog record or volume header */
-       hfsplus_extent_rec first_extents;
-       u32 first_blocks;
-       hfsplus_extent_rec cached_extents;
-       u32 cached_start, cached_blocks;
         atomic_t opencnt;
  
-       struct inode *rsrc_inode;
+       /*
+        * Extent allocation information, protected by extents_lock.
+        */
+       u32 first_blocks;
+       u32 clump_blocks;
+       u32 alloc_blocks;
+       u32 cached_start;
+       u32 cached_blocks;
+       hfsplus_extent_rec first_extents;
+       hfsplus_extent_rec cached_extents;
         unsigned long flags;
+       struct mutex extents_lock;
  
+       /*
+        * Immutable data.
+        */
+       struct inode *rsrc_inode;
         __be32 create_date;
-       /* Device number in hfsplus_permissions in catalog */
-       u32 dev;
-       /* BSD system and user file flags */
-       u8 rootflags;
-       u8 userflags;
  
+       /*
+        * Protected by sbi->vh_mutex.
+        */
+       u32 linkid;
+
+       /*
+        * Protected by i_mutex.
+        */
+       sector_t fs_blocks;
+       u8 userflags;           /* BSD user file flags */
         struct list_head open_dir_list;
         loff_t phys_size;
+
         struct inode vfs_inode;
  };
  
@@ -184,8 +199,8 @@ struct hfsplus_inode_info {
  #define HFSPLUS_FLG_EXT_DIRTY  0x0002
  #define HFSPLUS_FLG_EXT_NEW    0x0004
  
-#define HFSPLUS_IS_DATA(inode)   (!(HFSPLUS_I(inode).flags & HFSPLUS_FLG_RSRC))
-#define HFSPLUS_IS_RSRC(inode)   (HFSPLUS_I(inode).flags & HFSPLUS_FLG_RSRC)
+#define HFSPLUS_IS_DATA(inode)   (!(HFSPLUS_I(inode)->flags & HFSPLUS_FLG_RSRC))
+#define HFSPLUS_IS_RSRC(inode)   (HFSPLUS_I(inode)->flags & HFSPLUS_FLG_RSRC)
  
  struct hfs_find_data {
         /* filled by caller */
@@ -311,6 +326,7 @@ int hfsplus_create_cat(u32, struct inode *, struct qstr *, struct inode *);
  int hfsplus_delete_cat(u32, struct inode *, struct qstr *);
  int hfsplus_rename_cat(u32, struct inode *, struct qstr *,
                        struct inode *, struct qstr *);
+void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms);
  
  /* dir.c */
  extern const struct inode_operations hfsplus_dir_inode_operations;
@@ -372,26 +388,15 @@ int hfsplus_read_wrapper(struct super_block *);
  int hfs_part_find(struct super_block *, sector_t *, sector_t *);
  
  /* access macros */
-/*
  static inline struct hfsplus_sb_info *HFSPLUS_SB(struct super_block *sb)
  {
         return sb->s_fs_info;
  }
+
  static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode)
  {
         return list_entry(inode, struct hfsplus_inode_info, vfs_inode);
  }
-*/
-#define HFSPLUS_SB(super)      (*(struct hfsplus_sb_info *)(super)->s_fs_info)
-#define HFSPLUS_I(inode)       (*list_entry(inode, struct hfsplus_inode_info, vfs_inode))
-
-#if 1
-#define hfsplus_kmap(p)                ({ struct page *__p = (p); kmap(__p); })
-#define hfsplus_kunmap(p)      ({ struct page *__p = (p); kunmap(__p); __p; })
-#else
-#define hfsplus_kmap(p)                kmap(p)
-#define hfsplus_kunmap(p)      kunmap(p)
-#endif
  
  #define sb_bread512(sb, sec, data) ({                  \
         struct buffer_head *__bh;                       \
@@ -419,6 +424,4 @@ static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode)
  #define hfsp_ut2mt(t)          __hfsp_ut2mt((t).tv_sec)
  #define hfsp_now2mt()          __hfsp_ut2mt(get_seconds())
  
-#define kdev_t_to_nr(x)                (x)
-
  #endif
diff --git a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h

index fe99fe8..6892899 100644 (file)
--- a/fs/hfsplus/hfsplus_raw.h
+++ b/fs/hfsplus/hfsplus_raw.h
@@ -200,6 +200,7 @@ struct hfsplus_cat_key {
         struct hfsplus_unistr name;
  } __packed;
  
+#define HFSPLUS_CAT_KEYLEN     (sizeof(struct hfsplus_cat_key))
  
  /* Structs from hfs.h */
  struct hfsp_point {
@@ -323,7 +324,7 @@ struct hfsplus_ext_key {
         __be32 start_block;
  } __packed;
  
-#define HFSPLUS_EXT_KEYLEN 12
+#define HFSPLUS_EXT_KEYLEN     sizeof(struct hfsplus_ext_key)
  
  /* HFS+ generic BTree key */
  typedef union {
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c

index c5a979d..7844928 100644 (file)
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -36,7 +36,7 @@ static int hfsplus_write_begin(struct file *file, struct address_space *mapping,
         *pagep = NULL;
         ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
                                 hfsplus_get_block,
-                               &HFSPLUS_I(mapping->host).phys_size);
+                               &HFSPLUS_I(mapping->host)->phys_size);
         if (unlikely(ret)) {
                 loff_t isize = mapping->host->i_size;
                 if (pos + len > isize)
@@ -62,13 +62,13 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask)
  
         switch (inode->i_ino) {
         case HFSPLUS_EXT_CNID:
-               tree = HFSPLUS_SB(sb).ext_tree;
+               tree = HFSPLUS_SB(sb)->ext_tree;
                 break;
         case HFSPLUS_CAT_CNID:
-               tree = HFSPLUS_SB(sb).cat_tree;
+               tree = HFSPLUS_SB(sb)->cat_tree;
                 break;
         case HFSPLUS_ATTR_CNID:
-               tree = HFSPLUS_SB(sb).attr_tree;
+               tree = HFSPLUS_SB(sb)->attr_tree;
                 break;
         default:
                 BUG();
@@ -172,12 +172,13 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent
         struct hfs_find_data fd;
         struct super_block *sb = dir->i_sb;
         struct inode *inode = NULL;
+       struct hfsplus_inode_info *hip;
         int err;
  
         if (HFSPLUS_IS_RSRC(dir) || strcmp(dentry->d_name.name, "rsrc"))
                 goto out;
  
-       inode = HFSPLUS_I(dir).rsrc_inode;
+       inode = HFSPLUS_I(dir)->rsrc_inode;
         if (inode)
                 goto out;
  
@@ -185,12 +186,13 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent
         if (!inode)
                 return ERR_PTR(-ENOMEM);
  
+       hip = HFSPLUS_I(inode);
         inode->i_ino = dir->i_ino;
-       INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list);
-       mutex_init(&HFSPLUS_I(inode).extents_lock);
-       HFSPLUS_I(inode).flags = HFSPLUS_FLG_RSRC;
+       INIT_LIST_HEAD(&hip->open_dir_list);
+       mutex_init(&hip->extents_lock);
+       hip->flags = HFSPLUS_FLG_RSRC;
  
-       hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
+       hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
         err = hfsplus_find_cat(sb, dir->i_ino, &fd);
         if (!err)
                 err = hfsplus_cat_read_inode(inode, &fd);
@@ -199,10 +201,18 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent
                 iput(inode);
                 return ERR_PTR(err);
         }
-       HFSPLUS_I(inode).rsrc_inode = dir;
-       HFSPLUS_I(dir).rsrc_inode = inode;
+       hip->rsrc_inode = dir;
+       HFSPLUS_I(dir)->rsrc_inode = inode;
         igrab(dir);
-       hlist_add_head(&inode->i_hash, &HFSPLUS_SB(sb).rsrc_inodes);
+
+       /*
+        * __mark_inode_dirty expects inodes to be hashed.  Since we don't
+        * want resource fork inodes in the regular inode space, we make them
+        * appear hashed, but do not put on any lists.  hlist_del()
+        * will work fine and require no locking.
+        */
+       inode->i_hash.pprev = &inode->i_hash.next;
+
         mark_inode_dirty(inode);
  out:
         d_add(dentry, inode);
@@ -211,30 +221,27 @@ out:
  
  static void hfsplus_get_perms(struct inode *inode, struct hfsplus_perm *perms, int dir)
  {
-       struct super_block *sb = inode->i_sb;
+       struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
         u16 mode;
  
         mode = be16_to_cpu(perms->mode);
  
         inode->i_uid = be32_to_cpu(perms->owner);
         if (!inode->i_uid && !mode)
-               inode->i_uid = HFSPLUS_SB(sb).uid;
+               inode->i_uid = sbi->uid;
  
         inode->i_gid = be32_to_cpu(perms->group);
         if (!inode->i_gid && !mode)
-               inode->i_gid = HFSPLUS_SB(sb).gid;
+               inode->i_gid = sbi->gid;
  
         if (dir) {
-               mode = mode ? (mode & S_IALLUGO) :
-                       (S_IRWXUGO & ~(HFSPLUS_SB(sb).umask));
+               mode = mode ? (mode & S_IALLUGO) : (S_IRWXUGO & ~(sbi->umask));
                 mode |= S_IFDIR;
         } else if (!mode)
-               mode = S_IFREG | ((S_IRUGO|S_IWUGO) &
-                       ~(HFSPLUS_SB(sb).umask));
+               mode = S_IFREG | ((S_IRUGO|S_IWUGO) & ~(sbi->umask));
         inode->i_mode = mode;
  
-       HFSPLUS_I(inode).rootflags = perms->rootflags;
-       HFSPLUS_I(inode).userflags = perms->userflags;
+       HFSPLUS_I(inode)->userflags = perms->userflags;
         if (perms->rootflags & HFSPLUS_FLG_IMMUTABLE)
                 inode->i_flags |= S_IMMUTABLE;
         else
@@ -245,30 +252,13 @@ static void hfsplus_get_perms(struct inode *inode, struct hfsplus_perm *perms, i
                 inode->i_flags &= ~S_APPEND;
  }
  
-static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms)
-{
-       if (inode->i_flags & S_IMMUTABLE)
-               perms->rootflags |= HFSPLUS_FLG_IMMUTABLE;
-       else
-               perms->rootflags &= ~HFSPLUS_FLG_IMMUTABLE;
-       if (inode->i_flags & S_APPEND)
-               perms->rootflags |= HFSPLUS_FLG_APPEND;
-       else
-               perms->rootflags &= ~HFSPLUS_FLG_APPEND;
-       perms->userflags = HFSPLUS_I(inode).userflags;
-       perms->mode = cpu_to_be16(inode->i_mode);
-       perms->owner = cpu_to_be32(inode->i_uid);
-       perms->group = cpu_to_be32(inode->i_gid);
-       perms->dev = cpu_to_be32(HFSPLUS_I(inode).dev);
-}
-
  static int hfsplus_file_open(struct inode *inode, struct file *file)
  {
         if (HFSPLUS_IS_RSRC(inode))
-               inode = HFSPLUS_I(inode).rsrc_inode;
+               inode = HFSPLUS_I(inode)->rsrc_inode;
         if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
                 return -EOVERFLOW;
-       atomic_inc(&HFSPLUS_I(inode).opencnt);
+       atomic_inc(&HFSPLUS_I(inode)->opencnt);
         return 0;
  }
  
@@ -277,12 +267,13 @@ static int hfsplus_file_release(struct inode *inode, struct file *file)
         struct super_block *sb = inode->i_sb;
  
         if (HFSPLUS_IS_RSRC(inode))
-               inode = HFSPLUS_I(inode).rsrc_inode;
-       if (atomic_dec_and_test(&HFSPLUS_I(inode).opencnt)) {
+               inode = HFSPLUS_I(inode)->rsrc_inode;
+       if (atomic_dec_and_test(&HFSPLUS_I(inode)->opencnt)) {
                 mutex_lock(&inode->i_mutex);
                 hfsplus_file_truncate(inode);
                 if (inode->i_flags & S_DEAD) {
-                       hfsplus_delete_cat(inode->i_ino, HFSPLUS_SB(sb).hidden_dir, NULL);
+                       hfsplus_delete_cat(inode->i_ino,
+                                          HFSPLUS_SB(sb)->hidden_dir, NULL);
                         hfsplus_delete_inode(inode);
                 }
                 mutex_unlock(&inode->i_mutex);
@@ -361,47 +352,52 @@ static const struct file_operations hfsplus_file_operations = {
  
  struct inode *hfsplus_new_inode(struct super_block *sb, int mode)
  {
+       struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
         struct inode *inode = new_inode(sb);
+       struct hfsplus_inode_info *hip;
+
         if (!inode)
                 return NULL;
  
-       inode->i_ino = HFSPLUS_SB(sb).next_cnid++;
+       inode->i_ino = sbi->next_cnid++;
         inode->i_mode = mode;
         inode->i_uid = current_fsuid();
         inode->i_gid = current_fsgid();
         inode->i_nlink = 1;
         inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
-       INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list);
-       mutex_init(&HFSPLUS_I(inode).extents_lock);
-       atomic_set(&HFSPLUS_I(inode).opencnt, 0);
-       HFSPLUS_I(inode).flags = 0;
-       memset(HFSPLUS_I(inode).first_extents, 0, sizeof(hfsplus_extent_rec));
-       memset(HFSPLUS_I(inode).cached_extents, 0, sizeof(hfsplus_extent_rec));
-       HFSPLUS_I(inode).alloc_blocks = 0;
-       HFSPLUS_I(inode).first_blocks = 0;
-       HFSPLUS_I(inode).cached_start = 0;
-       HFSPLUS_I(inode).cached_blocks = 0;
-       HFSPLUS_I(inode).phys_size = 0;
-       HFSPLUS_I(inode).fs_blocks = 0;
-       HFSPLUS_I(inode).rsrc_inode = NULL;
+
+       hip = HFSPLUS_I(inode);
+       INIT_LIST_HEAD(&hip->open_dir_list);
+       mutex_init(&hip->extents_lock);
+       atomic_set(&hip->opencnt, 0);
+       hip->flags = 0;
+       memset(hip->first_extents, 0, sizeof(hfsplus_extent_rec));
+       memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
+       hip->alloc_blocks = 0;
+       hip->first_blocks = 0;
+       hip->cached_start = 0;
+       hip->cached_blocks = 0;
+       hip->phys_size = 0;
+       hip->fs_blocks = 0;
+       hip->rsrc_inode = NULL;
         if (S_ISDIR(inode->i_mode)) {
                 inode->i_size = 2;
-               HFSPLUS_SB(sb).folder_count++;
+               sbi->folder_count++;
                 inode->i_op = &hfsplus_dir_inode_operations;
                 inode->i_fop = &hfsplus_dir_operations;
         } else if (S_ISREG(inode->i_mode)) {
-               HFSPLUS_SB(sb).file_count++;
+               sbi->file_count++;
                 inode->i_op = &hfsplus_file_inode_operations;
                 inode->i_fop = &hfsplus_file_operations;
                 inode->i_mapping->a_ops = &hfsplus_aops;
-               HFSPLUS_I(inode).clump_blocks = HFSPLUS_SB(sb).data_clump_blocks;
+               hip->clump_blocks = sbi->data_clump_blocks;
         } else if (S_ISLNK(inode->i_mode)) {
-               HFSPLUS_SB(sb).file_count++;
+               sbi->file_count++;
                 inode->i_op = &page_symlink_inode_operations;
                 inode->i_mapping->a_ops = &hfsplus_aops;
-               HFSPLUS_I(inode).clump_blocks = 1;
+               hip->clump_blocks = 1;
         } else
-               HFSPLUS_SB(sb).file_count++;
+               sbi->file_count++;
         insert_inode_hash(inode);
         mark_inode_dirty(inode);
         sb->s_dirt = 1;
@@ -414,11 +410,11 @@ void hfsplus_delete_inode(struct inode *inode)
         struct super_block *sb = inode->i_sb;
  
         if (S_ISDIR(inode->i_mode)) {
-               HFSPLUS_SB(sb).folder_count--;
+               HFSPLUS_SB(sb)->folder_count--;
                 sb->s_dirt = 1;
                 return;
         }
-       HFSPLUS_SB(sb).file_count--;
+       HFSPLUS_SB(sb)->file_count--;
         if (S_ISREG(inode->i_mode)) {
                 if (!inode->i_nlink) {
                         inode->i_size = 0;
@@ -434,34 +430,39 @@ void hfsplus_delete_inode(struct inode *inode)
  void hfsplus_inode_read_fork(struct inode *inode, struct hfsplus_fork_raw *fork)
  {
         struct super_block *sb = inode->i_sb;
+       struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+       struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
         u32 count;
         int i;
  
-       memcpy(&HFSPLUS_I(inode).first_extents, &fork->extents,
-              sizeof(hfsplus_extent_rec));
+       memcpy(&hip->first_extents, &fork->extents, sizeof(hfsplus_extent_rec));
         for (count = 0, i = 0; i < 8; i++)
                 count += be32_to_cpu(fork->extents[i].block_count);
-       HFSPLUS_I(inode).first_blocks = count;
-       memset(HFSPLUS_I(inode).cached_extents, 0, sizeof(hfsplus_extent_rec));
-       HFSPLUS_I(inode).cached_start = 0;
-       HFSPLUS_I(inode).cached_blocks = 0;
-
-       HFSPLUS_I(inode).alloc_blocks = be32_to_cpu(fork->total_blocks);
-       inode->i_size = HFSPLUS_I(inode).phys_size = be64_to_cpu(fork->total_size);
-       HFSPLUS_I(inode).fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
-       inode_set_bytes(inode, HFSPLUS_I(inode).fs_blocks << sb->s_blocksize_bits);
-       HFSPLUS_I(inode).clump_blocks = be32_to_cpu(fork->clump_size) >> HFSPLUS_SB(sb).alloc_blksz_shift;
-       if (!HFSPLUS_I(inode).clump_blocks)
-               HFSPLUS_I(inode).clump_blocks = HFSPLUS_IS_RSRC(inode) ? HFSPLUS_SB(sb).rsrc_clump_blocks :
-                               HFSPLUS_SB(sb).data_clump_blocks;
+       hip->first_blocks = count;
+       memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
+       hip->cached_start = 0;
+       hip->cached_blocks = 0;
+
+       hip->alloc_blocks = be32_to_cpu(fork->total_blocks);
+       hip->phys_size = inode->i_size = be64_to_cpu(fork->total_size);
+       hip->fs_blocks =
+               (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
+       inode_set_bytes(inode, hip->fs_blocks << sb->s_blocksize_bits);
+       hip->clump_blocks =
+               be32_to_cpu(fork->clump_size) >> sbi->alloc_blksz_shift;
+       if (!hip->clump_blocks) {
+               hip->clump_blocks = HFSPLUS_IS_RSRC(inode) ?
+                       sbi->rsrc_clump_blocks :
+                       sbi->data_clump_blocks;
+       }
  }
  
  void hfsplus_inode_write_fork(struct inode *inode, struct hfsplus_fork_raw *fork)
  {
-       memcpy(&fork->extents, &HFSPLUS_I(inode).first_extents,
+       memcpy(&fork->extents, &HFSPLUS_I(inode)->first_extents,
                sizeof(hfsplus_extent_rec));
         fork->total_size = cpu_to_be64(inode->i_size);
-       fork->total_blocks = cpu_to_be32(HFSPLUS_I(inode).alloc_blocks);
+       fork->total_blocks = cpu_to_be32(HFSPLUS_I(inode)->alloc_blocks);
  }
  
  int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
@@ -472,7 +473,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
  
         type = hfs_bnode_read_u16(fd->bnode, fd->entryoffset);
  
-       HFSPLUS_I(inode).dev = 0;
+       HFSPLUS_I(inode)->linkid = 0;
         if (type == HFSPLUS_FOLDER) {
                 struct hfsplus_cat_folder *folder = &entry.folder;
  
@@ -486,8 +487,8 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
                 inode->i_atime = hfsp_mt2ut(folder->access_date);
                 inode->i_mtime = hfsp_mt2ut(folder->content_mod_date);
                 inode->i_ctime = hfsp_mt2ut(folder->attribute_mod_date);
-               HFSPLUS_I(inode).create_date = folder->create_date;
-               HFSPLUS_I(inode).fs_blocks = 0;
+               HFSPLUS_I(inode)->create_date = folder->create_date;
+               HFSPLUS_I(inode)->fs_blocks = 0;
                 inode->i_op = &hfsplus_dir_inode_operations;
                 inode->i_fop = &hfsplus_dir_operations;
         } else if (type == HFSPLUS_FILE) {
@@ -518,7 +519,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
                 inode->i_atime = hfsp_mt2ut(file->access_date);
                 inode->i_mtime = hfsp_mt2ut(file->content_mod_date);
                 inode->i_ctime = hfsp_mt2ut(file->attribute_mod_date);
-               HFSPLUS_I(inode).create_date = file->create_date;
+               HFSPLUS_I(inode)->create_date = file->create_date;
         } else {
                 printk(KERN_ERR "hfs: bad catalog entry used to create inode\n");
                 res = -EIO;
@@ -533,12 +534,12 @@ int hfsplus_cat_write_inode(struct inode *inode)
         hfsplus_cat_entry entry;
  
         if (HFSPLUS_IS_RSRC(inode))
-               main_inode = HFSPLUS_I(inode).rsrc_inode;
+               main_inode = HFSPLUS_I(inode)->rsrc_inode;
  
         if (!main_inode->i_nlink)
                 return 0;
  
-       if (hfs_find_init(HFSPLUS_SB(main_inode->i_sb).cat_tree, &fd))
+       if (hfs_find_init(HFSPLUS_SB(main_inode->i_sb)->cat_tree, &fd))
                 /* panic? */
                 return -EIO;
  
@@ -554,7 +555,7 @@ int hfsplus_cat_write_inode(struct inode *inode)
                 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
                                         sizeof(struct hfsplus_cat_folder));
                 /* simple node checks? */
-               hfsplus_set_perms(inode, &folder->permissions);
+               hfsplus_cat_set_perms(inode, &folder->permissions);
                 folder->access_date = hfsp_ut2mt(inode->i_atime);
                 folder->content_mod_date = hfsp_ut2mt(inode->i_mtime);
                 folder->attribute_mod_date = hfsp_ut2mt(inode->i_ctime);
@@ -576,11 +577,7 @@ int hfsplus_cat_write_inode(struct inode *inode)
                 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
                                         sizeof(struct hfsplus_cat_file));
                 hfsplus_inode_write_fork(inode, &file->data_fork);
-               if (S_ISREG(inode->i_mode))
-                       HFSPLUS_I(inode).dev = inode->i_nlink;
-               if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
-                       HFSPLUS_I(inode).dev = kdev_t_to_nr(inode->i_rdev);
-               hfsplus_set_perms(inode, &file->permissions);
+               hfsplus_cat_set_perms(inode, &file->permissions);
                 if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE)
                         file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED);
                 else
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c

index ac405f0..5b4667e 100644 (file)
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -17,83 +17,98 @@
  #include <linux/mount.h>
  #include <linux/sched.h>
  #include <linux/xattr.h>
-#include <linux/smp_lock.h>
  #include <asm/uaccess.h>
  #include "hfsplus_fs.h"
  
-long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+/*
+ * Report the inode's immutable, append-only and nodump state to user space
+ * as FS_*_FL bits stored through user_flags.  Returns put_user()'s result.
+ */
+static int hfsplus_ioctl_getflags(struct file *file, int __user *user_flags)
  {
-       struct inode *inode = filp->f_path.dentry->d_inode;
+       struct inode *inode = file->f_path.dentry->d_inode;
+       struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
+       unsigned int flags = 0;
+
+       if (inode->i_flags & S_IMMUTABLE)
+               flags |= FS_IMMUTABLE_FL;
+       /* test the bit with '&'; '|=' would set S_APPEND on the inode */
+       if (inode->i_flags & S_APPEND)
+               flags |= FS_APPEND_FL;
+       if (hip->userflags & HFSPLUS_FLG_NODUMP)
+               flags |= FS_NODUMP_FL;
+
+       return put_user(flags, user_flags);
+}
+
+/*
+ * Apply FS_IMMUTABLE_FL / FS_APPEND_FL / FS_NODUMP_FL read from user_flags
+ * to the inode.  Write access to the mount is taken first and the inode
+ * mutex is held around the update; both are released on every exit path.
+ * Returns 0 or a negative errno (-EACCES, -EFAULT, -EPERM, -EOPNOTSUPP,
+ * or the error from mnt_want_write()).
+ */
+static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags)
+{
+       struct inode *inode = file->f_path.dentry->d_inode;
+       struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
         unsigned int flags;
+       int err = 0;
  
-       lock_kernel();
-       switch (cmd) {
-       case HFSPLUS_IOC_EXT2_GETFLAGS:
-               flags = 0;
-               if (HFSPLUS_I(inode).rootflags & HFSPLUS_FLG_IMMUTABLE)
-                       flags |= FS_IMMUTABLE_FL; /* EXT2_IMMUTABLE_FL */
-               if (HFSPLUS_I(inode).rootflags & HFSPLUS_FLG_APPEND)
-                       flags |= FS_APPEND_FL; /* EXT2_APPEND_FL */
-               if (HFSPLUS_I(inode).userflags & HFSPLUS_FLG_NODUMP)
-                       flags |= FS_NODUMP_FL; /* EXT2_NODUMP_FL */
-               return put_user(flags, (int __user *)arg);
-       case HFSPLUS_IOC_EXT2_SETFLAGS: {
-               int err = 0;
-               err = mnt_want_write(filp->f_path.mnt);
-               if (err) {
-                       unlock_kernel();
-                       return err;
-               }
+       err = mnt_want_write(file->f_path.mnt);
+       if (err)
+               goto out;
  
-               if (!is_owner_or_cap(inode)) {
-                       err = -EACCES;
-                       goto setflags_out;
-               }
-               if (get_user(flags, (int __user *)arg)) {
-                       err = -EFAULT;
-                       goto setflags_out;
-               }
-               if (flags & (FS_IMMUTABLE_FL|FS_APPEND_FL) ||
-                   HFSPLUS_I(inode).rootflags & (HFSPLUS_FLG_IMMUTABLE|HFSPLUS_FLG_APPEND)) {
-                       if (!capable(CAP_LINUX_IMMUTABLE)) {
-                               err = -EPERM;
-                               goto setflags_out;
-                       }
-               }
+       if (!is_owner_or_cap(inode)) {
+               err = -EACCES;
+               goto out_drop_write;
+       }
  
-               /* don't silently ignore unsupported ext2 flags */
-               if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) {
-                       err = -EOPNOTSUPP;
-                       goto setflags_out;
-               }
-               if (flags & FS_IMMUTABLE_FL) { /* EXT2_IMMUTABLE_FL */
-                       inode->i_flags |= S_IMMUTABLE;
-                       HFSPLUS_I(inode).rootflags |= HFSPLUS_FLG_IMMUTABLE;
-               } else {
-                       inode->i_flags &= ~S_IMMUTABLE;
-                       HFSPLUS_I(inode).rootflags &= ~HFSPLUS_FLG_IMMUTABLE;
-               }
-               if (flags & FS_APPEND_FL) { /* EXT2_APPEND_FL */
-                       inode->i_flags |= S_APPEND;
-                       HFSPLUS_I(inode).rootflags |= HFSPLUS_FLG_APPEND;
-               } else {
-                       inode->i_flags &= ~S_APPEND;
-                       HFSPLUS_I(inode).rootflags &= ~HFSPLUS_FLG_APPEND;
+       if (get_user(flags, user_flags)) {
+               err = -EFAULT;
+               goto out_drop_write;
+       }
+
+       mutex_lock(&inode->i_mutex);
+
+       if ((flags & (FS_IMMUTABLE_FL|FS_APPEND_FL)) ||
+           inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
+               if (!capable(CAP_LINUX_IMMUTABLE)) {
+                       err = -EPERM;
+                       goto out_unlock_inode;
                 }
-               if (flags & FS_NODUMP_FL) /* EXT2_NODUMP_FL */
-                       HFSPLUS_I(inode).userflags |= HFSPLUS_FLG_NODUMP;
-               else
-                       HFSPLUS_I(inode).userflags &= ~HFSPLUS_FLG_NODUMP;
-
-               inode->i_ctime = CURRENT_TIME_SEC;
-               mark_inode_dirty(inode);
-setflags_out:
-               mnt_drop_write(filp->f_path.mnt);
-               unlock_kernel();
-               return err;
         }
+
+       /* don't silently ignore unsupported ext2 flags */
+       if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) {
+               err = -EOPNOTSUPP;
+               goto out_unlock_inode;
+       }
+
+       if (flags & FS_IMMUTABLE_FL)
+               inode->i_flags |= S_IMMUTABLE;
+       else
+               inode->i_flags &= ~S_IMMUTABLE;
+
+       if (flags & FS_APPEND_FL)
+               inode->i_flags |= S_APPEND;
+       else
+               inode->i_flags &= ~S_APPEND;
+
+       if (flags & FS_NODUMP_FL)
+               hip->userflags |= HFSPLUS_FLG_NODUMP;
+       else
+               hip->userflags &= ~HFSPLUS_FLG_NODUMP;
+
+       inode->i_ctime = CURRENT_TIME_SEC;
+       mark_inode_dirty(inode);
+
+out_unlock_inode:
+       /* pairs with the mutex_lock() above; must release, not re-acquire */
+       mutex_unlock(&inode->i_mutex);
+out_drop_write:
+       mnt_drop_write(file->f_path.mnt);
+out:
+       return err;
+}
+
+long hfsplus_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+       void __user *argp = (void __user *)arg;
+
+       switch (cmd) {
+       case HFSPLUS_IOC_EXT2_GETFLAGS:
+               return hfsplus_ioctl_getflags(file, argp);
+       case HFSPLUS_IOC_EXT2_SETFLAGS:
+               return hfsplus_ioctl_setflags(file, argp);
         default:
-               unlock_kernel();
                 return -ENOTTY;
         }
  }
@@ -110,7 +125,7 @@ int hfsplus_setxattr(struct dentry *dentry, const char *name,
         if (!S_ISREG(inode->i_mode) || HFSPLUS_IS_RSRC(inode))
                 return -EOPNOTSUPP;
  
-       res = hfs_find_init(HFSPLUS_SB(inode->i_sb).cat_tree, &fd);
+       res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
         if (res)
                 return res;
         res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
@@ -153,7 +168,7 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
                 return -EOPNOTSUPP;
  
         if (size) {
-               res = hfs_find_init(HFSPLUS_SB(inode->i_sb).cat_tree, &fd);
+               res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
                 if (res)
                         return res;
                 res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
@@ -177,7 +192,7 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
                 } else
                         res = size ? -ERANGE : 4;
         } else
-               res = -ENODATA;
+               res = -EOPNOTSUPP;
  out:
         if (size)
                 hfs_find_exit(&fd);
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c

index 572628b..f9ab276 100644 (file)
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -143,13 +143,13 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi)
                         kfree(p);
                         break;
                 case opt_decompose:
-                       sbi->flags &= ~HFSPLUS_SB_NODECOMPOSE;
+                       clear_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags);
                         break;
                 case opt_nodecompose:
-                       sbi->flags |= HFSPLUS_SB_NODECOMPOSE;
+                       set_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags);
                         break;
                 case opt_force:
-                       sbi->flags |= HFSPLUS_SB_FORCE;
+                       set_bit(HFSPLUS_SB_FORCE, &sbi->flags);
                         break;
                 default:
                         return 0;
@@ -171,7 +171,7 @@ done:
  
  int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt)
  {
-       struct hfsplus_sb_info *sbi = &HFSPLUS_SB(mnt->mnt_sb);
+       struct hfsplus_sb_info *sbi = HFSPLUS_SB(mnt->mnt_sb);
  
         if (sbi->creator != HFSPLUS_DEF_CR_TYPE)
                 seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator);
@@ -184,7 +184,7 @@ int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt)
                 seq_printf(seq, ",session=%u", sbi->session);
         if (sbi->nls)
                 seq_printf(seq, ",nls=%s", sbi->nls->charset);
-       if (sbi->flags & HFSPLUS_SB_NODECOMPOSE)
+       if (test_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags))
                 seq_printf(seq, ",nodecompose");
         return 0;
  }
diff --git a/fs/hfsplus/part_tbl.c b/fs/hfsplus/part_tbl.c

index 1528a6f..208b16c 100644 (file)
--- a/fs/hfsplus/part_tbl.c
+++ b/fs/hfsplus/part_tbl.c
@@ -74,6 +74,7 @@ struct old_pmap {
  int hfs_part_find(struct super_block *sb,
                   sector_t *part_start, sector_t *part_size)
  {
+       struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
         struct buffer_head *bh;
         __be16 *data;
         int i, size, res;
@@ -95,7 +96,7 @@ int hfs_part_find(struct super_block *sb,
                 for (i = 0; i < size; p++, i++) {
                         if (p->pdStart && p->pdSize &&
                             p->pdFSID == cpu_to_be32(0x54465331)/*"TFS1"*/ &&
-                           (HFSPLUS_SB(sb).part < 0 || HFSPLUS_SB(sb).part == i)) {
+                           (sbi->part < 0 || sbi->part == i)) {
                                 *part_start += be32_to_cpu(p->pdStart);
                                 *part_size = be32_to_cpu(p->pdSize);
                                 res = 0;
@@ -111,7 +112,7 @@ int hfs_part_find(struct super_block *sb,
                 size = be32_to_cpu(pm->pmMapBlkCnt);
                 for (i = 0; i < size;) {
                         if (!memcmp(pm->pmPartType,"Apple_HFS", 9) &&
-                           (HFSPLUS_SB(sb).part < 0 || HFSPLUS_SB(sb).part == i)) {
+                           (sbi->part < 0 || sbi->part == i)) {
                                 *part_start += be32_to_cpu(pm->pmPyPartStart);
                                 *part_size = be32_to_cpu(pm->pmPartBlkCnt);
                                 res = 0;
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c

index 3b55c05..9a88d75 100644 (file)
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -12,7 +12,6 @@
  #include <linux/pagemap.h>
  #include <linux/fs.h>
  #include <linux/slab.h>
-#include <linux/smp_lock.h>
  #include <linux/vfs.h>
  #include <linux/nls.h>
  
@@ -21,40 +20,11 @@ static void hfsplus_destroy_inode(struct inode *inode);
  
  #include "hfsplus_fs.h"
  
-struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino)
+static int hfsplus_system_read_inode(struct inode *inode)
  {
-       struct hfs_find_data fd;
-       struct hfsplus_vh *vhdr;
-       struct inode *inode;
-       long err = -EIO;
-
-       inode = iget_locked(sb, ino);
-       if (!inode)
-               return ERR_PTR(-ENOMEM);
-       if (!(inode->i_state & I_NEW))
-               return inode;
+       struct hfsplus_vh *vhdr = HFSPLUS_SB(inode->i_sb)->s_vhdr;
  
-       INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list);
-       mutex_init(&HFSPLUS_I(inode).extents_lock);
-       HFSPLUS_I(inode).flags = 0;
-       HFSPLUS_I(inode).rsrc_inode = NULL;
-       atomic_set(&HFSPLUS_I(inode).opencnt, 0);
-
-       if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID) {
-       read_inode:
-               hfs_find_init(HFSPLUS_SB(inode->i_sb).cat_tree, &fd);
-               err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
-               if (!err)
-                       err = hfsplus_cat_read_inode(inode, &fd);
-               hfs_find_exit(&fd);
-               if (err)
-                       goto bad_inode;
-               goto done;
-       }
-       vhdr = HFSPLUS_SB(inode->i_sb).s_vhdr;
-       switch(inode->i_ino) {
-       case HFSPLUS_ROOT_CNID:
-               goto read_inode;
+       switch (inode->i_ino) {
         case HFSPLUS_EXT_CNID:
                 hfsplus_inode_read_fork(inode, &vhdr->ext_file);
                 inode->i_mapping->a_ops = &hfsplus_btree_aops;
@@ -75,74 +45,101 @@ struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino)
                 inode->i_mapping->a_ops = &hfsplus_btree_aops;
                 break;
         default:
-               goto bad_inode;
+               return -EIO;
+       }
+
+       return 0;
+}
+
+/*
+ * Return the inode numbered ino on sb.  An inode that iget_locked() hands
+ * back without I_NEW is returned as is.  A new one has its hfsplus private
+ * fields initialised and is then filled either from the catalog tree (user
+ * inodes and the root directory) or by hfsplus_system_read_inode().
+ * On failure the inode is passed to iget_failed() and an ERR_PTR is
+ * returned; -ENOMEM is returned when iget_locked() yields no inode.
+ */
+struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino)
+{
+       struct hfs_find_data fd;
+       struct inode *inode;
+       int err;
+
+       inode = iget_locked(sb, ino);
+       if (!inode)
+               return ERR_PTR(-ENOMEM);
+       if (!(inode->i_state & I_NEW))
+               return inode;
+
+       INIT_LIST_HEAD(&HFSPLUS_I(inode)->open_dir_list);
+       mutex_init(&HFSPLUS_I(inode)->extents_lock);
+       HFSPLUS_I(inode)->flags = 0;
+       HFSPLUS_I(inode)->rsrc_inode = NULL;
+       atomic_set(&HFSPLUS_I(inode)->opencnt, 0);
+
+       if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID ||
+           inode->i_ino == HFSPLUS_ROOT_CNID) {
+               /* only search and release the find data if setup succeeded */
+               err = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
+               if (!err) {
+                       err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
+                       if (!err)
+                               err = hfsplus_cat_read_inode(inode, &fd);
+                       hfs_find_exit(&fd);
+               }
+       } else {
+               err = hfsplus_system_read_inode(inode);
+       }
+
+       if (err) {
+               iget_failed(inode);
+               return ERR_PTR(err);
         }
  
-done:
         unlock_new_inode(inode);
         return inode;
-
-bad_inode:
-       iget_failed(inode);
-       return ERR_PTR(err);
  }
  
-static int hfsplus_write_inode(struct inode *inode,
-               struct writeback_control *wbc)
+/*
+ * Write back one of the special (system) inodes.  The inode number selects
+ * the raw fork in the volume header and, for the extents, catalog and
+ * attributes files, the b-tree to write as well.  If the fork's recorded
+ * size differs from i_size the backup volume header is flagged for
+ * rewriting and the superblock is marked dirty.
+ * Returns 0, or -EIO for an inode number not handled here.
+ */
+static int hfsplus_system_write_inode(struct inode *inode)
  {
-       struct hfsplus_vh *vhdr;
-       int ret = 0;
+       struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
+       struct hfsplus_vh *vhdr = sbi->s_vhdr;
+       struct hfsplus_fork_raw *fork;
+       struct hfs_btree *tree = NULL;
  
-       dprint(DBG_INODE, "hfsplus_write_inode: %lu\n", inode->i_ino);
-       hfsplus_ext_write_extent(inode);
-       if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID) {
-               return hfsplus_cat_write_inode(inode);
-       }
-       vhdr = HFSPLUS_SB(inode->i_sb).s_vhdr;
         switch (inode->i_ino) {
-       case HFSPLUS_ROOT_CNID:
-               ret = hfsplus_cat_write_inode(inode);
-               break;
         case HFSPLUS_EXT_CNID:
-               if (vhdr->ext_file.total_size != cpu_to_be64(inode->i_size)) {
-                       HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP;
-                       inode->i_sb->s_dirt = 1;
-               }
-               hfsplus_inode_write_fork(inode, &vhdr->ext_file);
-               hfs_btree_write(HFSPLUS_SB(inode->i_sb).ext_tree);
+               fork = &vhdr->ext_file;
+               tree = sbi->ext_tree;
                 break;
         case HFSPLUS_CAT_CNID:
-               if (vhdr->cat_file.total_size != cpu_to_be64(inode->i_size)) {
-                       HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP;
-                       inode->i_sb->s_dirt = 1;
-               }
-               hfsplus_inode_write_fork(inode, &vhdr->cat_file);
-               hfs_btree_write(HFSPLUS_SB(inode->i_sb).cat_tree);
+               fork = &vhdr->cat_file;
+               tree = sbi->cat_tree;
                 break;
         case HFSPLUS_ALLOC_CNID:
-               if (vhdr->alloc_file.total_size != cpu_to_be64(inode->i_size)) {
-                       HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP;
-                       inode->i_sb->s_dirt = 1;
-               }
-               hfsplus_inode_write_fork(inode, &vhdr->alloc_file);
+               fork = &vhdr->alloc_file;
                 break;
         case HFSPLUS_START_CNID:
-               if (vhdr->start_file.total_size != cpu_to_be64(inode->i_size)) {
-                       HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP;
-                       inode->i_sb->s_dirt = 1;
-               }
-               hfsplus_inode_write_fork(inode, &vhdr->start_file);
+               fork = &vhdr->start_file;
                 break;
         case HFSPLUS_ATTR_CNID:
-               if (vhdr->attr_file.total_size != cpu_to_be64(inode->i_size)) {
-                       HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP;
-                       inode->i_sb->s_dirt = 1;
-               }
-               hfsplus_inode_write_fork(inode, &vhdr->attr_file);
-               hfs_btree_write(HFSPLUS_SB(inode->i_sb).attr_tree);
-               break;
+               fork = &vhdr->attr_file;
+               tree = sbi->attr_tree;
+               /* must not fall through into the -EIO default below */
+               break;
+       default:
+               return -EIO;
+       }
+
+       if (fork->total_size != cpu_to_be64(inode->i_size)) {
+               set_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags);
+               inode->i_sb->s_dirt = 1;
         }
-       return ret;
+       hfsplus_inode_write_fork(inode, fork);
+       if (tree)
+               hfs_btree_write(tree);
+       return 0;
+}
+
+static int hfsplus_write_inode(struct inode *inode,
+               struct writeback_control *wbc)
+{
+       dprint(DBG_INODE, "hfsplus_write_inode: %lu\n", inode->i_ino);
+
+       hfsplus_ext_write_extent(inode);
+
+       if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID ||
+           inode->i_ino == HFSPLUS_ROOT_CNID)
+               return hfsplus_cat_write_inode(inode);
+       else
+               return hfsplus_system_write_inode(inode);
  }
  
  static void hfsplus_evict_inode(struct inode *inode)
@@ -151,51 +148,53 @@ static void hfsplus_evict_inode(struct inode *inode)
         truncate_inode_pages(&inode->i_data, 0);
         end_writeback(inode);
         if (HFSPLUS_IS_RSRC(inode)) {
-               HFSPLUS_I(HFSPLUS_I(inode).rsrc_inode).rsrc_inode = NULL;
-               iput(HFSPLUS_I(inode).rsrc_inode);
+               HFSPLUS_I(HFSPLUS_I(inode)->rsrc_inode)->rsrc_inode = NULL;
+               iput(HFSPLUS_I(inode)->rsrc_inode);
         }
  }
  
  int hfsplus_sync_fs(struct super_block *sb, int wait)
  {
-       struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr;
+       struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+       struct hfsplus_vh *vhdr = sbi->s_vhdr;
  
         dprint(DBG_SUPER, "hfsplus_write_super\n");
  
-       lock_super(sb);
+       mutex_lock(&sbi->vh_mutex);
+       mutex_lock(&sbi->alloc_mutex);
         sb->s_dirt = 0;
  
-       vhdr->free_blocks = cpu_to_be32(HFSPLUS_SB(sb).free_blocks);
-       vhdr->next_alloc = cpu_to_be32(HFSPLUS_SB(sb).next_alloc);
-       vhdr->next_cnid = cpu_to_be32(HFSPLUS_SB(sb).next_cnid);
-       vhdr->folder_count = cpu_to_be32(HFSPLUS_SB(sb).folder_count);
-       vhdr->file_count = cpu_to_be32(HFSPLUS_SB(sb).file_count);
+       vhdr->free_blocks = cpu_to_be32(sbi->free_blocks);
+       vhdr->next_cnid = cpu_to_be32(sbi->next_cnid);
+       vhdr->folder_count = cpu_to_be32(sbi->folder_count);
+       vhdr->file_count = cpu_to_be32(sbi->file_count);
  
-       mark_buffer_dirty(HFSPLUS_SB(sb).s_vhbh);
-       if (HFSPLUS_SB(sb).flags & HFSPLUS_SB_WRITEBACKUP) {
-               if (HFSPLUS_SB(sb).sect_count) {
+       mark_buffer_dirty(sbi->s_vhbh);
+       if (test_and_clear_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags)) {
+               if (sbi->sect_count) {
                         struct buffer_head *bh;
                         u32 block, offset;
  
-                       block = HFSPLUS_SB(sb).blockoffset;
-                       block += (HFSPLUS_SB(sb).sect_count - 2) >> (sb->s_blocksize_bits - 9);
-                       offset = ((HFSPLUS_SB(sb).sect_count - 2) << 9) & (sb->s_blocksize - 1);
-                       printk(KERN_DEBUG "hfs: backup: %u,%u,%u,%u\n", HFSPLUS_SB(sb).blockoffset,
-                               HFSPLUS_SB(sb).sect_count, block, offset);
+                       block = sbi->blockoffset;
+                       block += (sbi->sect_count - 2) >> (sb->s_blocksize_bits - 9);
+                       offset = ((sbi->sect_count - 2) << 9) & (sb->s_blocksize - 1);
+                       printk(KERN_DEBUG "hfs: backup: %u,%u,%u,%u\n",
+                                         sbi->blockoffset, sbi->sect_count,
+                                         block, offset);
                         bh = sb_bread(sb, block);
                         if (bh) {
                                 vhdr = (struct hfsplus_vh *)(bh->b_data + offset);
                                 if (be16_to_cpu(vhdr->signature) == HFSPLUS_VOLHEAD_SIG) {
-                                       memcpy(vhdr, HFSPLUS_SB(sb).s_vhdr, sizeof(*vhdr));
+                                       memcpy(vhdr, sbi->s_vhdr, sizeof(*vhdr));
                                         mark_buffer_dirty(bh);
                                         brelse(bh);
                                 } else
                                         printk(KERN_WARNING "hfs: backup not found!\n");
                         }
                 }
-               HFSPLUS_SB(sb).flags &= ~HFSPLUS_SB_WRITEBACKUP;
         }
-       unlock_super(sb);
+       mutex_unlock(&sbi->alloc_mutex);
+       mutex_unlock(&sbi->vh_mutex);
         return 0;
  }
  
@@ -209,48 +208,48 @@ static void hfsplus_write_super(struct super_block *sb)
  
  static void hfsplus_put_super(struct super_block *sb)
  {
+       struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+
         dprint(DBG_SUPER, "hfsplus_put_super\n");
+
         if (!sb->s_fs_info)
                 return;
  
-       lock_kernel();
-
         if (sb->s_dirt)
                 hfsplus_write_super(sb);
-       if (!(sb->s_flags & MS_RDONLY) && HFSPLUS_SB(sb).s_vhdr) {
-               struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr;
+       if (!(sb->s_flags & MS_RDONLY) && sbi->s_vhdr) {
+               struct hfsplus_vh *vhdr = sbi->s_vhdr;
  
                 vhdr->modify_date = hfsp_now2mt();
                 vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_UNMNT);
                 vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_INCNSTNT);
-               mark_buffer_dirty(HFSPLUS_SB(sb).s_vhbh);
-               sync_dirty_buffer(HFSPLUS_SB(sb).s_vhbh);
+               mark_buffer_dirty(sbi->s_vhbh);
+               sync_dirty_buffer(sbi->s_vhbh);
         }
  
-       hfs_btree_close(HFSPLUS_SB(sb).cat_tree);
-       hfs_btree_close(HFSPLUS_SB(sb).ext_tree);
-       iput(HFSPLUS_SB(sb).alloc_file);
-       iput(HFSPLUS_SB(sb).hidden_dir);
-       brelse(HFSPLUS_SB(sb).s_vhbh);
-       unload_nls(HFSPLUS_SB(sb).nls);
+       hfs_btree_close(sbi->cat_tree);
+       hfs_btree_close(sbi->ext_tree);
+       iput(sbi->alloc_file);
+       iput(sbi->hidden_dir);
+       brelse(sbi->s_vhbh);
+       unload_nls(sbi->nls);
         kfree(sb->s_fs_info);
         sb->s_fs_info = NULL;
-
-       unlock_kernel();
  }
  
  static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
  {
         struct super_block *sb = dentry->d_sb;
+       struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
         u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
  
         buf->f_type = HFSPLUS_SUPER_MAGIC;
         buf->f_bsize = sb->s_blocksize;
-       buf->f_blocks = HFSPLUS_SB(sb).total_blocks << HFSPLUS_SB(sb).fs_shift;
-       buf->f_bfree = HFSPLUS_SB(sb).free_blocks << HFSPLUS_SB(sb).fs_shift;
+       buf->f_blocks = sbi->total_blocks << sbi->fs_shift;
+       buf->f_bfree = sbi->free_blocks << sbi->fs_shift;
         buf->f_bavail = buf->f_bfree;
         buf->f_files = 0xFFFFFFFF;
-       buf->f_ffree = 0xFFFFFFFF - HFSPLUS_SB(sb).next_cnid;
+       buf->f_ffree = 0xFFFFFFFF - sbi->next_cnid;
         buf->f_fsid.val[0] = (u32)id;
         buf->f_fsid.val[1] = (u32)(id >> 32);
         buf->f_namelen = HFSPLUS_MAX_STRLEN;
@@ -263,11 +262,11 @@ static int hfsplus_remount(struct super_block *sb, int *flags, char *data)
         if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
                 return 0;
         if (!(*flags & MS_RDONLY)) {
-               struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr;
+               struct hfsplus_vh *vhdr = HFSPLUS_SB(sb)->s_vhdr;
                 struct hfsplus_sb_info sbi;
  
                 memset(&sbi, 0, sizeof(struct hfsplus_sb_info));
-               sbi.nls = HFSPLUS_SB(sb).nls;
+               sbi.nls = HFSPLUS_SB(sb)->nls;
                 if (!hfsplus_parse_options(data, &sbi))
                         return -EINVAL;
  
@@ -276,7 +275,7 @@ static int hfsplus_remount(struct super_block *sb, int *flags, char *data)
                                "running fsck.hfsplus is recommended.  leaving read-only.\n");
                         sb->s_flags |= MS_RDONLY;
                         *flags |= MS_RDONLY;
-               } else if (sbi.flags & HFSPLUS_SB_FORCE) {
+               } else if (test_bit(HFSPLUS_SB_FORCE, &sbi.flags)) {
                         /* nothing */
                 } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
                         printk(KERN_WARNING "hfs: filesystem is marked locked, leaving read-only.\n");
@@ -320,7 +319,8 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
                 return -ENOMEM;
  
         sb->s_fs_info = sbi;
-       INIT_HLIST_HEAD(&sbi->rsrc_inodes);
+       mutex_init(&sbi->alloc_mutex);
+       mutex_init(&sbi->vh_mutex);
         hfsplus_fill_defaults(sbi);
         if (!hfsplus_parse_options(data, sbi)) {
                 printk(KERN_ERR "hfs: unable to parse mount options\n");
@@ -344,7 +344,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
                 err = -EINVAL;
                 goto cleanup;
         }
-       vhdr = HFSPLUS_SB(sb).s_vhdr;
+       vhdr = sbi->s_vhdr;
  
         /* Copy parts of the volume header into the superblock */
         sb->s_magic = HFSPLUS_VOLHEAD_SIG;
@@ -353,18 +353,19 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
                 printk(KERN_ERR "hfs: wrong filesystem version\n");
                 goto cleanup;
         }
-       HFSPLUS_SB(sb).total_blocks = be32_to_cpu(vhdr->total_blocks);
-       HFSPLUS_SB(sb).free_blocks = be32_to_cpu(vhdr->free_blocks);
-       HFSPLUS_SB(sb).next_alloc = be32_to_cpu(vhdr->next_alloc);
-       HFSPLUS_SB(sb).next_cnid = be32_to_cpu(vhdr->next_cnid);
-       HFSPLUS_SB(sb).file_count = be32_to_cpu(vhdr->file_count);
-       HFSPLUS_SB(sb).folder_count = be32_to_cpu(vhdr->folder_count);
-       HFSPLUS_SB(sb).data_clump_blocks = be32_to_cpu(vhdr->data_clump_sz) >> HFSPLUS_SB(sb).alloc_blksz_shift;
-       if (!HFSPLUS_SB(sb).data_clump_blocks)
-               HFSPLUS_SB(sb).data_clump_blocks = 1;
-       HFSPLUS_SB(sb).rsrc_clump_blocks = be32_to_cpu(vhdr->rsrc_clump_sz) >> HFSPLUS_SB(sb).alloc_blksz_shift;
-       if (!HFSPLUS_SB(sb).rsrc_clump_blocks)
-               HFSPLUS_SB(sb).rsrc_clump_blocks = 1;
+       sbi->total_blocks = be32_to_cpu(vhdr->total_blocks);
+       sbi->free_blocks = be32_to_cpu(vhdr->free_blocks);
+       sbi->next_cnid = be32_to_cpu(vhdr->next_cnid);
+       sbi->file_count = be32_to_cpu(vhdr->file_count);
+       sbi->folder_count = be32_to_cpu(vhdr->folder_count);
+       sbi->data_clump_blocks =
+               be32_to_cpu(vhdr->data_clump_sz) >> sbi->alloc_blksz_shift;
+       if (!sbi->data_clump_blocks)
+               sbi->data_clump_blocks = 1;
+       sbi->rsrc_clump_blocks =
+               be32_to_cpu(vhdr->rsrc_clump_sz) >> sbi->alloc_blksz_shift;
+       if (!sbi->rsrc_clump_blocks)
+               sbi->rsrc_clump_blocks = 1;
  
         /* Set up operations so we can load metadata */
         sb->s_op = &hfsplus_sops;
@@ -374,7 +375,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
                 printk(KERN_WARNING "hfs: Filesystem was not cleanly unmounted, "
                        "running fsck.hfsplus is recommended.  mounting read-only.\n");
                 sb->s_flags |= MS_RDONLY;
-       } else if (sbi->flags & HFSPLUS_SB_FORCE) {
+       } else if (test_and_clear_bit(HFSPLUS_SB_FORCE, &sbi->flags)) {
                 /* nothing */
         } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
                 printk(KERN_WARNING "hfs: Filesystem is marked locked, mounting read-only.\n");
@@ -384,16 +385,15 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
                        "use the force option at your own risk, mounting read-only.\n");
                 sb->s_flags |= MS_RDONLY;
         }
-       sbi->flags &= ~HFSPLUS_SB_FORCE;
  
         /* Load metadata objects (B*Trees) */
-       HFSPLUS_SB(sb).ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID);
-       if (!HFSPLUS_SB(sb).ext_tree) {
+       sbi->ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID);
+       if (!sbi->ext_tree) {
                 printk(KERN_ERR "hfs: failed to load extents file\n");
                 goto cleanup;
         }
-       HFSPLUS_SB(sb).cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID);
-       if (!HFSPLUS_SB(sb).cat_tree) {
+       sbi->cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID);
+       if (!sbi->cat_tree) {
                 printk(KERN_ERR "hfs: failed to load catalog file\n");
                 goto cleanup;
         }
@@ -404,7 +404,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
                 err = PTR_ERR(inode);
                 goto cleanup;
         }
-       HFSPLUS_SB(sb).alloc_file = inode;
+       sbi->alloc_file = inode;
  
         /* Load the root directory */
         root = hfsplus_iget(sb, HFSPLUS_ROOT_CNID);
@@ -423,7 +423,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
  
         str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1;
         str.name = HFSP_HIDDENDIR_NAME;
-       hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
+       hfs_find_init(sbi->cat_tree, &fd);
         hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_ROOT_CNID, &str);
         if (!hfs_brec_read(&fd, &entry, sizeof(entry))) {
                 hfs_find_exit(&fd);
@@ -434,7 +434,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
                         err = PTR_ERR(inode);
                         goto cleanup;
                 }
-               HFSPLUS_SB(sb).hidden_dir = inode;
+               sbi->hidden_dir = inode;
         } else
                 hfs_find_exit(&fd);
  
@@ -449,15 +449,19 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
         be32_add_cpu(&vhdr->write_count, 1);
         vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT);
         vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT);
-       mark_buffer_dirty(HFSPLUS_SB(sb).s_vhbh);
-       sync_dirty_buffer(HFSPLUS_SB(sb).s_vhbh);
+       mark_buffer_dirty(sbi->s_vhbh);
+       sync_dirty_buffer(sbi->s_vhbh);
  
-       if (!HFSPLUS_SB(sb).hidden_dir) {
+       if (!sbi->hidden_dir) {
                 printk(KERN_DEBUG "hfs: create hidden dir...\n");
-               HFSPLUS_SB(sb).hidden_dir = hfsplus_new_inode(sb, S_IFDIR);
-               hfsplus_create_cat(HFSPLUS_SB(sb).hidden_dir->i_ino, sb->s_root->d_inode,
-                                  &str, HFSPLUS_SB(sb).hidden_dir);
-               mark_inode_dirty(HFSPLUS_SB(sb).hidden_dir);
+
+               mutex_lock(&sbi->vh_mutex);
+               sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR);
+               hfsplus_create_cat(sbi->hidden_dir->i_ino, sb->s_root->d_inode,
+                                  &str, sbi->hidden_dir);
+               mutex_unlock(&sbi->vh_mutex);
+
+               mark_inode_dirty(sbi->hidden_dir);
         }
  out:
         unload_nls(sbi->nls);
@@ -486,7 +490,7 @@ static struct inode *hfsplus_alloc_inode(struct super_block *sb)
  
  static void hfsplus_destroy_inode(struct inode *inode)
  {
-       kmem_cache_free(hfsplus_inode_cachep, &HFSPLUS_I(inode));
+       kmem_cache_free(hfsplus_inode_cachep, HFSPLUS_I(inode));
  }
  
  #define HFSPLUS_INODE_SIZE     sizeof(struct hfsplus_inode_info)
diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c

index 628ccf6..b66d67d 100644 (file)
--- a/fs/hfsplus/unicode.c
+++ b/fs/hfsplus/unicode.c
@@ -121,7 +121,7 @@ static u16 *hfsplus_compose_lookup(u16 *p, u16 cc)
  int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, char *astr, int *len_p)
  {
         const hfsplus_unichr *ip;
-       struct nls_table *nls = HFSPLUS_SB(sb).nls;
+       struct nls_table *nls = HFSPLUS_SB(sb)->nls;
         u8 *op;
         u16 cc, c0, c1;
         u16 *ce1, *ce2;
@@ -132,7 +132,7 @@ int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, c
         ustrlen = be16_to_cpu(ustr->length);
         len = *len_p;
         ce1 = NULL;
-       compose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE);
+       compose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
  
         while (ustrlen > 0) {
                 c0 = be16_to_cpu(*ip++);
@@ -246,7 +246,7 @@ out:
  static inline int asc2unichar(struct super_block *sb, const char *astr, int len,
                               wchar_t *uc)
  {
-       int size = HFSPLUS_SB(sb).nls->char2uni(astr, len, uc);
+       int size = HFSPLUS_SB(sb)->nls->char2uni(astr, len, uc);
         if (size <= 0) {
                 *uc = '?';
                 size = 1;
@@ -293,7 +293,7 @@ int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr,
         u16 *dstr, outlen = 0;
         wchar_t c;
  
-       decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE);
+       decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
         while (outlen < HFSPLUS_MAX_STRLEN && len > 0) {
                 size = asc2unichar(sb, astr, len, &c);
  
@@ -330,8 +330,8 @@ int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str)
         wchar_t c;
         u16 c2;
  
-       casefold = (HFSPLUS_SB(sb).flags & HFSPLUS_SB_CASEFOLD);
-       decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE);
+       casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
+       decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
         hash = init_name_hash();
         astr = str->name;
         len = str->len;
@@ -373,8 +373,8 @@ int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *
         u16 c1, c2;
         wchar_t c;
  
-       casefold = (HFSPLUS_SB(sb).flags & HFSPLUS_SB_CASEFOLD);
-       decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE);
+       casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
+       decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
         astr1 = s1->name;
         len1 = s1->len;
         astr2 = s2->name;
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c

index bed78ac..8972c20 100644 (file)
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -65,8 +65,8 @@ static int hfsplus_get_last_session(struct super_block *sb,
         *start = 0;
         *size = sb->s_bdev->bd_inode->i_size >> 9;
  
-       if (HFSPLUS_SB(sb).session >= 0) {
-               te.cdte_track = HFSPLUS_SB(sb).session;
+       if (HFSPLUS_SB(sb)->session >= 0) {
+               te.cdte_track = HFSPLUS_SB(sb)->session;
                 te.cdte_format = CDROM_LBA;
                 res = ioctl_by_bdev(sb->s_bdev, CDROMREADTOCENTRY, (unsigned long)&te);
                 if (!res && (te.cdte_ctrl & CDROM_DATA_TRACK) == 4) {
@@ -87,6 +87,7 @@ static int hfsplus_get_last_session(struct super_block *sb,
  /* Takes in super block, returns true if good data read */
  int hfsplus_read_wrapper(struct super_block *sb)
  {
+       struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
         struct buffer_head *bh;
         struct hfsplus_vh *vhdr;
         struct hfsplus_wd wd;
@@ -122,7 +123,7 @@ int hfsplus_read_wrapper(struct super_block *sb)
                 if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIG))
                         break;
                 if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIGX)) {
-                       HFSPLUS_SB(sb).flags |= HFSPLUS_SB_HFSX;
+                       set_bit(HFSPLUS_SB_HFSX, &sbi->flags);
                         break;
                 }
                 brelse(bh);
@@ -143,11 +144,11 @@ int hfsplus_read_wrapper(struct super_block *sb)
         if (blocksize < HFSPLUS_SECTOR_SIZE ||
             ((blocksize - 1) & blocksize))
                 return -EINVAL;
-       HFSPLUS_SB(sb).alloc_blksz = blocksize;
-       HFSPLUS_SB(sb).alloc_blksz_shift = 0;
+       sbi->alloc_blksz = blocksize;
+       sbi->alloc_blksz_shift = 0;
         while ((blocksize >>= 1) != 0)
-               HFSPLUS_SB(sb).alloc_blksz_shift++;
-       blocksize = min(HFSPLUS_SB(sb).alloc_blksz, (u32)PAGE_SIZE);
+               sbi->alloc_blksz_shift++;
+       blocksize = min(sbi->alloc_blksz, (u32)PAGE_SIZE);
  
         /* align block size to block offset */
         while (part_start & ((blocksize >> HFSPLUS_SECTOR_SHIFT) - 1))
@@ -158,23 +159,26 @@ int hfsplus_read_wrapper(struct super_block *sb)
                 return -EINVAL;
         }
  
-       HFSPLUS_SB(sb).blockoffset = part_start >>
-                       (sb->s_blocksize_bits - HFSPLUS_SECTOR_SHIFT);
-       HFSPLUS_SB(sb).sect_count = part_size;
-       HFSPLUS_SB(sb).fs_shift = HFSPLUS_SB(sb).alloc_blksz_shift -
-                       sb->s_blocksize_bits;
+       sbi->blockoffset =
+               part_start >> (sb->s_blocksize_bits - HFSPLUS_SECTOR_SHIFT);
+       sbi->sect_count = part_size;
+       sbi->fs_shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits;
  
         bh = sb_bread512(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, vhdr);
         if (!bh)
                 return -EIO;
  
         /* should still be the same... */
-       if (vhdr->signature != (HFSPLUS_SB(sb).flags & HFSPLUS_SB_HFSX ?
-                               cpu_to_be16(HFSPLUS_VOLHEAD_SIGX) :
-                               cpu_to_be16(HFSPLUS_VOLHEAD_SIG)))
-               goto error;
-       HFSPLUS_SB(sb).s_vhbh = bh;
-       HFSPLUS_SB(sb).s_vhdr = vhdr;
+       if (test_bit(HFSPLUS_SB_HFSX, &sbi->flags)) {
+               if (vhdr->signature != cpu_to_be16(HFSPLUS_VOLHEAD_SIGX))
+                       goto error;
+       } else {
+               if (vhdr->signature != cpu_to_be16(HFSPLUS_VOLHEAD_SIG))
+                       goto error;
+       }
+
+       sbi->s_vhbh = bh;
+       sbi->s_vhdr = vhdr;
  
         return 0;
   error:
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h

index e2bd73e..f4d4120 100644 (file)
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -129,6 +129,10 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres
  #define move_pte(pte, prot, old_addr, new_addr)        (pte)
  #endif
  
+#ifndef flush_tlb_fix_spurious_fault
+#define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address)
+#endif
+
  #ifndef pgprot_noncached
  #define pgprot_noncached(prot) (prot)
  #endif
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h

index ef2af99..f4229fb 100644 (file)
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -687,7 +687,9 @@
                                 - LOAD_OFFSET) {                        \
                 VMLINUX_SYMBOL(__per_cpu_start) = .;                    \
                 *(.data..percpu..first)                                 \
+               . = ALIGN(PAGE_SIZE);                                   \
                 *(.data..percpu..page_aligned)                          \
+               *(.data..percpu..readmostly)                            \
                 *(.data..percpu)                                        \
                 *(.data..percpu..shared_aligned)                        \
                 VMLINUX_SYMBOL(__per_cpu_end) = .;                      \
@@ -713,7 +715,9 @@
                 VMLINUX_SYMBOL(__per_cpu_load) = .;                     \
                 VMLINUX_SYMBOL(__per_cpu_start) = .;                    \
                 *(.data..percpu..first)                                 \
+               . = ALIGN(PAGE_SIZE);                                   \
                 *(.data..percpu..page_aligned)                          \
+               *(.data..percpu..readmostly)                            \
                 *(.data..percpu)                                        \
                 *(.data..percpu..shared_aligned)                        \
                 VMLINUX_SYMBOL(__per_cpu_end) = .;                      \
diff --git a/include/linux/acpi_pmtmr.h b/include/linux/acpi_pmtmr.h

index 7e3d285..1d0ef1a 100644 (file)
--- a/include/linux/acpi_pmtmr.h
+++ b/include/linux/acpi_pmtmr.h
@@ -25,8 +25,6 @@ static inline u32 acpi_pm_read_early(void)
         return acpi_pm_read_verified() & ACPI_PM_MASK;
  }
  
-extern void pmtimer_wait(unsigned);
-
  #else
  
  static inline u32 acpi_pm_read_early(void)
diff --git a/include/linux/ceph/auth.h b/include/linux/ceph/auth.h

new file mode 100644 (file)

index 0000000..7fff521
--- /dev/null
+++ b/include/linux/ceph/auth.h
@@ -0,0 +1,92 @@
+#ifndef _FS_CEPH_AUTH_H
+#define _FS_CEPH_AUTH_H
+
+#include <linux/ceph/types.h>
+#include <linux/ceph/buffer.h>
+
+/*
+ * Abstract interface for communicating with the authenticate module.
+ * There is some handshake that takes place between us and the monitor
+ * to acquire the necessary keys.  These are used to generate an
+ * 'authorizer' that we use when connecting to a service (mds, osd).
+ */
+
+struct ceph_auth_client;
+struct ceph_authorizer;
+
+struct ceph_auth_client_ops {
+       const char *name;
+
+       /*
+        * true if we are authenticated and can connect to
+        * services.
+        */
+       int (*is_authenticated)(struct ceph_auth_client *ac);
+
+       /*
+        * true if we should (re)authenticate, e.g., when our tickets
+        * are getting old and crusty.
+        */
+       int (*should_authenticate)(struct ceph_auth_client *ac);
+
+       /*
+        * build requests and process replies during monitor
+        * handshake.  if handle_reply returns -EAGAIN, we build
+        * another request.
+        */
+       int (*build_request)(struct ceph_auth_client *ac, void *buf, void *end);
+       int (*handle_reply)(struct ceph_auth_client *ac, int result,
+                           void *buf, void *end);
+
+       /*
+        * Create authorizer for connecting to a service, and verify
+        * the response to authenticate the service.
+        */
+       int (*create_authorizer)(struct ceph_auth_client *ac, int peer_type,
+                                struct ceph_authorizer **a,
+                                void **buf, size_t *len,
+                                void **reply_buf, size_t *reply_len);
+       int (*verify_authorizer_reply)(struct ceph_auth_client *ac,
+                                      struct ceph_authorizer *a, size_t len);
+       void (*destroy_authorizer)(struct ceph_auth_client *ac,
+                                  struct ceph_authorizer *a);
+       void (*invalidate_authorizer)(struct ceph_auth_client *ac,
+                                     int peer_type);
+
+       /* reset when we (re)connect to a monitor */
+       void (*reset)(struct ceph_auth_client *ac);
+
+       void (*destroy)(struct ceph_auth_client *ac);
+};
+
+struct ceph_auth_client {
+       u32 protocol;           /* CEPH_AUTH_* */
+       void *private;          /* for use by protocol implementation */
+       const struct ceph_auth_client_ops *ops;  /* null iff protocol==0 */
+
+       bool negotiating;       /* true if negotiating protocol */
+       const char *name;       /* entity name */
+       u64 global_id;          /* our unique id in system */
+       const char *secret;     /* our secret key */
+       unsigned want_keys;     /* which services we want */
+};
+
+extern struct ceph_auth_client *ceph_auth_init(const char *name,
+                                              const char *secret);
+extern void ceph_auth_destroy(struct ceph_auth_client *ac);
+
+extern void ceph_auth_reset(struct ceph_auth_client *ac);
+
+extern int ceph_auth_build_hello(struct ceph_auth_client *ac,
+                                void *buf, size_t len);
+extern int ceph_handle_auth_reply(struct ceph_auth_client *ac,
+                                 void *buf, size_t len,
+                                 void *reply_buf, size_t reply_len);
+extern int ceph_entity_name_encode(const char *name, void **p, void *end);
+
+extern int ceph_build_auth(struct ceph_auth_client *ac,
+                   void *msg_buf, size_t msg_len);
+
+extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac);
+
+#endif
diff --git a/include/linux/ceph/buffer.h b/include/linux/ceph/buffer.h

new file mode 100644 (file)

index 0000000..58d1901
--- /dev/null
+++ b/include/linux/ceph/buffer.h
@@ -0,0 +1,39 @@
+#ifndef __FS_CEPH_BUFFER_H
+#define __FS_CEPH_BUFFER_H
+
+#include <linux/kref.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/types.h>
+#include <linux/uio.h>
+
+/*
+ * a simple reference counted buffer.
+ *
+ * use kmalloc for small sizes (<= one page), vmalloc for larger
+ * sizes.
+ */
+struct ceph_buffer {
+       struct kref kref;
+       struct kvec vec;
+       size_t alloc_len;
+       bool is_vmalloc;
+};
+
+extern struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp);
+extern void ceph_buffer_release(struct kref *kref);
+
+static inline struct ceph_buffer *ceph_buffer_get(struct ceph_buffer *b)
+{
+       kref_get(&b->kref);
+       return b;
+}
+
+static inline void ceph_buffer_put(struct ceph_buffer *b)
+{
+       kref_put(&b->kref, ceph_buffer_release);
+}
+
+extern int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end);
+
+#endif
diff --git a/include/linux/ceph/ceph_debug.h b/include/linux/ceph/ceph_debug.h

new file mode 100644 (file)

index 0000000..aa2e191
--- /dev/null
+++ b/include/linux/ceph/ceph_debug.h
@@ -0,0 +1,38 @@
+#ifndef _FS_CEPH_DEBUG_H
+#define _FS_CEPH_DEBUG_H
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#ifdef CONFIG_CEPH_LIB_PRETTYDEBUG
+
+/*
+ * wrap pr_debug to include a filename:lineno prefix on each line.
+ * this incurs some overhead (kernel size and execution time) due to
+ * the extra function call at each call site.
+ */
+
+# if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)
+extern const char *ceph_file_part(const char *s, int len);
+#  define dout(fmt, ...)                                               \
+       pr_debug("%.*s %12.12s:%-4d : " fmt,                            \
+                8 - (int)sizeof(KBUILD_MODNAME), "    ",               \
+                ceph_file_part(__FILE__, sizeof(__FILE__)),            \
+                __LINE__, ##__VA_ARGS__)
+# else
+/* faux printk call just to see any compiler warnings. */
+#  define dout(fmt, ...)       do {                            \
+               if (0)                                          \
+                       printk(KERN_DEBUG fmt, ##__VA_ARGS__);  \
+       } while (0)
+# endif
+
+#else
+
+/*
+ * or, just wrap pr_debug
+ */
+# define dout(fmt, ...)        pr_debug(" " fmt, ##__VA_ARGS__)
+
+#endif
+
+#endif
diff --git a/include/linux/ceph/ceph_frag.h b/include/linux/ceph/ceph_frag.h

new file mode 100644 (file)

index 0000000..5babb8e
--- /dev/null
+++ b/include/linux/ceph/ceph_frag.h
@@ -0,0 +1,109 @@
+#ifndef FS_CEPH_FRAG_H
+#define FS_CEPH_FRAG_H
+
+/*
+ * "Frags" are a way to describe a subset of a 32-bit number space,
+ * using a mask and a value to match against that mask.  Any given frag
+ * (subset of the number space) can be partitioned into 2^n sub-frags.
+ *
+ * Frags are encoded into a 32-bit word:
+ *   8 upper bits = "bits"
+ *  24 lower bits = "value"
+ * (We could go to 5+27 bits, but who cares.)
+ *
+ * We use the _most_ significant bits of the 24 bit value.  This makes
+ * values logically sort.
+ *
+ * Unfortunately, because the "bits" field is still in the high bits, we
+ * can't sort encoded frags numerically.  However, it does allow you
+ * to feed encoded frags as values into frag_contains_value.
+ */
+static inline __u32 ceph_frag_make(__u32 b, __u32 v)
+{
+       return (b << 24) |
+               (v & (0xffffffu << (24-b)) & 0xffffffu);
+}
+static inline __u32 ceph_frag_bits(__u32 f)
+{
+       return f >> 24;
+}
+static inline __u32 ceph_frag_value(__u32 f)
+{
+       return f & 0xffffffu;
+}
+static inline __u32 ceph_frag_mask(__u32 f)
+{
+       return (0xffffffu << (24-ceph_frag_bits(f))) & 0xffffffu;
+}
+static inline __u32 ceph_frag_mask_shift(__u32 f)
+{
+       return 24 - ceph_frag_bits(f);
+}
+
+static inline int ceph_frag_contains_value(__u32 f, __u32 v)
+{
+       return (v & ceph_frag_mask(f)) == ceph_frag_value(f);
+}
+static inline int ceph_frag_contains_frag(__u32 f, __u32 sub)
+{
+       /* is sub as specific as us, and contained by us? */
+       return ceph_frag_bits(sub) >= ceph_frag_bits(f) &&
+              (ceph_frag_value(sub) & ceph_frag_mask(f)) == ceph_frag_value(f);
+}
+
+static inline __u32 ceph_frag_parent(__u32 f)
+{
+       return ceph_frag_make(ceph_frag_bits(f) - 1,
+                        ceph_frag_value(f) & (ceph_frag_mask(f) << 1));
+}
+static inline int ceph_frag_is_left_child(__u32 f)
+{
+       return ceph_frag_bits(f) > 0 &&
+               (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 0;
+}
+static inline int ceph_frag_is_right_child(__u32 f)
+{
+       return ceph_frag_bits(f) > 0 &&  /* a 0-bit frag has no parent */
+               (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) != 0;
+}
+static inline __u32 ceph_frag_sibling(__u32 f)
+{
+       return ceph_frag_make(ceph_frag_bits(f),
+                     ceph_frag_value(f) ^ (0x1000000 >> ceph_frag_bits(f)));
+}
+static inline __u32 ceph_frag_left_child(__u32 f)
+{
+       return ceph_frag_make(ceph_frag_bits(f)+1, ceph_frag_value(f));
+}
+static inline __u32 ceph_frag_right_child(__u32 f)
+{
+       return ceph_frag_make(ceph_frag_bits(f)+1,
+             ceph_frag_value(f) | (0x1000000 >> (1+ceph_frag_bits(f))));
+}
+static inline __u32 ceph_frag_make_child(__u32 f, int by, int i)
+{
+       int newbits = ceph_frag_bits(f) + by;
+       return ceph_frag_make(newbits,
+                        ceph_frag_value(f) | (i << (24 - newbits)));
+}
+static inline int ceph_frag_is_leftmost(__u32 f)
+{
+       return ceph_frag_value(f) == 0;
+}
+static inline int ceph_frag_is_rightmost(__u32 f)
+{
+       return ceph_frag_value(f) == ceph_frag_mask(f);
+}
+static inline __u32 ceph_frag_next(__u32 f)
+{
+       return ceph_frag_make(ceph_frag_bits(f),
+                        ceph_frag_value(f) + (0x1000000 >> ceph_frag_bits(f)));
+}
+
+/*
+ * comparator to sort frags logically, as when traversing the
+ * number space in ascending order...
+ */
+int ceph_frag_compare(__u32 a, __u32 b);
+
+#endif
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h

new file mode 100644 (file)

index 0000000..c3c74ae
--- /dev/null
+++ b/include/linux/ceph/ceph_fs.h
@@ -0,0 +1,729 @@
+/*
+ * ceph_fs.h - Ceph constants and data types to share between kernel and
+ * user space.
+ *
+ * Most types in this file are defined as little-endian, and are
+ * primarily intended to describe data structures that pass over the
+ * wire or that are stored on disk.
+ *
+ * LGPL2
+ */
+
+#ifndef CEPH_FS_H
+#define CEPH_FS_H
+
+#include "msgr.h"
+#include "rados.h"
+
+/*
+ * subprotocol versions.  when specific messages types or high-level
+ * protocols change, bump the affected components.  we rev the
+ * internal cluster protocols separately from the public,
+ * client-facing protocol.
+ */
+#define CEPH_OSD_PROTOCOL     8 /* cluster internal */
+#define CEPH_MDS_PROTOCOL    12 /* cluster internal */
+#define CEPH_MON_PROTOCOL     5 /* cluster internal */
+#define CEPH_OSDC_PROTOCOL   24 /* server/client */
+#define CEPH_MDSC_PROTOCOL   32 /* server/client */
+#define CEPH_MONC_PROTOCOL   15 /* server/client */
+
+
+#define CEPH_INO_ROOT  1
+#define CEPH_INO_CEPH  2        /* hidden .ceph dir */
+
+/* arbitrary limit on max # of monitors (cluster of 3 is typical) */
+#define CEPH_MAX_MON   31
+
+
+/*
+ * feature bits
+ */
+#define CEPH_FEATURE_UID            (1<<0)
+#define CEPH_FEATURE_NOSRCADDR      (1<<1)
+#define CEPH_FEATURE_MONCLOCKCHECK  (1<<2)
+#define CEPH_FEATURE_FLOCK          (1<<3)
+
+
+/*
+ * ceph_file_layout - describe data layout for a file/inode
+ */
+struct ceph_file_layout {
+       /* file -> object mapping */
+       __le32 fl_stripe_unit;     /* stripe unit, in bytes.  must be multiple
+                                     of page size. */
+       __le32 fl_stripe_count;    /* over this many objects */
+       __le32 fl_object_size;     /* until objects are this big, then move to
+                                     new objects */
+       __le32 fl_cas_hash;        /* 0 = none; 1 = sha256 */
+
+       /* pg -> disk layout */
+       __le32 fl_object_stripe_unit;  /* for per-object parity, if any */
+
+       /* object -> pg layout */
+       __le32 fl_pg_preferred; /* preferred primary for pg (-1 for none) */
+       __le32 fl_pg_pool;      /* namespace, crush ruleset, rep level */
+} __attribute__ ((packed));
+
+#define CEPH_MIN_STRIPE_UNIT 65536
+
+int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
+
+
+/* crypto algorithms */
+#define CEPH_CRYPTO_NONE 0x0
+#define CEPH_CRYPTO_AES  0x1
+
+#define CEPH_AES_IV "cephsageyudagreg"
+
+/* security/authentication protocols */
+#define CEPH_AUTH_UNKNOWN      0x0
+#define CEPH_AUTH_NONE         0x1
+#define CEPH_AUTH_CEPHX                0x2
+
+#define CEPH_AUTH_UID_DEFAULT ((__u64) -1)
+
+
+/*********************************************
+ * message layer
+ */
+
+/*
+ * message types
+ */
+
+/* misc */
+#define CEPH_MSG_SHUTDOWN               1
+#define CEPH_MSG_PING                   2
+
+/* client <-> monitor */
+#define CEPH_MSG_MON_MAP                4
+#define CEPH_MSG_MON_GET_MAP            5
+#define CEPH_MSG_STATFS                 13
+#define CEPH_MSG_STATFS_REPLY           14
+#define CEPH_MSG_MON_SUBSCRIBE          15
+#define CEPH_MSG_MON_SUBSCRIBE_ACK      16
+#define CEPH_MSG_AUTH                  17
+#define CEPH_MSG_AUTH_REPLY            18
+
+/* client <-> mds */
+#define CEPH_MSG_MDS_MAP                21
+
+#define CEPH_MSG_CLIENT_SESSION         22
+#define CEPH_MSG_CLIENT_RECONNECT       23
+
+#define CEPH_MSG_CLIENT_REQUEST         24
+#define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
+#define CEPH_MSG_CLIENT_REPLY           26
+#define CEPH_MSG_CLIENT_CAPS            0x310
+#define CEPH_MSG_CLIENT_LEASE           0x311
+#define CEPH_MSG_CLIENT_SNAP            0x312
+#define CEPH_MSG_CLIENT_CAPRELEASE      0x313
+
+/* pool ops */
+#define CEPH_MSG_POOLOP_REPLY           48
+#define CEPH_MSG_POOLOP                 49
+
+
+/* osd */
+#define CEPH_MSG_OSD_MAP          41
+#define CEPH_MSG_OSD_OP           42
+#define CEPH_MSG_OSD_OPREPLY      43
+
+/* pool operations */
+enum {
+  POOL_OP_CREATE                       = 0x01,
+  POOL_OP_DELETE                       = 0x02,
+  POOL_OP_AUID_CHANGE                  = 0x03,
+  POOL_OP_CREATE_SNAP                  = 0x11,
+  POOL_OP_DELETE_SNAP                  = 0x12,
+  POOL_OP_CREATE_UNMANAGED_SNAP                = 0x21,
+  POOL_OP_DELETE_UNMANAGED_SNAP                = 0x22,
+};
+
+struct ceph_mon_request_header {
+       __le64 have_version;
+       __le16 session_mon;
+       __le64 session_mon_tid;
+} __attribute__ ((packed));
+
+struct ceph_mon_statfs {
+       struct ceph_mon_request_header monhdr;
+       struct ceph_fsid fsid;
+} __attribute__ ((packed));
+
+struct ceph_statfs {
+       __le64 kb, kb_used, kb_avail;
+       __le64 num_objects;
+} __attribute__ ((packed));
+
+struct ceph_mon_statfs_reply {
+       struct ceph_fsid fsid;
+       __le64 version;
+       struct ceph_statfs st;
+} __attribute__ ((packed));
+
+const char *ceph_pool_op_name(int op);
+
+struct ceph_mon_poolop {
+       struct ceph_mon_request_header monhdr;
+       struct ceph_fsid fsid;
+       __le32 pool;
+       __le32 op;
+       __le64 auid;
+       __le64 snapid;
+       __le32 name_len;
+} __attribute__ ((packed));
+
+struct ceph_mon_poolop_reply {
+       struct ceph_mon_request_header monhdr;
+       struct ceph_fsid fsid;
+       __le32 reply_code;
+       __le32 epoch;
+       char has_data;
+       char data[0];
+} __attribute__ ((packed));
+
+struct ceph_mon_unmanaged_snap {
+       __le64 snapid;
+} __attribute__ ((packed));
+
+struct ceph_osd_getmap {
+       struct ceph_mon_request_header monhdr;
+       struct ceph_fsid fsid;
+       __le32 start;
+} __attribute__ ((packed));
+
+struct ceph_mds_getmap {
+       struct ceph_mon_request_header monhdr;
+       struct ceph_fsid fsid;
+} __attribute__ ((packed));
+
+struct ceph_client_mount {
+       struct ceph_mon_request_header monhdr;
+} __attribute__ ((packed));
+
+struct ceph_mon_subscribe_item {
+       __le64 have_version;    __le64 have;
+       __u8 onetime;
+} __attribute__ ((packed));
+
+struct ceph_mon_subscribe_ack {
+       __le32 duration;         /* seconds */
+       struct ceph_fsid fsid;
+} __attribute__ ((packed));
+
+/*
+ * mds states
+ *   > 0 -> in
+ *  <= 0 -> out
+ */
+#define CEPH_MDS_STATE_DNE          0  /* down, does not exist. */
+#define CEPH_MDS_STATE_STOPPED     -1  /* down, once existed, but no subtrees.
+                                         empty log. */
+#define CEPH_MDS_STATE_BOOT        -4  /* up, boot announcement. */
+#define CEPH_MDS_STATE_STANDBY     -5  /* up, idle.  waiting for assignment. */
+#define CEPH_MDS_STATE_CREATING    -6  /* up, creating MDS instance. */
+#define CEPH_MDS_STATE_STARTING    -7  /* up, starting previously stopped mds */
+#define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */
+
+#define CEPH_MDS_STATE_REPLAY       8  /* up, replaying journal. */
+#define CEPH_MDS_STATE_RESOLVE      9  /* up, disambiguating distributed
+                                         operations (import, rename, etc.) */
+#define CEPH_MDS_STATE_RECONNECT    10 /* up, reconnect to clients */
+#define CEPH_MDS_STATE_REJOIN       11 /* up, rejoining distributed cache */
+#define CEPH_MDS_STATE_CLIENTREPLAY 12 /* up, replaying client operations */
+#define CEPH_MDS_STATE_ACTIVE       13 /* up, active */
+#define CEPH_MDS_STATE_STOPPING     14 /* up, but exporting metadata */
+
+extern const char *ceph_mds_state_name(int s);
+
+
+/*
+ * metadata lock types.
+ *  - these are bitmasks.. we can compose them
+ *  - they also define the lock ordering by the MDS
+ *  - a few of these are internal to the mds
+ */
+#define CEPH_LOCK_DVERSION    1
+#define CEPH_LOCK_DN          2
+#define CEPH_LOCK_ISNAP       16
+#define CEPH_LOCK_IVERSION    32    /* mds internal */
+#define CEPH_LOCK_IFILE       64
+#define CEPH_LOCK_IAUTH       128
+#define CEPH_LOCK_ILINK       256
+#define CEPH_LOCK_IDFT        512   /* dir frag tree */
+#define CEPH_LOCK_INEST       1024  /* mds internal */
+#define CEPH_LOCK_IXATTR      2048
+#define CEPH_LOCK_IFLOCK      4096  /* advisory file locks */
+#define CEPH_LOCK_INO         8192  /* immutable inode bits; not a lock */
+
+/* client_session ops */
+enum {
+       CEPH_SESSION_REQUEST_OPEN,
+       CEPH_SESSION_OPEN,
+       CEPH_SESSION_REQUEST_CLOSE,
+       CEPH_SESSION_CLOSE,
+       CEPH_SESSION_REQUEST_RENEWCAPS,
+       CEPH_SESSION_RENEWCAPS,
+       CEPH_SESSION_STALE,
+       CEPH_SESSION_RECALL_STATE,
+};
+
+extern const char *ceph_session_op_name(int op);
+
+struct ceph_mds_session_head {
+       __le32 op;
+       __le64 seq;
+       struct ceph_timespec stamp;
+       __le32 max_caps, max_leases;
+} __attribute__ ((packed));
+
+/* client_request */
+/*
+ * metadata ops.
+ *  & 0x001000 -> write op
+ *  & 0x010000 -> follow symlink (e.g. stat(), not lstat()).
+ *  & 0x100000 -> use weird ino/path trace
+ */
+#define CEPH_MDS_OP_WRITE        0x001000
+enum {
+       CEPH_MDS_OP_LOOKUP     = 0x00100,
+       CEPH_MDS_OP_GETATTR    = 0x00101,
+       CEPH_MDS_OP_LOOKUPHASH = 0x00102,
+       CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
+
+       CEPH_MDS_OP_SETXATTR   = 0x01105,
+       CEPH_MDS_OP_RMXATTR    = 0x01106,
+       CEPH_MDS_OP_SETLAYOUT  = 0x01107,
+       CEPH_MDS_OP_SETATTR    = 0x01108,
+       CEPH_MDS_OP_SETFILELOCK= 0x01109,
+       CEPH_MDS_OP_GETFILELOCK= 0x00110,
+       CEPH_MDS_OP_SETDIRLAYOUT=0x0110a,
+
+       CEPH_MDS_OP_MKNOD      = 0x01201,
+       CEPH_MDS_OP_LINK       = 0x01202,
+       CEPH_MDS_OP_UNLINK     = 0x01203,
+       CEPH_MDS_OP_RENAME     = 0x01204,
+       CEPH_MDS_OP_MKDIR      = 0x01220,
+       CEPH_MDS_OP_RMDIR      = 0x01221,
+       CEPH_MDS_OP_SYMLINK    = 0x01222,
+
+       CEPH_MDS_OP_CREATE     = 0x01301,
+       CEPH_MDS_OP_OPEN       = 0x00302,
+       CEPH_MDS_OP_READDIR    = 0x00305,
+
+       CEPH_MDS_OP_LOOKUPSNAP = 0x00400,
+       CEPH_MDS_OP_MKSNAP     = 0x01400,
+       CEPH_MDS_OP_RMSNAP     = 0x01401,
+       CEPH_MDS_OP_LSSNAP     = 0x00402,
+};
+
+extern const char *ceph_mds_op_name(int op);
+
+
+#define CEPH_SETATTR_MODE   1
+#define CEPH_SETATTR_UID    2
+#define CEPH_SETATTR_GID    4
+#define CEPH_SETATTR_MTIME  8
+#define CEPH_SETATTR_ATIME 16
+#define CEPH_SETATTR_SIZE  32
+#define CEPH_SETATTR_CTIME 64
+
+union ceph_mds_request_args {
+       struct {
+               __le32 mask;                 /* CEPH_CAP_* */
+       } __attribute__ ((packed)) getattr;
+       struct {
+               __le32 mode;
+               __le32 uid;
+               __le32 gid;
+               struct ceph_timespec mtime;
+               struct ceph_timespec atime;
+               __le64 size, old_size;       /* old_size needed by truncate */
+               __le32 mask;                 /* CEPH_SETATTR_* */
+       } __attribute__ ((packed)) setattr;
+       struct {
+               __le32 frag;                 /* which dir fragment */
+               __le32 max_entries;          /* how many dentries to grab */
+               __le32 max_bytes;
+       } __attribute__ ((packed)) readdir;
+       struct {
+               __le32 mode;
+               __le32 rdev;
+       } __attribute__ ((packed)) mknod;
+       struct {
+               __le32 mode;
+       } __attribute__ ((packed)) mkdir;
+       struct {
+               __le32 flags;
+               __le32 mode;
+               __le32 stripe_unit;          /* layout for newly created file */
+               __le32 stripe_count;         /* ... */
+               __le32 object_size;
+               __le32 file_replication;
+               __le32 preferred;
+       } __attribute__ ((packed)) open;
+       struct {
+               __le32 flags;
+       } __attribute__ ((packed)) setxattr;
+       struct {
+               struct ceph_file_layout layout;
+       } __attribute__ ((packed)) setlayout;
+       struct {
+               __u8 rule; /* currently fcntl or flock */
+               __u8 type; /* shared, exclusive, remove*/
+               __le64 pid; /* process id requesting the lock */
+               __le64 pid_namespace;
+               __le64 start; /* initial location to lock */
+               __le64 length; /* num bytes to lock from start */
+               __u8 wait; /* will caller wait for lock to become available? */
+       } __attribute__ ((packed)) filelock_change;
+} __attribute__ ((packed));
+
+#define CEPH_MDS_FLAG_REPLAY        1  /* this is a replayed op */
+#define CEPH_MDS_FLAG_WANT_DENTRY   2  /* want dentry in reply */
+
+struct ceph_mds_request_head {
+       __le64 oldest_client_tid;
+       __le32 mdsmap_epoch;           /* on client */
+       __le32 flags;                  /* CEPH_MDS_FLAG_* */
+       __u8 num_retry, num_fwd;       /* count retry, fwd attempts */
+       __le16 num_releases;           /* # include cap/lease release records */
+       __le32 op;                     /* mds op code */
+       __le32 caller_uid, caller_gid;
+       __le64 ino;                    /* use this ino for openc, mkdir, mknod,
+                                         etc. (if replaying) */
+       union ceph_mds_request_args args;
+} __attribute__ ((packed));
+
+/* cap/lease release record */
+struct ceph_mds_request_release {
+       __le64 ino, cap_id;            /* ino and unique cap id */
+       __le32 caps, wanted;           /* new issued, wanted */
+       __le32 seq, issue_seq, mseq;
+       __le32 dname_seq;              /* if releasing a dentry lease, a */
+       __le32 dname_len;              /* string follows. */
+} __attribute__ ((packed));
+
+/* client reply */
+struct ceph_mds_reply_head {
+       __le32 op;
+       __le32 result;
+       __le32 mdsmap_epoch;
+       __u8 safe;                     /* true if committed to disk */
+       __u8 is_dentry, is_target;     /* true if dentry, target inode records
+                                         are included with reply */
+} __attribute__ ((packed));
+
+/* one for each node split */
+struct ceph_frag_tree_split {
+       __le32 frag;                   /* this frag splits... */
+       __le32 by;                     /* ...by this many bits */
+} __attribute__ ((packed));
+
+struct ceph_frag_tree_head {
+       __le32 nsplits;                /* num ceph_frag_tree_split records */
+       struct ceph_frag_tree_split splits[];
+} __attribute__ ((packed));
+
+/* capability issue, for bundling with mds reply */
+struct ceph_mds_reply_cap {
+       __le32 caps, wanted;           /* caps issued, wanted */
+       __le64 cap_id;
+       __le32 seq, mseq;
+       __le64 realm;                  /* snap realm */
+       __u8 flags;                    /* CEPH_CAP_FLAG_* */
+} __attribute__ ((packed));
+
+#define CEPH_CAP_FLAG_AUTH  1          /* cap is issued by auth mds */
+
+/* inode record, for bundling with mds reply */
+struct ceph_mds_reply_inode {
+       __le64 ino;
+       __le64 snapid;
+       __le32 rdev;
+       __le64 version;                /* inode version */
+       __le64 xattr_version;          /* version for xattr blob */
+       struct ceph_mds_reply_cap cap; /* caps issued for this inode */
+       struct ceph_file_layout layout;
+       struct ceph_timespec ctime, mtime, atime;
+       __le32 time_warp_seq;
+       __le64 size, max_size, truncate_size;
+       __le32 truncate_seq;
+       __le32 mode, uid, gid;
+       __le32 nlink;
+       __le64 files, subdirs, rbytes, rfiles, rsubdirs;  /* dir stats */
+       struct ceph_timespec rctime;
+       struct ceph_frag_tree_head fragtree;  /* (must be at end of struct) */
+} __attribute__ ((packed));
+/* followed by frag array, then symlink string, then xattr blob */
+
+/* reply_lease follows dname, and reply_inode */
+struct ceph_mds_reply_lease {
+       __le16 mask;            /* lease type(s) */
+       __le32 duration_ms;     /* lease duration */
+       __le32 seq;
+} __attribute__ ((packed));
+
+struct ceph_mds_reply_dirfrag {
+       __le32 frag;            /* fragment */
+       __le32 auth;            /* auth mds, if this is a delegation point */
+       __le32 ndist;           /* number of mds' this is replicated on */
+       __le32 dist[];
+} __attribute__ ((packed));
+
+#define CEPH_LOCK_FCNTL    1
+#define CEPH_LOCK_FLOCK    2
+
+#define CEPH_LOCK_SHARED   1
+#define CEPH_LOCK_EXCL     2
+#define CEPH_LOCK_UNLOCK   4
+
+struct ceph_filelock {
+       __le64 start;/* file offset to start lock at */
+       __le64 length; /* num bytes to lock; 0 for all following start */
+       __le64 client; /* which client holds the lock */
+       __le64 pid; /* process id holding the lock on the client */
+       __le64 pid_namespace;
+       __u8 type; /* shared lock, exclusive lock, or unlock */
+} __attribute__ ((packed));
+
+
+/* file access modes */
+#define CEPH_FILE_MODE_PIN        0
+#define CEPH_FILE_MODE_RD         1
+#define CEPH_FILE_MODE_WR         2
+#define CEPH_FILE_MODE_RDWR       3  /* RD | WR */
+#define CEPH_FILE_MODE_LAZY       4  /* lazy io */
+#define CEPH_FILE_MODE_NUM        8  /* bc these are bit fields.. mostly */
+
+int ceph_flags_to_mode(int flags);
+
+
+/* capability bits */
+#define CEPH_CAP_PIN         1  /* no specific capabilities beyond the pin */
+
+/* generic cap bits */
+#define CEPH_CAP_GSHARED     1  /* client can read */
+#define CEPH_CAP_GEXCL       2  /* client can read and update */
+#define CEPH_CAP_GCACHE      4  /* (file) client can cache reads */
+#define CEPH_CAP_GRD         8  /* (file) client can read */
+#define CEPH_CAP_GWR        16  /* (file) client can write */
+#define CEPH_CAP_GBUFFER    32  /* (file) client can buffer writes */
+#define CEPH_CAP_GWREXTEND  64  /* (file) client can extend EOF */
+#define CEPH_CAP_GLAZYIO   128  /* (file) client can perform lazy io */
+
+/* per-lock shift */
+#define CEPH_CAP_SAUTH      2
+#define CEPH_CAP_SLINK      4
+#define CEPH_CAP_SXATTR     6
+#define CEPH_CAP_SFILE      8
+#define CEPH_CAP_SFLOCK    20 
+
+#define CEPH_CAP_BITS       22
+
+/* composed values */
+#define CEPH_CAP_AUTH_SHARED  (CEPH_CAP_GSHARED  << CEPH_CAP_SAUTH)
+#define CEPH_CAP_AUTH_EXCL     (CEPH_CAP_GEXCL     << CEPH_CAP_SAUTH)
+#define CEPH_CAP_LINK_SHARED  (CEPH_CAP_GSHARED  << CEPH_CAP_SLINK)
+#define CEPH_CAP_LINK_EXCL     (CEPH_CAP_GEXCL     << CEPH_CAP_SLINK)
+#define CEPH_CAP_XATTR_SHARED (CEPH_CAP_GSHARED  << CEPH_CAP_SXATTR)
+#define CEPH_CAP_XATTR_EXCL    (CEPH_CAP_GEXCL     << CEPH_CAP_SXATTR)
+#define CEPH_CAP_FILE(x)    ((x) << CEPH_CAP_SFILE)  /* shift generic cap bit(s) x to the file shift; x may be an expression */
+#define CEPH_CAP_FILE_SHARED   (CEPH_CAP_GSHARED   << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_EXCL     (CEPH_CAP_GEXCL     << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_CACHE    (CEPH_CAP_GCACHE    << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_RD       (CEPH_CAP_GRD       << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_WR       (CEPH_CAP_GWR       << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_BUFFER   (CEPH_CAP_GBUFFER   << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_LAZYIO   (CEPH_CAP_GLAZYIO   << CEPH_CAP_SFILE)
+#define CEPH_CAP_FLOCK_SHARED  (CEPH_CAP_GSHARED   << CEPH_CAP_SFLOCK)
+#define CEPH_CAP_FLOCK_EXCL    (CEPH_CAP_GEXCL     << CEPH_CAP_SFLOCK)
+
+
+/* cap masks (for getattr) */
+#define CEPH_STAT_CAP_INODE    CEPH_CAP_PIN
+#define CEPH_STAT_CAP_TYPE     CEPH_CAP_PIN  /* mode >> 12 */
+#define CEPH_STAT_CAP_SYMLINK  CEPH_CAP_PIN
+#define CEPH_STAT_CAP_UID      CEPH_CAP_AUTH_SHARED
+#define CEPH_STAT_CAP_GID      CEPH_CAP_AUTH_SHARED
+#define CEPH_STAT_CAP_MODE     CEPH_CAP_AUTH_SHARED
+#define CEPH_STAT_CAP_NLINK    CEPH_CAP_LINK_SHARED
+#define CEPH_STAT_CAP_LAYOUT   CEPH_CAP_FILE_SHARED
+#define CEPH_STAT_CAP_MTIME    CEPH_CAP_FILE_SHARED
+#define CEPH_STAT_CAP_SIZE     CEPH_CAP_FILE_SHARED
+#define CEPH_STAT_CAP_ATIME    CEPH_CAP_FILE_SHARED  /* fixme */
+#define CEPH_STAT_CAP_XATTR    CEPH_CAP_XATTR_SHARED
+#define CEPH_STAT_CAP_INODE_ALL (CEPH_CAP_PIN |                        \
+                                CEPH_CAP_AUTH_SHARED | \
+                                CEPH_CAP_LINK_SHARED | \
+                                CEPH_CAP_FILE_SHARED | \
+                                CEPH_CAP_XATTR_SHARED)
+
+#define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED |                    \
+                             CEPH_CAP_LINK_SHARED |                    \
+                             CEPH_CAP_XATTR_SHARED |                   \
+                             CEPH_CAP_FILE_SHARED)
+#define CEPH_CAP_ANY_RD   (CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_RD |    \
+                          CEPH_CAP_FILE_CACHE)
+
+#define CEPH_CAP_ANY_EXCL (CEPH_CAP_AUTH_EXCL |                \
+                          CEPH_CAP_LINK_EXCL |         \
+                          CEPH_CAP_XATTR_EXCL |        \
+                          CEPH_CAP_FILE_EXCL)
+#define CEPH_CAP_ANY_FILE_WR (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |        \
+                             CEPH_CAP_FILE_EXCL)
+#define CEPH_CAP_ANY_WR   (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR)
+#define CEPH_CAP_ANY      (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \
+                          CEPH_CAP_ANY_FILE_WR | CEPH_CAP_FILE_LAZYIO | \
+                          CEPH_CAP_PIN)
+
+#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \
+                       CEPH_LOCK_IXATTR)
+
+int ceph_caps_for_mode(int mode);
+
+enum {
+       CEPH_CAP_OP_GRANT,         /* mds->client grant */
+       CEPH_CAP_OP_REVOKE,        /* mds->client revoke */
+       CEPH_CAP_OP_TRUNC,         /* mds->client trunc notify */
+       CEPH_CAP_OP_EXPORT,        /* mds has exported the cap */
+       CEPH_CAP_OP_IMPORT,        /* mds has imported the cap */
+       CEPH_CAP_OP_UPDATE,        /* client->mds update */
+       CEPH_CAP_OP_DROP,          /* client->mds drop cap bits */
+       CEPH_CAP_OP_FLUSH,         /* client->mds cap writeback */
+       CEPH_CAP_OP_FLUSH_ACK,     /* mds->client flushed */
+       CEPH_CAP_OP_FLUSHSNAP,     /* client->mds flush snapped metadata */
+       CEPH_CAP_OP_FLUSHSNAP_ACK, /* mds->client flushed snapped metadata */
+       CEPH_CAP_OP_RELEASE,       /* client->mds release (clean) cap */
+       CEPH_CAP_OP_RENEW,         /* client->mds renewal request */
+};
+
+extern const char *ceph_cap_op_name(int op);
+
+/*
+ * caps message, used for capability callbacks, acks, requests, etc.
+ */
+struct ceph_mds_caps {
+       __le32 op;                  /* CEPH_CAP_OP_* */
+       __le64 ino, realm;
+       __le64 cap_id;
+       __le32 seq, issue_seq;
+       __le32 caps, wanted, dirty; /* latest issued/wanted/dirty */
+       __le32 migrate_seq;
+       __le64 snap_follows;
+       __le32 snap_trace_len;
+
+       /* authlock */
+       __le32 uid, gid, mode;
+
+       /* linklock */
+       __le32 nlink;
+
+       /* xattrlock */
+       __le32 xattr_len;
+       __le64 xattr_version;
+
+       /* filelock */
+       __le64 size, max_size, truncate_size;
+       __le32 truncate_seq;
+       struct ceph_timespec mtime, atime, ctime;
+       struct ceph_file_layout layout;
+       __le32 time_warp_seq;
+} __attribute__ ((packed));
+
+/* cap release msg head */
+struct ceph_mds_cap_release {
+       __le32 num;                /* number of cap_items that follow */
+} __attribute__ ((packed));
+
+struct ceph_mds_cap_item {
+       __le64 ino;
+       __le64 cap_id;
+       __le32 migrate_seq, seq;
+} __attribute__ ((packed));
+
+#define CEPH_MDS_LEASE_REVOKE           1  /*    mds  -> client */
+#define CEPH_MDS_LEASE_RELEASE          2  /* client  -> mds    */
+#define CEPH_MDS_LEASE_RENEW            3  /* client <-> mds    */
+#define CEPH_MDS_LEASE_REVOKE_ACK       4  /* client  -> mds    */
+
+extern const char *ceph_lease_op_name(int o);
+
+/* lease msg header */
+struct ceph_mds_lease {
+       __u8 action;            /* CEPH_MDS_LEASE_* */
+       __le16 mask;            /* which lease */
+       __le64 ino;
+       __le64 first, last;     /* snap range */
+       __le32 seq;
+       __le32 duration_ms;     /* duration of renewal */
+} __attribute__ ((packed));
+/* followed by a __le32+string for dname */
+
+/* client reconnect */
+struct ceph_mds_cap_reconnect {
+       __le64 cap_id;
+       __le32 wanted;
+       __le32 issued;
+       __le64 snaprealm;
+       __le64 pathbase;        /* base ino for our path to this ino */
+       __le32 flock_len;       /* size of flock state blob, if any */
+} __attribute__ ((packed));
+/* followed by flock blob */
+
+struct ceph_mds_cap_reconnect_v1 {
+       __le64 cap_id;
+       __le32 wanted;
+       __le32 issued;
+       __le64 size;
+       struct ceph_timespec mtime, atime;
+       __le64 snaprealm;
+       __le64 pathbase;        /* base ino for our path to this ino */
+} __attribute__ ((packed));
+
+struct ceph_mds_snaprealm_reconnect {
+       __le64 ino;     /* snap realm base */
+       __le64 seq;     /* snap seq for this snap realm */
+       __le64 parent;  /* parent realm */
+} __attribute__ ((packed));
+
+/*
+ * snaps
+ */
+enum {
+       CEPH_SNAP_OP_UPDATE,  /* CREATE or DESTROY */
+       CEPH_SNAP_OP_CREATE,
+       CEPH_SNAP_OP_DESTROY,
+       CEPH_SNAP_OP_SPLIT,
+};
+
+extern const char *ceph_snap_op_name(int o);
+
+/* snap msg header */
+struct ceph_mds_snap_head {
+       __le32 op;                /* CEPH_SNAP_OP_* */
+       __le64 split;             /* ino to split off, if any */
+       __le32 num_split_inos;    /* # inos belonging to new child realm */
+       __le32 num_split_realms;  /* # child realms under new child realm */
+       __le32 trace_len;         /* size of snap trace blob */
+} __attribute__ ((packed));
+/* followed by split ino list, then split realms, then the trace blob */
+
+/*
+ * encode info about a snaprealm, as viewed by a client
+ */
+struct ceph_mds_snap_realm {
+       __le64 ino;           /* ino */
+       __le64 created;       /* snap: when created */
+       __le64 parent;        /* ino: parent realm */
+       __le64 parent_since;  /* snap: same parent since */
+       __le64 seq;           /* snap: version */
+       __le32 num_snaps;
+       __le32 num_prior_parent_snaps;
+} __attribute__ ((packed));
+/* followed by my snap list, then prior parent snap list */
+
+#endif
diff --git a/include/linux/ceph/ceph_hash.h b/include/linux/ceph/ceph_hash.h

new file mode 100644 (file)

index 0000000..d099c3f
--- /dev/null
+++ b/include/linux/ceph/ceph_hash.h
@@ -0,0 +1,13 @@
+#ifndef FS_CEPH_HASH_H
+#define FS_CEPH_HASH_H
+
+#define CEPH_STR_HASH_LINUX      0x1  /* linux dcache hash */
+#define CEPH_STR_HASH_RJENKINS   0x2  /* robert jenkins' */
+
+extern unsigned ceph_str_hash_linux(const char *s, unsigned len);
+extern unsigned ceph_str_hash_rjenkins(const char *s, unsigned len);
+
+extern unsigned ceph_str_hash(int type, const char *s, unsigned len);
+extern const char *ceph_str_hash_name(int type);
+
+#endif
diff --git a/include/linux/ceph/debugfs.h b/include/linux/ceph/debugfs.h

new file mode 100644 (file)

index 0000000..2a79702
--- /dev/null
+++ b/include/linux/ceph/debugfs.h
@@ -0,0 +1,33 @@
+#ifndef _FS_CEPH_DEBUGFS_H
+#define _FS_CEPH_DEBUGFS_H
+
+#include "ceph_debug.h"
+#include "types.h"
+
+#define CEPH_DEFINE_SHOW_FUNC(name)                                    \
+static int name##_open(struct inode *inode, struct file *file)         \
+{                                                                      \
+       /*                                                              \
+        * Hand inode->i_private to single_open() as the seq_file       \
+        * private data and return its result.  After a failed          \
+        * single_open(), file->private_data may not point to a         \
+        * seq_file, so it is never dereferenced here.                  \
+        */                                                             \
+       return single_open(file, name, inode->i_private);               \
+}                                                                      \
+                                                                       \
+static const struct file_operations name##_fops = {                    \
+       .open           = name##_open,                                  \
+       .read           = seq_read,                                     \
+       .llseek         = seq_lseek,                                    \
+       .release        = single_release,                               \
+};
+
+/* debugfs.c */
+extern int ceph_debugfs_init(void);
+extern void ceph_debugfs_cleanup(void);
+extern int ceph_debugfs_client_init(struct ceph_client *client);
+extern void ceph_debugfs_client_cleanup(struct ceph_client *client);
+
+#endif
+
diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h

new file mode 100644 (file)

index 0000000..c5b6939
--- /dev/null
+++ b/include/linux/ceph/decode.h
@@ -0,0 +1,201 @@
+#ifndef __CEPH_DECODE_H
+#define __CEPH_DECODE_H
+
+#include <asm/unaligned.h>
+#include <linux/time.h>
+
+#include "types.h"
+
+/*
+ * in all cases,
+ *   void **p     pointer to position pointer
+ *   void *end    pointer to end of buffer (last byte + 1)
+ */
+
+static inline u64 ceph_decode_64(void **p)
+{
+       const void *cur = *p;                   /* where the value starts */
+       *p = (char *)*p + sizeof(u64);          /* step the cursor past it */
+       return get_unaligned_le64(cur);         /* little-endian, any alignment */
+}
+static inline u32 ceph_decode_32(void **p)
+{
+       const void *cur = *p;                   /* where the value starts */
+       *p = (char *)*p + sizeof(u32);          /* step the cursor past it */
+       return get_unaligned_le32(cur);         /* little-endian, any alignment */
+}
+static inline u16 ceph_decode_16(void **p)
+{
+       const void *cur = *p;                   /* where the value starts */
+       *p = (char *)*p + sizeof(u16);          /* step the cursor past it */
+       return get_unaligned_le16(cur);         /* little-endian, any alignment */
+}
+static inline u8 ceph_decode_8(void **p)
+{
+       const u8 *cur = *p;                     /* the byte under the cursor */
+       *p = (char *)*p + sizeof(u8);           /* advance by one byte */
+       return *cur;
+}
+static inline void ceph_decode_copy(void **p, void *pv, size_t n)
+{
+       memcpy(pv, *p, n);              /* n raw bytes from the cursor into pv */
+       *p = (char *)*p + n;            /* then move the cursor past them */
+}
+
+/*
+ * bounds check input.
+ */
+#define ceph_decode_need(p, end, n, bad)                               \
+       do {    /* compare n with the room left: *(p) + (n) could wrap */       \
+               if (unlikely((void *)(end) < (void *)*(p) || (n) > (char *)(end) - (char *)*(p))) \
+                       goto bad;                                       \
+       } while (0)
+
+#define ceph_decode_64_safe(p, end, v, bad)                    \
+       do {    /* goto bad unless sizeof(u64) bytes fit, else decode into v */ \
+               ceph_decode_need(p, end, sizeof(u64), bad);     \
+               v = ceph_decode_64(p);                          \
+       } while (0)
+#define ceph_decode_32_safe(p, end, v, bad)                    \
+       do {    /* goto bad unless sizeof(u32) bytes fit, else decode into v */ \
+               ceph_decode_need(p, end, sizeof(u32), bad);     \
+               v = ceph_decode_32(p);                          \
+       } while (0)
+#define ceph_decode_16_safe(p, end, v, bad)                    \
+       do {    /* goto bad unless sizeof(u16) bytes fit, else decode into v */ \
+               ceph_decode_need(p, end, sizeof(u16), bad);     \
+               v = ceph_decode_16(p);                          \
+       } while (0)
+#define ceph_decode_8_safe(p, end, v, bad)                     \
+       do {    /* goto bad unless sizeof(u8) bytes fit, else decode into v */  \
+               ceph_decode_need(p, end, sizeof(u8), bad);      \
+               v = ceph_decode_8(p);                           \
+       } while (0)
+
+#define ceph_decode_copy_safe(p, end, pv, n, bad)              \
+       do {    /* goto bad unless n bytes fit, else copy them to pv; n is expanded twice */    \
+               ceph_decode_need(p, end, n, bad);               \
+               ceph_decode_copy(p, pv, n);                     \
+       } while (0)
+
+/*
+ * struct ceph_timespec <-> struct timespec
+ */
+static inline void ceph_decode_timespec(struct timespec *ts,
+                                       const struct ceph_timespec *tv)
+{
+       ts->tv_nsec = le32_to_cpu(tv->tv_nsec); /* both fields are 32-bit LE in tv */
+       ts->tv_sec = le32_to_cpu(tv->tv_sec);
+}
+static inline void ceph_encode_timespec(struct ceph_timespec *tv,
+                                       const struct timespec *ts)
+{
+       tv->tv_nsec = cpu_to_le32(ts->tv_nsec); /* both fields stored as 32-bit LE */
+       tv->tv_sec = cpu_to_le32(ts->tv_sec);
+}
+
+/*
+ * sockaddr_storage <-> ceph_sockaddr
+ */
+static inline void ceph_encode_addr(struct ceph_entity_addr *a)
+{
+       __be16 ss_family = htons(a->in_addr.ss_family); /* family converted with htons() */
+       a->in_addr.ss_family = *(__u16 *)&ss_family;    /* raw big-endian bits written back in place */
+}
+static inline void ceph_decode_addr(struct ceph_entity_addr *a)
+{
+       __be16 ss_family = *(__be16 *)&a->in_addr.ss_family;    /* reinterpret the stored field as big-endian */
+       a->in_addr.ss_family = ntohs(ss_family);                /* converted with ntohs(), in place */
+       WARN_ON(a->in_addr.ss_family == 512);   /* NOTE(review): 512 is 2 byte-swapped; presumably flags an address that was not encoded -- confirm */
+}
+
+/*
+ * encoders
+ */
+static inline void ceph_encode_64(void **p, u64 v)
+{
+       put_unaligned_le64(v, (__le64 *)*p);    /* little-endian store at the cursor */
+       *p = (char *)*p + sizeof(u64);          /* step the cursor past it */
+}
+static inline void ceph_encode_32(void **p, u32 v)
+{
+       put_unaligned_le32(v, (__le32 *)*p);    /* little-endian store at the cursor */
+       *p = (char *)*p + sizeof(u32);          /* step the cursor past it */
+}
+static inline void ceph_encode_16(void **p, u16 v)
+{
+       put_unaligned_le16(v, (__le16 *)*p);    /* little-endian store at the cursor */
+       *p = (char *)*p + sizeof(u16);          /* step the cursor past it */
+}
+static inline void ceph_encode_8(void **p, u8 v)
+{
+       ((u8 *)*p)[0] = v;                      /* single byte at the cursor */
+       *p = (char *)*p + sizeof(u8);           /* advance by one byte */
+}
+static inline void ceph_encode_copy(void **p, const void *s, int len)
+{
+       memcpy(*p, s, len);             /* len raw bytes from s to the cursor */
+       *p = (char *)*p + len;          /* then move the cursor past them */
+}
+
+/*
+ * filepath, string encoders
+ */
+static inline void ceph_encode_filepath(void **p, void *end,
+                                       u64 ino, const char *path)
+{
+       u32 len = path ? strlen(path) : 0;      /* NULL path is encoded with length 0 */
+       BUG_ON(*p + 1 + sizeof(ino) + sizeof(len) + len > end); /* 1 = leading byte below */
+       ceph_encode_8(p, 1);                    /* leading byte, always 1 */
+       ceph_encode_64(p, ino);
+       ceph_encode_32(p, len);                 /* u32 length, then the path bytes (no NUL) */
+       if (len)
+               memcpy(*p, path, len);
+       *p += len;
+}
+
+static inline void ceph_encode_string(void **p, void *end,
+                                     const char *s, u32 len)
+{
+       BUG_ON(*p + sizeof(u32) + len > end);   /* length prefix + payload must fit */
+       ceph_encode_32(p, len);                 /* u32 length prefix */
+       if (len != 0)
+               memcpy(*p, s, len);             /* payload bytes, no terminator added */
+       *p = (char *)*p + len;
+}
+
+#define ceph_encode_need(p, end, n, bad)                               \
+       do {    /* compare n with the room left: *(p) + (n) could wrap */       \
+               if (unlikely((void *)(end) < (void *)*(p) || (n) > (char *)(end) - (char *)*(p))) \
+                       goto bad;                                       \
+       } while (0)
+
+#define ceph_encode_64_safe(p, end, v, bad)                    \
+       do {    /* goto bad unless sizeof(u64) bytes fit, else encode v */      \
+               ceph_encode_need(p, end, sizeof(u64), bad);     \
+               ceph_encode_64(p, v);                           \
+       } while (0)
+#define ceph_encode_32_safe(p, end, v, bad)                    \
+       do {    /* goto bad unless sizeof(u32) bytes fit, else encode v */      \
+               ceph_encode_need(p, end, sizeof(u32), bad);     \
+               ceph_encode_32(p, v);                   \
+       } while (0)
+#define ceph_encode_16_safe(p, end, v, bad)                    \
+       do {    /* goto bad unless sizeof(u16) bytes fit, else encode v */      \
+               ceph_encode_need(p, end, sizeof(u16), bad);     \
+               ceph_encode_16(p, v);                   \
+       } while (0)
+
+#define ceph_encode_copy_safe(p, end, pv, n, bad)              \
+       do {    /* goto bad unless n bytes fit, else copy them from pv; n is expanded twice */  \
+               ceph_encode_need(p, end, n, bad);               \
+               ceph_encode_copy(p, pv, n);                     \
+       } while (0)
+#define ceph_encode_string_safe(p, end, s, n, bad)             \
+       do {    /* need room for the u32 length prefix plus the n payload bytes */      \
+               ceph_encode_need(p, end, sizeof(u32) + (n), bad);       \
+               ceph_encode_string(p, end, s, n);               \
+       } while (0)
+
+
+#endif
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h

new file mode 100644 (file)

index 0000000..f22b2e9
--- /dev/null
+++ b/include/linux/ceph/libceph.h
@@ -0,0 +1,249 @@
+#ifndef _FS_CEPH_LIBCEPH_H
+#define _FS_CEPH_LIBCEPH_H
+
+#include "ceph_debug.h"
+
+#include <asm/unaligned.h>
+#include <linux/backing-dev.h>
+#include <linux/completion.h>
+#include <linux/exportfs.h>
+#include <linux/fs.h>
+#include <linux/mempool.h>
+#include <linux/pagemap.h>
+#include <linux/wait.h>
+#include <linux/writeback.h>
+#include <linux/slab.h>
+
+#include "types.h"
+#include "messenger.h"
+#include "msgpool.h"
+#include "mon_client.h"
+#include "osd_client.h"
+#include "ceph_fs.h"
+
+/*
+ * Supported features
+ */
+#define CEPH_FEATURE_SUPPORTED_DEFAULT CEPH_FEATURE_NOSRCADDR
+#define CEPH_FEATURE_REQUIRED_DEFAULT  CEPH_FEATURE_NOSRCADDR
+
+/*
+ * mount options
+ */
+#define CEPH_OPT_FSID             (1<<0)
+#define CEPH_OPT_NOSHARE          (1<<1) /* don't share client with other sbs */
+#define CEPH_OPT_MYIP             (1<<2) /* specified my ip */
+#define CEPH_OPT_NOCRC            (1<<3) /* no data crc on writes */
+
+#define CEPH_OPT_DEFAULT   (0)  /* no CEPH_OPT_* bits set; plain expression, no trailing ';' */
+
+#define ceph_set_opt(client, opt) \
+       ((client)->options->flags |= CEPH_OPT_##opt)    /* expression: caller supplies the ';', safe before else */
+#define ceph_test_opt(client, opt) \
+       (!!((client)->options->flags & CEPH_OPT_##opt))
+
+struct ceph_options {
+       int flags;                      /* CEPH_OPT_* bits */
+       struct ceph_fsid fsid;
+       struct ceph_entity_addr my_addr;
+       int mount_timeout;
+       int osd_idle_ttl;
+       int osd_timeout;
+       int osd_keepalive_timeout;
+
+       /*
+        * any type that can't be simply compared or doesn't need
+        * to be compared should go beyond this point,
+        * ceph_compare_options() should be updated accordingly
+        */
+
+       struct ceph_entity_addr *mon_addr; /* should be the first
+                                             pointer type of args */
+       int num_mon;
+       char *name;
+       char *secret;
+};
+
+/*
+ * defaults
+ */
+#define CEPH_MOUNT_TIMEOUT_DEFAULT  60
+#define CEPH_OSD_TIMEOUT_DEFAULT    60  /* seconds */
+#define CEPH_OSD_KEEPALIVE_DEFAULT  5
+#define CEPH_OSD_IDLE_TTL_DEFAULT    60
+#define CEPH_MOUNT_RSIZE_DEFAULT    (512*1024) /* readahead */
+
+#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
+#define CEPH_MSG_MAX_DATA_LEN  (16*1024*1024)
+
+#define CEPH_AUTH_NAME_DEFAULT   "guest"
+
+/*
+ * Delay telling the MDS we no longer want caps, in case we reopen
+ * the file.  Delay a minimum amount of time, even if we send a cap
+ * message for some other reason.  Otherwise, take the opportunity to
+ * update the mds to avoid sending another message later.
+ */
+#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT      5  /* cap release delay */
+#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT     60  /* cap release delay */
+
+#define CEPH_CAP_RELEASE_SAFETY_DEFAULT        (CEPH_CAPS_PER_RELEASE * 4)
+
+/* mount state */
+enum {
+       CEPH_MOUNT_MOUNTING,
+       CEPH_MOUNT_MOUNTED,
+       CEPH_MOUNT_UNMOUNTING,
+       CEPH_MOUNT_UNMOUNTED,
+       CEPH_MOUNT_SHUTDOWN,
+};
+
+/*
+ * subtract jiffies
+ */
+static inline unsigned long time_sub(unsigned long a, unsigned long b)
+{
+       BUG_ON(time_before(a, b));      /* a must not be earlier than b */
+       return (long)a - (long)b;       /* difference taken on the signed values */
+}
+
+struct ceph_mds_client;
+
+/*
+ * per client state
+ *
+ * possibly shared by multiple mount points, if they are
+ * mounting the same ceph filesystem/cluster.
+ */
+struct ceph_client {
+       struct ceph_fsid fsid;
+       bool have_fsid;
+
+       void *private;
+
+       struct ceph_options *options;
+
+       struct mutex mount_mutex;      /* serialize mount attempts */
+       wait_queue_head_t auth_wq;
+       int auth_err;
+
+       int (*extra_mon_dispatch)(struct ceph_client *, struct ceph_msg *);
+
+       u32 supported_features;
+       u32 required_features;
+
+       struct ceph_messenger *msgr;   /* messenger instance */
+       struct ceph_mon_client monc;
+       struct ceph_osd_client osdc;
+
+#ifdef CONFIG_DEBUG_FS
+       struct dentry *debugfs_dir;
+       struct dentry *debugfs_monmap;
+       struct dentry *debugfs_osdmap;
+#endif
+};
+
+
+
+/*
+ * snapshots
+ */
+
+/*
+ * A "snap context" is the set of existing snapshots when we
+ * write data.  It is used by the OSD to guide its COW behavior.
+ *
+ * The ceph_snap_context is refcounted, and attached to each dirty
+ * page, indicating which context the dirty data belonged when it was
+ * dirtied.
+ */
+struct ceph_snap_context {
+       atomic_t nref;
+       u64 seq;
+       int num_snaps;
+       u64 snaps[];
+};
+
+static inline struct ceph_snap_context *
+ceph_get_snap_context(struct ceph_snap_context *sc)
+{
+       /*
+        * Take one reference on sc and hand the same pointer back.
+        * A NULL sc is passed through untouched.
+        */
+       if (sc != NULL)
+               atomic_inc(&sc->nref);
+       return sc;
+}
+
+static inline void ceph_put_snap_context(struct ceph_snap_context *sc)
+{
+       /*
+        * Drop one reference; a NULL sc is ignored.  The context is
+        * released with kfree() once the count reaches zero.
+        */
+       if (sc == NULL)
+               return;
+       if (!atomic_dec_and_test(&sc->nref))
+               return;
+       /* that was the last reference */
+       kfree(sc);
+}
+
+/*
+ * calculate the number of pages a given length and offset map onto,
+ * if we align the data.
+ */
+static inline int calc_pages_for(u64 off, u64 len)
+{
+       u64 end_page = (off + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; /* page index just past the range */
+       return end_page - (off >> PAGE_CACHE_SHIFT);    /* minus the index of the first page */
+}
+
+/* ceph_common.c */
+extern const char *ceph_msg_type_name(int type);
+extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
+extern struct kmem_cache *ceph_inode_cachep;
+extern struct kmem_cache *ceph_cap_cachep;
+extern struct kmem_cache *ceph_dentry_cachep;
+extern struct kmem_cache *ceph_file_cachep;
+
+extern int ceph_parse_options(struct ceph_options **popt, char *options,
+                             const char *dev_name, const char *dev_name_end,
+                             int (*parse_extra_token)(char *c, void *private),
+                             void *private);
+extern void ceph_destroy_options(struct ceph_options *opt);
+extern int ceph_compare_options(struct ceph_options *new_opt,
+                               struct ceph_client *client);
+extern struct ceph_client *ceph_create_client(struct ceph_options *opt,
+                                             void *private);
+extern u64 ceph_client_id(struct ceph_client *client);
+extern void ceph_destroy_client(struct ceph_client *client);
+extern int __ceph_open_session(struct ceph_client *client,
+                              unsigned long started);
+extern int ceph_open_session(struct ceph_client *client);
+
+/* pagevec.c */
+extern void ceph_release_page_vector(struct page **pages, int num_pages);
+
+extern struct page **ceph_get_direct_page_vector(const char __user *data,
+                                           int num_pages,
+                                           loff_t off, size_t len);
+extern void ceph_put_page_vector(struct page **pages, int num_pages);
+extern void ceph_release_page_vector(struct page **pages, int num_pages);
+extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags);
+extern int ceph_copy_user_to_page_vector(struct page **pages,
+                                        const char __user *data,
+                                        loff_t off, size_t len);
+extern int ceph_copy_to_page_vector(struct page **pages,
+                                   const char *data,
+                                   loff_t off, size_t len);
+extern int ceph_copy_from_page_vector(struct page **pages,
+                                   char *data,
+                                   loff_t off, size_t len);
+extern int ceph_copy_page_vector_to_user(struct page **pages, char __user *data,
+                                   loff_t off, size_t len);
+extern void ceph_zero_page_vector_range(int off, int len, struct page **pages);
+
+
+#endif /* _FS_CEPH_LIBCEPH_H */
diff --git a/include/linux/ceph/mdsmap.h b/include/linux/ceph/mdsmap.h

new file mode 100644 (file)

index 0000000..4c5cb08
--- /dev/null
+++ b/include/linux/ceph/mdsmap.h
@@ -0,0 +1,62 @@
+#ifndef _FS_CEPH_MDSMAP_H
+#define _FS_CEPH_MDSMAP_H
+
+#include "types.h"
+
+/*
+ * mds map - describe servers in the mds cluster.
+ *
+ * we limit fields to those the client actually cares about
+ */
+struct ceph_mds_info {
+       u64 global_id;
+       struct ceph_entity_addr addr;
+       s32 state;
+       int num_export_targets;
+       bool laggy;
+       u32 *export_targets;
+};
+
+struct ceph_mdsmap {
+       u32 m_epoch, m_client_epoch, m_last_failure;
+       u32 m_root;
+       u32 m_session_timeout;          /* seconds */
+       u32 m_session_autoclose;        /* seconds */
+       u64 m_max_file_size;
+       u32 m_max_mds;                  /* size of m_info array */
+       struct ceph_mds_info *m_info;
+
+       /* which object pools file data can be stored in */
+       int m_num_data_pg_pools;
+       u32 *m_data_pg_pools;
+       u32 m_cas_pg_pool;
+};
+
+static inline struct ceph_entity_addr *
+ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w)
+{
+       if (w >= m->m_max_mds)
+               return NULL;
+       return &m->m_info[w].addr;
+}
+
+static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w)
+{
+       BUG_ON(w < 0);
+       if (w >= m->m_max_mds)
+               return CEPH_MDS_STATE_DNE;
+       return m->m_info[w].state;
+}
+
+static inline bool ceph_mdsmap_is_laggy(struct ceph_mdsmap *m, int w)
+{
+       if (w >= 0 && w < m->m_max_mds)
+               return m->m_info[w].laggy;
+       return false;
+}
+
+extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m);
+extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end);
+extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m);
+
+#endif
diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h

new file mode 100644 (file)

index 0000000..5956d62
--- /dev/null
+++ b/include/linux/ceph/messenger.h
@@ -0,0 +1,261 @@
+#ifndef __FS_CEPH_MESSENGER_H
+#define __FS_CEPH_MESSENGER_H
+
+#include <linux/kref.h>
+#include <linux/mutex.h>
+#include <linux/net.h>
+#include <linux/radix-tree.h>
+#include <linux/uio.h>
+#include <linux/version.h>
+#include <linux/workqueue.h>
+
+#include "types.h"
+#include "buffer.h"
+
+struct ceph_msg;
+struct ceph_connection;
+
+extern struct workqueue_struct *ceph_msgr_wq;       /* receive work queue */
+
+/*
+ * Ceph defines these callbacks for handling connection events.
+ */
+struct ceph_connection_operations {
+       struct ceph_connection *(*get)(struct ceph_connection *);
+       void (*put)(struct ceph_connection *);
+
+       /* handle an incoming message. */
+       void (*dispatch) (struct ceph_connection *con, struct ceph_msg *m);
+
+       /* authorize an outgoing connection */
+       int (*get_authorizer) (struct ceph_connection *con,
+                              void **buf, int *len, int *proto,
+                              void **reply_buf, int *reply_len, int force_new);
+       int (*verify_authorizer_reply) (struct ceph_connection *con, int len);
+       int (*invalidate_authorizer)(struct ceph_connection *con);
+
+       /* protocol version mismatch */
+       void (*bad_proto) (struct ceph_connection *con);
+
+       /* there was some error on the socket (disconnect, whatever) */
+       void (*fault) (struct ceph_connection *con);
+
+       /* a remote host has terminated a message exchange session, and messages
+        * we sent (or they tried to send us) may be lost. */
+       void (*peer_reset) (struct ceph_connection *con);
+
+       struct ceph_msg * (*alloc_msg) (struct ceph_connection *con,
+                                       struct ceph_msg_header *hdr,
+                                       int *skip);
+};
+
+/* use format string %s%lld */
+#define ENTITY_NAME(n) ceph_entity_type_name((n).type), le64_to_cpu((n).num)
+
+struct ceph_messenger {
+       struct ceph_entity_inst inst;    /* my name+address */
+       struct ceph_entity_addr my_enc_addr;
+       struct page *zero_page;          /* used in certain error cases */
+
+       bool nocrc;
+
+       /*
+        * the global_seq counts connections i (attempt to) initiate
+        * in order to disambiguate certain connect race conditions.
+        */
+       u32 global_seq;
+       spinlock_t global_seq_lock;
+
+       u32 supported_features;
+       u32 required_features;
+};
+
+/*
+ * a single message.  it contains a header (src, dest, message type, etc.),
+ * footer (crc values, mainly), a "front" message body, and possibly a
+ * data payload (stored in some number of pages).
+ */
+struct ceph_msg {
+       struct ceph_msg_header hdr;     /* header */
+       struct ceph_msg_footer footer;  /* footer */
+       struct kvec front;              /* unaligned blobs of message */
+       struct ceph_buffer *middle;
+       struct page **pages;            /* data payload.  NOT OWNER. */
+       unsigned nr_pages;              /* size of page array */
+       struct ceph_pagelist *pagelist; /* instead of pages */
+       struct list_head list_head;
+       struct kref kref;
+       struct bio  *bio;               /* instead of pages/pagelist */
+       struct bio  *bio_iter;          /* bio iterator */
+       int bio_seg;                    /* current bio segment */
+       struct ceph_pagelist *trail;    /* the trailing part of the data */
+       bool front_is_vmalloc;
+       bool more_to_follow;
+       bool needs_out_seq;
+       int front_max;
+
+       struct ceph_msgpool *pool;
+};
+
+struct ceph_msg_pos {
+       int page, page_pos;  /* which page; offset in page */
+       int data_pos;        /* offset in data payload */
+       int did_page_crc;    /* true if we've calculated crc for current page */
+};
+
+/* ceph connection fault delay defaults, for exponential backoff */
+#define BASE_DELAY_INTERVAL    (HZ/2)
+#define MAX_DELAY_INTERVAL     (5 * 60 * HZ)
+
+/*
+ * ceph_connection state bit flags
+ *
+ * QUEUED and BUSY are used together to ensure that only a single
+ * thread is currently opening, reading or writing data to the socket.
+ */
+#define LOSSYTX         0  /* we can close channel or drop messages on errors */
+#define CONNECTING     1
+#define NEGOTIATING    2
+#define KEEPALIVE_PENDING      3
+#define WRITE_PENDING  4  /* we have data ready to send */
+#define QUEUED          5  /* there is work queued on this connection */
+#define BUSY            6  /* work is being done */
+#define STANDBY                8  /* no outgoing messages, socket closed.  we keep
+                           * the ceph_connection around to maintain shared
+                           * state with the peer. */
+#define CLOSED         10 /* we've closed the connection */
+#define SOCK_CLOSED    11 /* socket state changed to closed */
+#define OPENING         13 /* open connection w/ (possibly new) peer */
+#define DEAD            14 /* dead, about to kfree */
+
+/*
+ * A single connection with another host.
+ *
+ * We maintain a queue of outgoing messages, and some session state to
+ * ensure that we can preserve the lossless, ordered delivery of
+ * messages in the case of a TCP disconnect.
+ */
+struct ceph_connection {
+       void *private;
+       atomic_t nref;
+
+       const struct ceph_connection_operations *ops;
+
+       struct ceph_messenger *msgr;
+       struct socket *sock;
+       unsigned long state;    /* connection state (see flags above) */
+       const char *error_msg;  /* error message, if any */
+
+       struct ceph_entity_addr peer_addr; /* peer address */
+       struct ceph_entity_name peer_name; /* peer name */
+       struct ceph_entity_addr peer_addr_for_me;
+       unsigned peer_features;
+       u32 connect_seq;      /* identify the most recent connection
+                                attempt for this connection, client */
+       u32 peer_global_seq;  /* peer's global seq for this connection */
+
+       int auth_retry;       /* true if we need a newer authorizer */
+       void *auth_reply_buf;   /* where to put the authorizer reply */
+       int auth_reply_buf_len;
+
+       struct mutex mutex;
+
+       /* out queue */
+       struct list_head out_queue;
+       struct list_head out_sent;   /* sending or sent but unacked */
+       u64 out_seq;                 /* last message queued for send */
+       bool out_keepalive_pending;
+
+       u64 in_seq, in_seq_acked;  /* last message received, acked */
+
+       /* connection negotiation temps */
+       char in_banner[CEPH_BANNER_MAX_LEN];
+       union {
+               struct {  /* outgoing connection */
+                       struct ceph_msg_connect out_connect;
+                       struct ceph_msg_connect_reply in_reply;
+               };
+               struct {  /* incoming */
+                       struct ceph_msg_connect in_connect;
+                       struct ceph_msg_connect_reply out_reply;
+               };
+       };
+       struct ceph_entity_addr actual_peer_addr;
+
+       /* message out temps */
+       struct ceph_msg *out_msg;        /* sending message (== tail of
+                                           out_sent) */
+       bool out_msg_done;
+       struct ceph_msg_pos out_msg_pos;
+
+       struct kvec out_kvec[8],         /* sending header/footer data */
+               *out_kvec_cur;
+       int out_kvec_left;   /* kvec's left in out_kvec */
+       int out_skip;        /* skip this many bytes */
+       int out_kvec_bytes;  /* total bytes left */
+       bool out_kvec_is_msg; /* kvec refers to out_msg */
+       int out_more;        /* there is more data after the kvecs */
+       __le64 out_temp_ack; /* for writing an ack */
+
+       /* message in temps */
+       struct ceph_msg_header in_hdr;
+       struct ceph_msg *in_msg;
+       struct ceph_msg_pos in_msg_pos;
+       u32 in_front_crc, in_middle_crc, in_data_crc;  /* calculated crc */
+
+       char in_tag;         /* protocol control byte */
+       int in_base_pos;     /* bytes read */
+       __le64 in_temp_ack;  /* for reading an ack */
+
+       struct delayed_work work;           /* send|recv work */
+       unsigned long       delay;          /* current delay interval */
+};
+
+
+extern const char *ceph_pr_addr(const struct sockaddr_storage *ss);
+extern int ceph_parse_ips(const char *c, const char *end,
+                         struct ceph_entity_addr *addr,
+                         int max_count, int *count);
+
+
+extern int ceph_msgr_init(void);
+extern void ceph_msgr_exit(void);
+extern void ceph_msgr_flush(void);
+
+extern struct ceph_messenger *ceph_messenger_create(
+       struct ceph_entity_addr *myaddr,
+       u32 features, u32 required);
+extern void ceph_messenger_destroy(struct ceph_messenger *);
+
+extern void ceph_con_init(struct ceph_messenger *msgr,
+                         struct ceph_connection *con);
+extern void ceph_con_open(struct ceph_connection *con,
+                         struct ceph_entity_addr *addr);
+extern bool ceph_con_opened(struct ceph_connection *con);
+extern void ceph_con_close(struct ceph_connection *con);
+extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg);
+extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg);
+extern void ceph_con_revoke_message(struct ceph_connection *con,
+                                 struct ceph_msg *msg);
+extern void ceph_con_keepalive(struct ceph_connection *con);
+extern struct ceph_connection *ceph_con_get(struct ceph_connection *con);
+extern void ceph_con_put(struct ceph_connection *con);
+
+extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags);
+extern void ceph_msg_kfree(struct ceph_msg *m);
+
+
+static inline struct ceph_msg *ceph_msg_get(struct ceph_msg *msg)
+{
+       kref_get(&msg->kref);
+       return msg;
+}
+extern void ceph_msg_last_put(struct kref *kref);
+static inline void ceph_msg_put(struct ceph_msg *msg)
+{
+       kref_put(&msg->kref, ceph_msg_last_put);
+}
+
+extern void ceph_msg_dump(struct ceph_msg *msg);
+
+#endif
diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h

new file mode 100644 (file)

index 0000000..545f859
--- /dev/null
+++ b/include/linux/ceph/mon_client.h
@@ -0,0 +1,122 @@
+#ifndef _FS_CEPH_MON_CLIENT_H
+#define _FS_CEPH_MON_CLIENT_H
+
+#include <linux/completion.h>
+#include <linux/kref.h>
+#include <linux/rbtree.h>
+
+#include "messenger.h"
+
+struct ceph_client;
+struct ceph_mount_args;
+struct ceph_auth_client;
+
+/*
+ * The monitor map enumerates the set of all monitors.
+ */
+struct ceph_monmap {
+       struct ceph_fsid fsid;
+       u32 epoch;
+       u32 num_mon;
+       struct ceph_entity_inst mon_inst[];  /* trailing variable-length array */
+};
+
+struct ceph_mon_client;
+struct ceph_mon_generic_request;
+
+
+/*
+ * Generic mechanism for resending monitor requests.
+ */
+typedef void (*ceph_monc_request_func_t)(struct ceph_mon_client *monc,
+                                        int newmon);
+
+/* a pending monitor request */
+struct ceph_mon_request {
+       struct ceph_mon_client *monc;
+       struct delayed_work delayed_work;
+       unsigned long delay;
+       ceph_monc_request_func_t do_request;
+};
+
+/*
+ * ceph_mon_generic_request is being used for the statfs and poolop requests
+ * which are being done a bit differently because we need to get data back
+ * to the caller
+ */
+struct ceph_mon_generic_request {
+       struct kref kref;
+       u64 tid;
+       struct rb_node node;
+       int result;
+       void *buf;
+       int buf_len;
+       struct completion completion;
+       struct ceph_msg *request;  /* original request */
+       struct ceph_msg *reply;    /* and reply */
+};
+
+struct ceph_mon_client {
+       struct ceph_client *client;
+       struct ceph_monmap *monmap;
+
+       struct mutex mutex;
+       struct delayed_work delayed_work;
+
+       struct ceph_auth_client *auth;
+       struct ceph_msg *m_auth, *m_auth_reply, *m_subscribe, *m_subscribe_ack;
+       int pending_auth;
+
+       bool hunting;
+       int cur_mon;                       /* last monitor i contacted */
+       unsigned long sub_sent, sub_renew_after;
+       struct ceph_connection *con;
+       bool have_fsid;
+
+       /* pending generic requests */
+       struct rb_root generic_request_tree;
+       int num_generic_requests;
+       u64 last_tid;
+
+       /* mds/osd map */
+       int want_mdsmap;
+       int want_next_osdmap; /* 1 = want, 2 = want+asked */
+       u32 have_osdmap, have_mdsmap;
+
+#ifdef CONFIG_DEBUG_FS
+       struct dentry *debugfs_file;
+#endif
+};
+
+extern struct ceph_monmap *ceph_monmap_decode(void *p, void *end);
+extern int ceph_monmap_contains(struct ceph_monmap *m,
+                               struct ceph_entity_addr *addr);
+
+extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl);
+extern void ceph_monc_stop(struct ceph_mon_client *monc);
+
+/*
+ * The model here is to indicate that we need a new map of at least
+ * epoch @want, and also call in when we receive a map.  We will
+ * periodically rerequest the map from the monitor cluster until we
+ * get what we want.
+ */
+extern int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 have);
+extern int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 have);
+
+extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc);
+
+extern int ceph_monc_do_statfs(struct ceph_mon_client *monc,
+                              struct ceph_statfs *buf);
+
+extern int ceph_monc_open_session(struct ceph_mon_client *monc);
+
+extern int ceph_monc_validate_auth(struct ceph_mon_client *monc);
+
+extern int ceph_monc_create_snapid(struct ceph_mon_client *monc,
+                                  u32 pool, u64 *snapid);
+
+extern int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
+                                  u32 pool, u64 snapid);
+
+#endif
diff --git a/include/linux/ceph/msgpool.h b/include/linux/ceph/msgpool.h

new file mode 100644 (file)

index 0000000..a362605
--- /dev/null
+++ b/include/linux/ceph/msgpool.h
@@ -0,0 +1,25 @@
+#ifndef _FS_CEPH_MSGPOOL
+#define _FS_CEPH_MSGPOOL
+
+#include <linux/mempool.h>
+#include "messenger.h"
+
+/*
+ * we use memory pools for preallocating messages we may receive, to
+ * avoid unexpected OOM conditions.
+ */
+struct ceph_msgpool {
+       const char *name;
+       mempool_t *pool;
+       int front_len;          /* preallocated payload size */
+};
+
+extern int ceph_msgpool_init(struct ceph_msgpool *pool,
+                            int front_len, int size, bool blocking,
+                            const char *name);
+extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
+extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *,
+                                        int front_len);
+extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *);
+
+#endif
diff --git a/include/linux/ceph/msgr.h b/include/linux/ceph/msgr.h

new file mode 100644 (file)

index 0000000..680d3d6
--- /dev/null
+++ b/include/linux/ceph/msgr.h
@@ -0,0 +1,175 @@
+#ifndef CEPH_MSGR_H
+#define CEPH_MSGR_H
+
+/*
+ * Data types for message passing layer used by Ceph.
+ */
+
+#define CEPH_MON_PORT    6789  /* default monitor port */
+
+/*
+ * client-side processes will try to bind to ports in this
+ * range, simply for the benefit of tools like nmap or wireshark
+ * that would like to identify the protocol.
+ */
+#define CEPH_PORT_FIRST  6789
+#define CEPH_PORT_START  6800  /* non-monitors start here */
+#define CEPH_PORT_LAST   6900
+
+/*
+ * tcp connection banner.  include a protocol version, and adjust it
+ * whenever the wire protocol changes.  try to keep this string length
+ * constant.
+ */
+#define CEPH_BANNER "ceph v027"
+#define CEPH_BANNER_MAX_LEN 30
+
+
+/*
+ * Rollover-safe type and comparator for 32-bit sequence numbers.
+ * Comparator returns <0, 0, or >0 (the wrapped difference), not just -1/0/1.
+ */
+typedef __u32 ceph_seq_t;
+
+static inline __s32 ceph_seq_cmp(__u32 a, __u32 b)
+{
+       return (__s32)(a - b);  /* subtract unsigned: wraps, no signed overflow */
+}
+
+
+/*
+ * entity_name -- logical name for a process participating in the
+ * network, e.g. 'mds0' or 'osd3'.
+ */
+struct ceph_entity_name {
+       __u8 type;      /* CEPH_ENTITY_TYPE_* */
+       __le64 num;
+} __attribute__ ((packed));
+
+#define CEPH_ENTITY_TYPE_MON    0x01
+#define CEPH_ENTITY_TYPE_MDS    0x02
+#define CEPH_ENTITY_TYPE_OSD    0x04
+#define CEPH_ENTITY_TYPE_CLIENT 0x08
+#define CEPH_ENTITY_TYPE_AUTH   0x20
+
+#define CEPH_ENTITY_TYPE_ANY    0xFF
+
+extern const char *ceph_entity_type_name(int type);
+
+/*
+ * entity_addr -- network address
+ */
+struct ceph_entity_addr {
+       __le32 type;
+       __le32 nonce;  /* unique id for process (e.g. pid) */
+       struct sockaddr_storage in_addr;
+} __attribute__ ((packed));
+
+struct ceph_entity_inst {
+       struct ceph_entity_name name;
+       struct ceph_entity_addr addr;
+} __attribute__ ((packed));
+
+
+/* used by message exchange protocol */
+#define CEPH_MSGR_TAG_READY         1  /* server->client: ready for messages */
+#define CEPH_MSGR_TAG_RESETSESSION  2  /* server->client: reset, try again */
+#define CEPH_MSGR_TAG_WAIT          3  /* server->client: wait for racing
+                                         incoming connection */
+#define CEPH_MSGR_TAG_RETRY_SESSION 4  /* server->client + cseq: try again
+                                         with higher cseq */
+#define CEPH_MSGR_TAG_RETRY_GLOBAL  5  /* server->client + gseq: try again
+                                         with higher gseq */
+#define CEPH_MSGR_TAG_CLOSE         6  /* closing pipe */
+#define CEPH_MSGR_TAG_MSG           7  /* message */
+#define CEPH_MSGR_TAG_ACK           8  /* message ack */
+#define CEPH_MSGR_TAG_KEEPALIVE     9  /* just a keepalive byte! */
+#define CEPH_MSGR_TAG_BADPROTOVER  10  /* bad protocol version */
+#define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */
+#define CEPH_MSGR_TAG_FEATURES      12 /* insufficient features */
+
+
+/*
+ * connection negotiation
+ */
+struct ceph_msg_connect {
+       __le64 features;     /* supported feature bits */
+       __le32 host_type;    /* CEPH_ENTITY_TYPE_* */
+       __le32 global_seq;   /* count connections initiated by this host */
+       __le32 connect_seq;  /* count connections initiated in this session */
+       __le32 protocol_version;
+       __le32 authorizer_protocol;
+       __le32 authorizer_len;
+       __u8  flags;         /* CEPH_MSG_CONNECT_* */
+} __attribute__ ((packed));
+
+struct ceph_msg_connect_reply {
+       __u8 tag;
+       __le64 features;     /* feature bits for this session */
+       __le32 global_seq;
+       __le32 connect_seq;
+       __le32 protocol_version;
+       __le32 authorizer_len;
+       __u8 flags;
+} __attribute__ ((packed));
+
+#define CEPH_MSG_CONNECT_LOSSY  1  /* messages i send may be safely dropped */
+
+
+/*
+ * message header
+ */
+struct ceph_msg_header_old {
+       __le64 seq;       /* message seq# for this session */
+       __le64 tid;       /* transaction id */
+       __le16 type;      /* message type */
+       __le16 priority;  /* priority.  higher value == higher priority */
+       __le16 version;   /* version of message encoding */
+
+       __le32 front_len; /* bytes in main payload */
+       __le32 middle_len;/* bytes in middle payload */
+       __le32 data_len;  /* bytes of data payload */
+       __le16 data_off;  /* sender: include full offset;
+                            receiver: mask against ~PAGE_MASK */
+
+       struct ceph_entity_inst src, orig_src;
+       __le32 reserved;
+       __le32 crc;       /* header crc32c */
+} __attribute__ ((packed));
+
+struct ceph_msg_header {
+       __le64 seq;       /* message seq# for this session */
+       __le64 tid;       /* transaction id */
+       __le16 type;      /* message type */
+       __le16 priority;  /* priority.  higher value == higher priority */
+       __le16 version;   /* version of message encoding */
+
+       __le32 front_len; /* bytes in main payload */
+       __le32 middle_len;/* bytes in middle payload */
+       __le32 data_len;  /* bytes of data payload */
+       __le16 data_off;  /* sender: include full offset;
+                            receiver: mask against ~PAGE_MASK */
+
+       struct ceph_entity_name src;
+       __le32 reserved;
+       __le32 crc;       /* header crc32c */
+} __attribute__ ((packed));
+
+#define CEPH_MSG_PRIO_LOW     64
+#define CEPH_MSG_PRIO_DEFAULT 127
+#define CEPH_MSG_PRIO_HIGH    196
+#define CEPH_MSG_PRIO_HIGHEST 255
+
+/*
+ * follows data payload
+ */
+struct ceph_msg_footer {
+       __le32 front_crc, middle_crc, data_crc;
+       __u8 flags;
+} __attribute__ ((packed));
+
+#define CEPH_MSG_FOOTER_COMPLETE  (1<<0)   /* msg wasn't aborted */
+#define CEPH_MSG_FOOTER_NOCRC     (1<<1)   /* no data crc */
+
+
+#endif
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h

new file mode 100644 (file)

index 0000000..6c91fb0
--- /dev/null
+++ b/include/linux/ceph/osd_client.h
@@ -0,0 +1,234 @@
+#ifndef _FS_CEPH_OSD_CLIENT_H
+#define _FS_CEPH_OSD_CLIENT_H
+
+#include <linux/completion.h>
+#include <linux/kref.h>
+#include <linux/mempool.h>
+#include <linux/rbtree.h>
+
+#include "types.h"
+#include "osdmap.h"
+#include "messenger.h"
+
+struct ceph_msg;
+struct ceph_snap_context;
+struct ceph_osd_request;
+struct ceph_osd_client;
+struct ceph_authorizer;
+struct ceph_pagelist;
+
+/*
+ * completion callback for async writepages
+ */
+typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *,
+                                    struct ceph_msg *);
+
+/* a given osd we're communicating with */
+struct ceph_osd {
+       atomic_t o_ref;
+       struct ceph_osd_client *o_osdc;
+       int o_osd;
+       int o_incarnation;
+       struct rb_node o_node;
+       struct ceph_connection o_con;
+       struct list_head o_requests;
+       struct list_head o_osd_lru;
+       struct ceph_authorizer *o_authorizer;
+       void *o_authorizer_buf, *o_authorizer_reply_buf;
+       size_t o_authorizer_buf_len, o_authorizer_reply_buf_len;
+       unsigned long lru_ttl;
+       int o_marked_for_keepalive;
+       struct list_head o_keepalive_item;
+};
+
+/* an in-flight request */
+struct ceph_osd_request {
+       u64             r_tid;              /* unique for this client */
+       struct rb_node  r_node;
+       struct list_head r_req_lru_item;
+       struct list_head r_osd_item;
+       struct ceph_osd *r_osd;
+       struct ceph_pg   r_pgid;
+       int              r_pg_osds[CEPH_PG_MAX_SIZE];
+       int              r_num_pg_osds;
+
+       struct ceph_connection *r_con_filling_msg;
+
+       struct ceph_msg  *r_request, *r_reply;
+       int               r_result;
+       int               r_flags;     /* any additional flags for the osd */
+       u32               r_sent;      /* >0 if r_request is sending/sent */
+       int               r_got_reply;
+
+       struct ceph_osd_client *r_osdc;
+       struct kref       r_kref;
+       bool              r_mempool;
+       struct completion r_completion, r_safe_completion;
+       ceph_osdc_callback_t r_callback, r_safe_callback;
+       struct ceph_eversion r_reassert_version;
+       struct list_head  r_unsafe_item;
+
+       struct inode *r_inode;                /* for use by callbacks */
+       void *r_priv;                         /* ditto */
+
+       char              r_oid[40];          /* object name */
+       int               r_oid_len;
+       unsigned long     r_stamp;            /* send OR check time */
+       bool              r_resend;           /* msg send failed, needs retry */
+
+       struct ceph_file_layout r_file_layout;
+       struct ceph_snap_context *r_snapc;    /* snap context for writes */
+       unsigned          r_num_pages;        /* size of page array (follows) */
+       struct page     **r_pages;            /* pages for data payload */
+       int               r_pages_from_pool;
+       int               r_own_pages;        /* if true, i own page list */
+#ifdef CONFIG_BLOCK
+       struct bio       *r_bio;              /* instead of pages */
+#endif
+
+       struct ceph_pagelist *r_trail;        /* trailing part of the data */
+};
+
+struct ceph_osd_client {
+       struct ceph_client     *client;
+
+       struct ceph_osdmap     *osdmap;       /* current map */
+       struct rw_semaphore    map_sem;
+       struct completion      map_waiters;
+       u64                    last_requested_map;
+
+       struct mutex           request_mutex;
+       struct rb_root         osds;          /* osds */
+       struct list_head       osd_lru;       /* idle osds */
+       u64                    timeout_tid;   /* tid of timeout triggering rq */
+       u64                    last_tid;      /* tid of last request */
+       struct rb_root         requests;      /* pending requests */
+       struct list_head       req_lru;       /* pending requests lru */
+       int                    num_requests;
+       struct delayed_work    timeout_work;
+       struct delayed_work    osds_timeout_work;
+#ifdef CONFIG_DEBUG_FS
+       struct dentry          *debugfs_file;
+#endif
+
+       mempool_t              *req_mempool;
+
+       struct ceph_msgpool     msgpool_op;
+       struct ceph_msgpool     msgpool_op_reply;
+};
+
+struct ceph_osd_req_op {
+       u16 op;           /* CEPH_OSD_OP_* */
+       u32 flags;        /* CEPH_OSD_FLAG_* */
+       union {
+               struct {
+                       u64 offset, length;
+                       u64 truncate_size;
+                       u32 truncate_seq;
+               } extent;
+               struct {
+                       const char *name;
+                       u32 name_len;
+                       const char  *val;
+                       u32 value_len;
+                       __u8 cmp_op;       /* CEPH_OSD_CMPXATTR_OP_* */
+                       __u8 cmp_mode;     /* CEPH_OSD_CMPXATTR_MODE_* */
+               } xattr;
+               struct {
+                       const char *class_name;
+                       __u8 class_len;
+                       const char *method_name;
+                       __u8 method_len;
+                       __u8 argc;
+                       const char *indata;
+                       u32 indata_len;
+               } cls;
+               struct {
+                       u64 cookie, count;
+               } pgls;
+               struct {
+                       u64 snapid;
+               } snap;
+       };
+       u32 payload_len;
+};
+
+extern int ceph_osdc_init(struct ceph_osd_client *osdc,
+                         struct ceph_client *client);
+extern void ceph_osdc_stop(struct ceph_osd_client *osdc);
+
+extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
+                                  struct ceph_msg *msg);
+extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
+                                struct ceph_msg *msg);
+
+extern void ceph_calc_raw_layout(struct ceph_osd_client *osdc,
+                       struct ceph_file_layout *layout,
+                       u64 snapid,
+                       u64 off, u64 *plen, u64 *bno,
+                       struct ceph_osd_request *req,
+                       struct ceph_osd_req_op *op);
+
+extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
+                                              int flags,
+                                              struct ceph_snap_context *snapc,
+                                              struct ceph_osd_req_op *ops,
+                                              bool use_mempool,
+                                              gfp_t gfp_flags,
+                                              struct page **pages,
+                                              struct bio *bio);
+
+extern void ceph_osdc_build_request(struct ceph_osd_request *req,
+                                   u64 off, u64 *plen,
+                                   struct ceph_osd_req_op *src_ops,
+                                   struct ceph_snap_context *snapc,
+                                   struct timespec *mtime,
+                                   const char *oid,
+                                   int oid_len);
+
+extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
+                                     struct ceph_file_layout *layout,
+                                     struct ceph_vino vino,
+                                     u64 offset, u64 *len, int op, int flags,
+                                     struct ceph_snap_context *snapc,
+                                     int do_sync, u32 truncate_seq,
+                                     u64 truncate_size,
+                                     struct timespec *mtime,
+                                     bool use_mempool, int num_reply);
+
+static inline void ceph_osdc_get_request(struct ceph_osd_request *req)
+{
+       kref_get(&req->r_kref);
+}
+extern void ceph_osdc_release_request(struct kref *kref);
+static inline void ceph_osdc_put_request(struct ceph_osd_request *req)
+{
+       kref_put(&req->r_kref, ceph_osdc_release_request);
+}
+
+extern int ceph_osdc_start_request(struct ceph_osd_client *osdc,
+                                  struct ceph_osd_request *req,
+                                  bool nofail);
+extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
+                                 struct ceph_osd_request *req);
+extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
+
+extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
+                              struct ceph_vino vino,
+                              struct ceph_file_layout *layout,
+                              u64 off, u64 *plen,
+                              u32 truncate_seq, u64 truncate_size,
+                              struct page **pages, int nr_pages);
+
+extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
+                               struct ceph_vino vino,
+                               struct ceph_file_layout *layout,
+                               struct ceph_snap_context *sc,
+                               u64 off, u64 len,
+                               u32 truncate_seq, u64 truncate_size,
+                               struct timespec *mtime,
+                               struct page **pages, int nr_pages,
+                               int flags, int do_sync, bool nofail);
+
+#endif
+
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h

new file mode 100644 (file)

index 0000000..ba4c205
--- /dev/null
+++ b/include/linux/ceph/osdmap.h
@@ -0,0 +1,130 @@
+#ifndef _FS_CEPH_OSDMAP_H
+#define _FS_CEPH_OSDMAP_H
+
+#include <linux/rbtree.h>
+#include "types.h"
+#include "ceph_fs.h"
+#include <linux/crush/crush.h>
+
+/*
+ * The osd map describes the current membership of the osd cluster and
+ * specifies the mapping of objects to placement groups and placement
+ * groups to (sets of) osds.  That is, it completely specifies the
+ * (desired) distribution of all data objects in the system at some
+ * point in time.
+ *
+ * Each map version is identified by an epoch, which increases monotonically.
+ *
+ * The map can be updated either via an incremental map (diff) describing
+ * the change between two successive epochs, or as a fully encoded map.
+ */
+struct ceph_pg_pool_info {
+       struct rb_node node;
+       int id;
+       struct ceph_pg_pool v;
+       int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask;
+       char *name;
+};
+
+struct ceph_pg_mapping {
+       struct rb_node node;
+       struct ceph_pg pgid;
+       int len;
+       int osds[];
+};
+
+struct ceph_osdmap {
+       struct ceph_fsid fsid;
+       u32 epoch;
+       u32 mkfs_epoch;
+       struct ceph_timespec created, modified;
+
+       u32 flags;         /* CEPH_OSDMAP_* */
+
+       u32 max_osd;       /* size of osd_state, _weight, _addr arrays */
+       u8 *osd_state;     /* CEPH_OSD_* */
+       u32 *osd_weight;   /* 0 = failed, 0x10000 = 100% normal */
+       struct ceph_entity_addr *osd_addr;
+
+       struct rb_root pg_temp;
+       struct rb_root pg_pools;
+       u32 pool_max;
+
+       /* the CRUSH map specifies the mapping of placement groups to
+        * the list of osds that store+replicate them. */
+       struct crush_map *crush;
+};
+
+/*
+ * file layout helpers
+ */
+#define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit))
+#define ceph_file_layout_stripe_count(l) \
+       ((__s32)le32_to_cpu((l).fl_stripe_count))
+#define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size))
+#define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash))
+#define ceph_file_layout_object_su(l) \
+       ((__s32)le32_to_cpu((l).fl_object_stripe_unit))
+#define ceph_file_layout_pg_preferred(l) \
+       ((__s32)le32_to_cpu((l).fl_pg_preferred))
+#define ceph_file_layout_pg_pool(l) \
+       ((__s32)le32_to_cpu((l).fl_pg_pool))
+
+static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l)
+{
+       return le32_to_cpu(l->fl_stripe_unit) *
+               le32_to_cpu(l->fl_stripe_count);
+}
+
+/* "period" == number of bytes before we start on a new set of objects */
+static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l)
+{
+       return le32_to_cpu(l->fl_object_size) *
+               le32_to_cpu(l->fl_stripe_count);
+}
+
+
+static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd)
+{
+       return (osd < map->max_osd) && (map->osd_state[osd] & CEPH_OSD_UP);
+}
+
+static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag)
+{
+       return map && (map->flags & flag);
+}
+
+extern char *ceph_osdmap_state_str(char *str, int len, int state);
+
+static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
+                                                    int osd)
+{
+       if (osd >= map->max_osd)
+               return NULL;
+       return &map->osd_addr[osd];
+}
+
+extern struct ceph_osdmap *osdmap_decode(void **p, void *end);
+extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
+                                           struct ceph_osdmap *map,
+                                           struct ceph_messenger *msgr);
+extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
+
+/* calculate mapping of a file extent to an object */
+extern void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
+                                         u64 off, u64 *plen,
+                                         u64 *bno, u64 *oxoff, u64 *oxlen);
+
+/* calculate mapping of object to a placement group */
+extern int ceph_calc_object_layout(struct ceph_object_layout *ol,
+                                  const char *oid,
+                                  struct ceph_file_layout *fl,
+                                  struct ceph_osdmap *osdmap);
+extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
+                              int *acting);
+extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
+                               struct ceph_pg pgid);
+
+extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name);
+
+#endif
diff --git a/include/linux/ceph/pagelist.h b/include/linux/ceph/pagelist.h

new file mode 100644 (file)

index 0000000..9660d6b
--- /dev/null
+++ b/include/linux/ceph/pagelist.h
@@ -0,0 +1,75 @@
+#ifndef __FS_CEPH_PAGELIST_H
+#define __FS_CEPH_PAGELIST_H
+
+#include <linux/list.h>
+
+struct ceph_pagelist {
+       struct list_head head;
+       void *mapped_tail;
+       size_t length;
+       size_t room;
+       struct list_head free_list;
+       size_t num_pages_free;
+};
+
+struct ceph_pagelist_cursor {
+       struct ceph_pagelist *pl;   /* pagelist, for error checking */
+       struct list_head *page_lru; /* page in list */
+       size_t room;                /* room remaining to reset to */
+};
+
+static inline void ceph_pagelist_init(struct ceph_pagelist *pl)
+{
+       INIT_LIST_HEAD(&pl->head);
+       pl->mapped_tail = NULL;
+       pl->length = 0;
+       pl->room = 0;
+       INIT_LIST_HEAD(&pl->free_list);
+       pl->num_pages_free = 0;
+}
+
+extern int ceph_pagelist_release(struct ceph_pagelist *pl);
+
+extern int ceph_pagelist_append(struct ceph_pagelist *pl, const void *d, size_t l);
+
+extern int ceph_pagelist_reserve(struct ceph_pagelist *pl, size_t space);
+
+extern int ceph_pagelist_free_reserve(struct ceph_pagelist *pl);
+
+extern void ceph_pagelist_set_cursor(struct ceph_pagelist *pl,
+                                    struct ceph_pagelist_cursor *c);
+
+extern int ceph_pagelist_truncate(struct ceph_pagelist *pl,
+                                 struct ceph_pagelist_cursor *c);
+
+static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v)
+{
+       __le64 ev = cpu_to_le64(v);
+       return ceph_pagelist_append(pl, &ev, sizeof(ev));
+}
+static inline int ceph_pagelist_encode_32(struct ceph_pagelist *pl, u32 v)
+{
+       __le32 ev = cpu_to_le32(v);
+       return ceph_pagelist_append(pl, &ev, sizeof(ev));
+}
+static inline int ceph_pagelist_encode_16(struct ceph_pagelist *pl, u16 v)
+{
+       __le16 ev = cpu_to_le16(v);
+       return ceph_pagelist_append(pl, &ev, sizeof(ev));
+}
+static inline int ceph_pagelist_encode_8(struct ceph_pagelist *pl, u8 v)
+{
+       return ceph_pagelist_append(pl, &v, 1);
+}
+static inline int ceph_pagelist_encode_string(struct ceph_pagelist *pl,
+                                             char *s, size_t len)
+{
+       int ret = ceph_pagelist_encode_32(pl, len);
+       if (ret)
+               return ret;
+       if (len)
+               return ceph_pagelist_append(pl, s, len);
+       return 0;
+}
+
+#endif
diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h

new file mode 100644 (file)

index 0000000..6d5247f
--- /dev/null
+++ b/include/linux/ceph/rados.h
@@ -0,0 +1,405 @@
+#ifndef CEPH_RADOS_H
+#define CEPH_RADOS_H
+
+/*
+ * Data types for the Ceph distributed object storage layer RADOS
+ * (Reliable Autonomic Distributed Object Store).
+ */
+
+#include "msgr.h"
+
+/*
+ * osdmap encoding versions
+ */
+#define CEPH_OSDMAP_INC_VERSION     5
+#define CEPH_OSDMAP_INC_VERSION_EXT 5
+#define CEPH_OSDMAP_VERSION         5
+#define CEPH_OSDMAP_VERSION_EXT     5
+
+/*
+ * fs id
+ */
+struct ceph_fsid {
+       unsigned char fsid[16];
+};
+
+static inline int ceph_fsid_compare(const struct ceph_fsid *a,
+                                   const struct ceph_fsid *b)
+{
+       return memcmp(a, b, sizeof(*a));
+}
+
+/*
+ * ino, object, etc.
+ */
+typedef __le64 ceph_snapid_t;
+#define CEPH_SNAPDIR ((__u64)(-1))  /* reserved for hidden .snap dir */
+#define CEPH_NOSNAP  ((__u64)(-2))  /* "head", "live" revision */
+#define CEPH_MAXSNAP ((__u64)(-3))  /* largest valid snapid */
+
+struct ceph_timespec {
+       __le32 tv_sec;
+       __le32 tv_nsec;
+} __attribute__ ((packed));
+
+
+/*
+ * object layout - how objects are mapped into PGs
+ */
+#define CEPH_OBJECT_LAYOUT_HASH     1
+#define CEPH_OBJECT_LAYOUT_LINEAR   2
+#define CEPH_OBJECT_LAYOUT_HASHINO  3
+
+/*
+ * pg layout -- how PGs are mapped onto (sets of) OSDs
+ */
+#define CEPH_PG_LAYOUT_CRUSH  0
+#define CEPH_PG_LAYOUT_HASH   1
+#define CEPH_PG_LAYOUT_LINEAR 2
+#define CEPH_PG_LAYOUT_HYBRID 3
+
+#define CEPH_PG_MAX_SIZE      16  /* max # osds in a single pg */
+
+/*
+ * placement group.
+ * we encode this into one __le64.
+ */
+struct ceph_pg {
+       __le16 preferred; /* preferred primary osd */
+       __le16 ps;        /* placement seed */
+       __le32 pool;      /* object pool */
+} __attribute__ ((packed));
+
+/*
+ * pg_pool is a set of pgs storing a pool of objects
+ *
+ *  pg_num -- base number of pseudorandomly placed pgs
+ *
+ *  pgp_num -- effective number when calculating pg placement.  this
+ * is used for pg_num increases.  new pgs result in data being "split"
+ * into new pgs.  for this to proceed smoothly, new pgs are initially
+ * colocated with their parents; that is, pgp_num doesn't increase
+ * until the new pgs have successfully split.  only _then_ are the new
+ * pgs placed independently.
+ *
+ *  lpg_num -- localized pg count (per device).  replicas are randomly
+ * selected.
+ *
+ *  lpgp_num -- as above.
+ */
+#define CEPH_PG_TYPE_REP     1
+#define CEPH_PG_TYPE_RAID4   2
+#define CEPH_PG_POOL_VERSION 2
+struct ceph_pg_pool {
+       __u8 type;                /* CEPH_PG_TYPE_* */
+       __u8 size;                /* number of osds in each pg */
+       __u8 crush_ruleset;       /* crush placement rule */
+       __u8 object_hash;         /* hash mapping object name to ps */
+       __le32 pg_num, pgp_num;   /* number of pg's */
+       __le32 lpg_num, lpgp_num; /* number of localized pg's */
+       __le32 last_change;       /* most recent epoch changed */
+       __le64 snap_seq;          /* seq for per-pool snapshot */
+       __le32 snap_epoch;        /* epoch of last snap */
+       __le32 num_snaps;
+       __le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */
+       __le64 auid;               /* who owns the pg */
+} __attribute__ ((packed));
+
+/*
+ * stable_mod func is used to control number of placement groups.
+ * similar to straight-up modulo, but produces a stable mapping as b
+ * increases over time.  b is the number of bins, and bmask is the
+ * containing power of 2 minus 1.
+ *
+ * b <= bmask and bmask=(2**n)-1
+ * e.g., b=12 -> bmask=15, b=123 -> bmask=127
+ */
+static inline int ceph_stable_mod(int x, int b, int bmask)
+{
+       if ((x & bmask) < b)
+               return x & bmask;
+       else
+               return x & (bmask >> 1);
+}
+
+/*
+ * object layout - how a given object should be stored.
+ */
+struct ceph_object_layout {
+       struct ceph_pg ol_pgid;   /* raw pg, with _full_ ps precision. */
+       __le32 ol_stripe_unit;    /* for per-object parity, if any */
+} __attribute__ ((packed));
+
+/*
+ * compound epoch+version, used by storage layer to serialize mutations
+ */
+struct ceph_eversion {
+       __le32 epoch;
+       __le64 version;
+} __attribute__ ((packed));
+
+/*
+ * osd map bits
+ */
+
+/* status bits */
+#define CEPH_OSD_EXISTS 1
+#define CEPH_OSD_UP     2
+
+/* osd weights.  fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */
+#define CEPH_OSD_IN  0x10000
+#define CEPH_OSD_OUT 0
+
+
+/*
+ * osd map flag bits
+ */
+#define CEPH_OSDMAP_NEARFULL (1<<0)  /* sync writes (near ENOSPC) */
+#define CEPH_OSDMAP_FULL     (1<<1)  /* no data writes (ENOSPC) */
+#define CEPH_OSDMAP_PAUSERD  (1<<2)  /* pause all reads */
+#define CEPH_OSDMAP_PAUSEWR  (1<<3)  /* pause all writes */
+#define CEPH_OSDMAP_PAUSEREC (1<<4)  /* pause recovery */
+
+/*
+ * osd ops
+ */
+#define CEPH_OSD_OP_MODE       0xf000
+#define CEPH_OSD_OP_MODE_RD    0x1000
+#define CEPH_OSD_OP_MODE_WR    0x2000
+#define CEPH_OSD_OP_MODE_RMW   0x3000
+#define CEPH_OSD_OP_MODE_SUB   0x4000
+
+#define CEPH_OSD_OP_TYPE       0x0f00
+#define CEPH_OSD_OP_TYPE_LOCK  0x0100
+#define CEPH_OSD_OP_TYPE_DATA  0x0200
+#define CEPH_OSD_OP_TYPE_ATTR  0x0300
+#define CEPH_OSD_OP_TYPE_EXEC  0x0400
+#define CEPH_OSD_OP_TYPE_PG    0x0500
+
+enum {
+       /** data **/
+       /* read */
+       CEPH_OSD_OP_READ      = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1,
+       CEPH_OSD_OP_STAT      = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2,
+
+       /* fancy read */
+       CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4,
+
+       /* write */
+       CEPH_OSD_OP_WRITE     = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1,
+       CEPH_OSD_OP_WRITEFULL = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 2,
+       CEPH_OSD_OP_TRUNCATE  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 3,
+       CEPH_OSD_OP_ZERO      = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 4,
+       CEPH_OSD_OP_DELETE    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 5,
+
+       /* fancy write */
+       CEPH_OSD_OP_APPEND    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 6,
+       CEPH_OSD_OP_STARTSYNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 7,
+       CEPH_OSD_OP_SETTRUNC  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 8,
+       CEPH_OSD_OP_TRIMTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 9,
+
+       CEPH_OSD_OP_TMAPUP  = CEPH_OSD_OP_MODE_RMW | CEPH_OSD_OP_TYPE_DATA | 10,
+       CEPH_OSD_OP_TMAPPUT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 11,
+       CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12,
+
+       CEPH_OSD_OP_CREATE  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13,
+       CEPH_OSD_OP_ROLLBACK= CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 14,
+
+       /** attrs **/
+       /* read */
+       CEPH_OSD_OP_GETXATTR  = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
+       CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
+       CEPH_OSD_OP_CMPXATTR  = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 3,
+
+       /* write */
+       CEPH_OSD_OP_SETXATTR  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
+       CEPH_OSD_OP_SETXATTRS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 2,
+       CEPH_OSD_OP_RESETXATTRS = CEPH_OSD_OP_MODE_WR|CEPH_OSD_OP_TYPE_ATTR | 3,
+       CEPH_OSD_OP_RMXATTR   = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4,
+
+       /** subop **/
+       CEPH_OSD_OP_PULL           = CEPH_OSD_OP_MODE_SUB | 1,
+       CEPH_OSD_OP_PUSH           = CEPH_OSD_OP_MODE_SUB | 2,
+       CEPH_OSD_OP_BALANCEREADS   = CEPH_OSD_OP_MODE_SUB | 3,
+       CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4,
+       CEPH_OSD_OP_SCRUB          = CEPH_OSD_OP_MODE_SUB | 5,
+
+       /** lock **/
+       CEPH_OSD_OP_WRLOCK    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1,
+       CEPH_OSD_OP_WRUNLOCK  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 2,
+       CEPH_OSD_OP_RDLOCK    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 3,
+       CEPH_OSD_OP_RDUNLOCK  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 4,
+       CEPH_OSD_OP_UPLOCK    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 5,
+       CEPH_OSD_OP_DNLOCK    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6,
+
+       /** exec **/
+       CEPH_OSD_OP_CALL    = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1,
+
+       /** pg **/
+       CEPH_OSD_OP_PGLS      = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1,
+};
+
+static inline int ceph_osd_op_type_lock(int op)
+{
+       return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_LOCK;
+}
+static inline int ceph_osd_op_type_data(int op)
+{
+       return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_DATA;
+}
+static inline int ceph_osd_op_type_attr(int op)
+{
+       return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_ATTR;
+}
+static inline int ceph_osd_op_type_exec(int op)
+{
+       return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_EXEC;
+}
+static inline int ceph_osd_op_type_pg(int op)
+{
+       return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG;
+}
+
+static inline int ceph_osd_op_mode_subop(int op)
+{
+       return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_SUB;
+}
+static inline int ceph_osd_op_mode_read(int op)
+{
+       return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_RD;
+}
+static inline int ceph_osd_op_mode_modify(int op)
+{
+       return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR;
+}
+
+/*
+ * note that the following tmap stuff is also defined in the ceph librados.h
+ * any modification here needs to be updated there
+ */
+#define CEPH_OSD_TMAP_HDR 'h'
+#define CEPH_OSD_TMAP_SET 's'
+#define CEPH_OSD_TMAP_RM  'r'
+
+extern const char *ceph_osd_op_name(int op);
+
+
+/*
+ * osd op flags
+ *
+ * An op may be READ, WRITE, or READ|WRITE.
+ */
+enum {
+       CEPH_OSD_FLAG_ACK = 1,          /* want (or is) "ack" ack */
+       CEPH_OSD_FLAG_ONNVRAM = 2,      /* want (or is) "onnvram" ack */
+       CEPH_OSD_FLAG_ONDISK = 4,       /* want (or is) "ondisk" ack */
+       CEPH_OSD_FLAG_RETRY = 8,        /* resend attempt */
+       CEPH_OSD_FLAG_READ = 16,        /* op may read */
+       CEPH_OSD_FLAG_WRITE = 32,       /* op may write */
+       CEPH_OSD_FLAG_ORDERSNAP = 64,   /* EOLDSNAP if snapc is out of order */
+       CEPH_OSD_FLAG_PEERSTAT = 128,   /* msg includes osd_peer_stat */
+       CEPH_OSD_FLAG_BALANCE_READS = 256,
+       CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */
+       CEPH_OSD_FLAG_PGOP = 1024,      /* pg op, no object */
+       CEPH_OSD_FLAG_EXEC = 2048,      /* op may exec */
+       CEPH_OSD_FLAG_EXEC_PUBLIC = 4096, /* op may exec (public) */
+};
+
+enum {
+       CEPH_OSD_OP_FLAG_EXCL = 1,      /* EXCL object create */
+};
+
+#define EOLDSNAPC    ERESTART  /* ORDERSNAP flag set; writer has old snapc*/
+#define EBLACKLISTED ESHUTDOWN /* blacklisted */
+
+/* xattr comparison */
+enum {
+       CEPH_OSD_CMPXATTR_OP_NOP = 0,
+       CEPH_OSD_CMPXATTR_OP_EQ  = 1,
+       CEPH_OSD_CMPXATTR_OP_NE  = 2,
+       CEPH_OSD_CMPXATTR_OP_GT  = 3,
+       CEPH_OSD_CMPXATTR_OP_GTE = 4,
+       CEPH_OSD_CMPXATTR_OP_LT  = 5,
+       CEPH_OSD_CMPXATTR_OP_LTE = 6
+};
+
+enum {
+       CEPH_OSD_CMPXATTR_MODE_STRING = 1,
+       CEPH_OSD_CMPXATTR_MODE_U64    = 2
+};
+
+/*
+ * an individual object operation.  each may be accompanied by some data
+ * payload
+ */
+struct ceph_osd_op {
+       __le16 op;           /* CEPH_OSD_OP_* */
+       __le32 flags;        /* CEPH_OSD_FLAG_* */
+       union {
+               struct {
+                       __le64 offset, length;
+                       __le64 truncate_size;
+                       __le32 truncate_seq;
+               } __attribute__ ((packed)) extent;
+               struct {
+                       __le32 name_len;
+                       __le32 value_len;
+                       __u8 cmp_op;       /* CEPH_OSD_CMPXATTR_OP_* */
+                       __u8 cmp_mode;     /* CEPH_OSD_CMPXATTR_MODE_* */
+               } __attribute__ ((packed)) xattr;
+               struct {
+                       __u8 class_len;
+                       __u8 method_len;
+                       __u8 argc;
+                       __le32 indata_len;
+               } __attribute__ ((packed)) cls;
+               struct {
+                       __le64 cookie, count;
+               } __attribute__ ((packed)) pgls;
+               struct {
+                       __le64 snapid;
+               } __attribute__ ((packed)) snap;
+       };
+       __le32 payload_len;
+} __attribute__ ((packed));
+
+/*
+ * osd request message header.  each request may include multiple
+ * ceph_osd_op object operations.
+ */
+struct ceph_osd_request_head {
+       __le32 client_inc;                 /* client incarnation */
+       struct ceph_object_layout layout;  /* pgid */
+       __le32 osdmap_epoch;               /* client's osdmap epoch */
+
+       __le32 flags;
+
+       struct ceph_timespec mtime;        /* for mutations only */
+       struct ceph_eversion reassert_version; /* if we are replaying op */
+
+       __le32 object_len;     /* length of object name */
+
+       __le64 snapid;         /* snapid to read */
+       __le64 snap_seq;       /* writer's snap context */
+       __le32 num_snaps;
+
+       __le16 num_ops;
+       struct ceph_osd_op ops[];  /* followed by ops[], obj, ticket, snaps */
+} __attribute__ ((packed));
+
+struct ceph_osd_reply_head {
+       __le32 client_inc;                /* client incarnation */
+       __le32 flags;
+       struct ceph_object_layout layout;
+       __le32 osdmap_epoch;
+       struct ceph_eversion reassert_version; /* for replaying uncommitted */
+
+       __le32 result;                    /* result code */
+
+       __le32 object_len;                /* length of object name */
+       __le32 num_ops;
+       struct ceph_osd_op ops[0];  /* ops[], object */
+} __attribute__ ((packed));
+
+
+#endif
diff --git a/include/linux/ceph/types.h b/include/linux/ceph/types.h

new file mode 100644 (file)

index 0000000..28b35a0
--- /dev/null
+++ b/include/linux/ceph/types.h
@@ -0,0 +1,29 @@
+#ifndef _FS_CEPH_TYPES_H
+#define _FS_CEPH_TYPES_H
+
+/* needed before including ceph_fs.h */
+#include <linux/in.h>
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/string.h>
+
+#include "ceph_fs.h"
+#include "ceph_frag.h"
+#include "ceph_hash.h"
+
+/*
+ * Identify inodes by both their ino AND snapshot id (a u64).
+ */
+struct ceph_vino {
+       u64 ino;
+       u64 snap;
+};
+
+
+/* context for the caps reservation mechanism */
+struct ceph_cap_reservation {
+       int count;
+};
+
+
+#endif
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h

index 0c99102..709dfb9 100644 (file)
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -75,7 +75,7 @@ struct cgroup_subsys_state {
  
         unsigned long flags;
         /* ID for this css, if possible */
-       struct css_id *id;
+       struct css_id __rcu *id;
  };
  
  /* bits in struct cgroup_subsys_state flags field */
@@ -205,7 +205,7 @@ struct cgroup {
         struct list_head children;      /* my children */
  
         struct cgroup *parent;          /* my parent */
-       struct dentry *dentry;          /* cgroup fs entry, RCU protected */
+       struct dentry __rcu *dentry;    /* cgroup fs entry, RCU protected */
  
         /* Private pointers for each registered subsystem */
         struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
diff --git a/include/linux/compiler.h b/include/linux/compiler.h

index c1a62c5..320d6c9 100644 (file)
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -16,7 +16,11 @@
  # define __release(x)  __context__(x,-1)
  # define __cond_lock(x,c)      ((c) ? ({ __acquire(x); 1; }) : 0)
  # define __percpu      __attribute__((noderef, address_space(3)))
+#ifdef CONFIG_SPARSE_RCU_POINTER
+# define __rcu         __attribute__((noderef, address_space(4)))
+#else
  # define __rcu
+#endif
  extern void __chk_user_ptr(const volatile void __user *);
  extern void __chk_io_ptr(const volatile void __iomem *);
  #else
diff --git a/include/linux/cred.h b/include/linux/cred.h

index 4d2c395..4aaeab3 100644 (file)
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -84,7 +84,7 @@ struct thread_group_cred {
         atomic_t        usage;
         pid_t           tgid;                   /* thread group process ID */
         spinlock_t      lock;
-       struct key      *session_keyring;       /* keyring inherited over fork */
+       struct key __rcu *session_keyring;      /* keyring inherited over fork */
         struct key      *process_keyring;       /* keyring private to this process */
         struct rcu_head rcu;                    /* RCU deletion hook */
  };
diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h

new file mode 100644 (file)

index 0000000..97e435b
--- /dev/null
+++ b/include/linux/crush/crush.h
@@ -0,0 +1,180 @@
+#ifndef CEPH_CRUSH_CRUSH_H
+#define CEPH_CRUSH_CRUSH_H
+
+#include <linux/types.h>
+
+/*
+ * CRUSH is a pseudo-random data distribution algorithm that
+ * efficiently distributes input values (typically, data objects)
+ * across a heterogeneous, structured storage cluster.
+ *
+ * The algorithm was originally described in detail in this paper
+ * (although the algorithm has evolved somewhat since then):
+ *
+ *     http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf
+ *
+ * LGPL2
+ */
+
+
+#define CRUSH_MAGIC 0x00010000ul   /* for detecting algorithm revisions */
+
+
+#define CRUSH_MAX_DEPTH 10  /* max crush hierarchy depth */
+#define CRUSH_MAX_SET   10  /* max size of a mapping result */
+
+
+/*
+ * CRUSH uses user-defined "rules" to describe how inputs should be
+ * mapped to devices.  A rule consists of a sequence of steps to perform
+ * to generate the set of output devices.
+ */
+struct crush_rule_step {
+       __u32 op;
+       __s32 arg1;
+       __s32 arg2;
+};
+
+/* step op codes */
+enum {
+       CRUSH_RULE_NOOP = 0,
+       CRUSH_RULE_TAKE = 1,          /* arg1 = value to start with */
+       CRUSH_RULE_CHOOSE_FIRSTN = 2, /* arg1 = num items to pick */
+                                     /* arg2 = type */
+       CRUSH_RULE_CHOOSE_INDEP = 3,  /* same */
+       CRUSH_RULE_EMIT = 4,          /* no args */
+       CRUSH_RULE_CHOOSE_LEAF_FIRSTN = 6,
+       CRUSH_RULE_CHOOSE_LEAF_INDEP = 7,
+};
+
+/*
+ * for specifying choose num (arg1) relative to the max parameter
+ * passed to do_rule
+ */
+#define CRUSH_CHOOSE_N            0
+#define CRUSH_CHOOSE_N_MINUS(x)   (-(x))
+
+/*
+ * The rule mask is used to describe what the rule is intended for.
+ * Given a ruleset and size of output set, we search through the
+ * rule list for a matching rule_mask.
+ */
+struct crush_rule_mask {
+       __u8 ruleset;
+       __u8 type;
+       __u8 min_size;
+       __u8 max_size;
+};
+
+struct crush_rule {
+       __u32 len;
+       struct crush_rule_mask mask;
+       struct crush_rule_step steps[0];
+};
+
+#define crush_rule_size(len) (sizeof(struct crush_rule) + \
+                             (len)*sizeof(struct crush_rule_step))
+
+
+
+/*
+ * A bucket is a named container of other items (either devices or
+ * other buckets).  Items within a bucket are chosen using one of a
+ * few different algorithms.  The table summarizes how the speed of
+ * each option measures up against mapping stability when items are
+ * added or removed.
+ *
+ *  Bucket Alg     Speed       Additions    Removals
+ *  ------------------------------------------------
+ *  uniform         O(1)       poor         poor
+ *  list            O(n)       optimal      poor
+ *  tree            O(log n)   good         good
+ *  straw           O(n)       optimal      optimal
+ */
+enum {
+       CRUSH_BUCKET_UNIFORM = 1,
+       CRUSH_BUCKET_LIST = 2,
+       CRUSH_BUCKET_TREE = 3,
+       CRUSH_BUCKET_STRAW = 4
+};
+extern const char *crush_bucket_alg_name(int alg);
+
+struct crush_bucket {
+       __s32 id;        /* this'll be negative */
+       __u16 type;      /* non-zero; type=0 is reserved for devices */
+       __u8 alg;        /* one of CRUSH_BUCKET_* */
+       __u8 hash;       /* which hash function to use, CRUSH_HASH_* */
+       __u32 weight;    /* 16-bit fixed point */
+       __u32 size;      /* num items */
+       __s32 *items;
+
+       /*
+        * cached random permutation: used for uniform bucket and for
+        * the linear search fallback for the other bucket types.
+        */
+       __u32 perm_x;  /* @x for which *perm is defined */
+       __u32 perm_n;  /* num elements of *perm that are permuted/defined */
+       __u32 *perm;
+};
+
+struct crush_bucket_uniform {
+       struct crush_bucket h;
+       __u32 item_weight;  /* 16-bit fixed point; all items equally weighted */
+};
+
+struct crush_bucket_list {
+       struct crush_bucket h;
+       __u32 *item_weights;  /* 16-bit fixed point */
+       __u32 *sum_weights;   /* 16-bit fixed point.  element i is sum
+                                of weights 0..i, inclusive */
+};
+
+struct crush_bucket_tree {
+       struct crush_bucket h;  /* note: h.size is _tree_ size, not number of
+                                  actual items */
+       __u8 num_nodes;
+       __u32 *node_weights;
+};
+
+struct crush_bucket_straw {
+       struct crush_bucket h;
+       __u32 *item_weights;   /* 16-bit fixed point */
+       __u32 *straws;         /* 16-bit fixed point */
+};
+
+
+
+/*
+ * CRUSH map includes all buckets, rules, etc.
+ */
+struct crush_map {
+       struct crush_bucket **buckets;
+       struct crush_rule **rules;
+
+       /*
+        * Parent pointers to identify the parent bucket of a device or
+        * bucket in the hierarchy.  If an item appears more than
+        * once, this is the _last_ time it appeared (where buckets
+        * are processed in bucket id order, from -1 on down to
+        * -max_buckets).
+        */
+       __u32 *bucket_parents;
+       __u32 *device_parents;
+
+       __s32 max_buckets;
+       __u32 max_rules;
+       __s32 max_devices;
+};
+
+
+/* crush.c */
+extern int crush_get_bucket_item_weight(struct crush_bucket *b, int pos);
+extern void crush_calc_parents(struct crush_map *map);
+extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b);
+extern void crush_destroy_bucket_list(struct crush_bucket_list *b);
+extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b);
+extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b);
+extern void crush_destroy_bucket(struct crush_bucket *b);
+extern void crush_destroy(struct crush_map *map);
+
+#endif
diff --git a/include/linux/crush/hash.h b/include/linux/crush/hash.h

new file mode 100644 (file)

index 0000000..91e8842
--- /dev/null
+++ b/include/linux/crush/hash.h
@@ -0,0 +1,17 @@
+#ifndef CEPH_CRUSH_HASH_H
+#define CEPH_CRUSH_HASH_H
+
+#define CRUSH_HASH_RJENKINS1   0
+
+#define CRUSH_HASH_DEFAULT CRUSH_HASH_RJENKINS1
+
+extern const char *crush_hash_name(int type);
+
+extern __u32 crush_hash32(int type, __u32 a);
+extern __u32 crush_hash32_2(int type, __u32 a, __u32 b);
+extern __u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c);
+extern __u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d);
+extern __u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d,
+                           __u32 e);
+
+#endif
diff --git a/include/linux/crush/mapper.h b/include/linux/crush/mapper.h

new file mode 100644 (file)

index 0000000..c46b99c
--- /dev/null
+++ b/include/linux/crush/mapper.h
@@ -0,0 +1,20 @@
+#ifndef CEPH_CRUSH_MAPPER_H
+#define CEPH_CRUSH_MAPPER_H
+
+/*
+ * CRUSH functions for finding rules and then mapping an input to an
+ * output set.
+ *
+ * LGPL2
+ */
+
+#include "crush.h"
+
+extern int crush_find_rule(struct crush_map *map, int pool, int type, int size);
+extern int crush_do_rule(struct crush_map *map,
+                        int ruleno,
+                        int x, int *result, int result_max,
+                        int forcefeed,    /* -1 for none */
+                        __u32 *weights);
+
+#endif
diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h

index 29b3ce3..2833452 100644 (file)
--- a/include/linux/debug_locks.h
+++ b/include/linux/debug_locks.h
@@ -49,7 +49,6 @@ struct task_struct;
  
  #ifdef CONFIG_LOCKDEP
  extern void debug_show_all_locks(void);
-extern void __debug_show_held_locks(struct task_struct *task);
  extern void debug_show_held_locks(struct task_struct *task);
  extern void debug_check_no_locks_freed(const void *from, unsigned long len);
  extern void debug_check_no_locks_held(struct task_struct *task);
@@ -58,10 +57,6 @@ static inline void debug_show_all_locks(void)
  {
  }
  
-static inline void __debug_show_held_locks(struct task_struct *task)
-{
-}
-
  static inline void debug_show_held_locks(struct task_struct *task)
  {
  }
diff --git a/include/linux/dmar.h b/include/linux/dmar.h

index d7cecc9..51651b7 100644 (file)
--- a/include/linux/dmar.h
+++ b/include/linux/dmar.h
@@ -106,6 +106,7 @@ struct irte {
                 __u64 high;
         };
  };
+
  #ifdef CONFIG_INTR_REMAP
  extern int intr_remapping_enabled;
  extern int intr_remapping_supported(void);
@@ -119,11 +120,8 @@ extern int alloc_irte(struct intel_iommu *iommu, int irq, u16 count);
  extern int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index,
                         u16 sub_handle);
  extern int map_irq_to_irte_handle(int irq, u16 *sub_handle);
-extern int clear_irte_irq(int irq, struct intel_iommu *iommu, u16 index);
-extern int flush_irte(int irq);
  extern int free_irte(int irq);
  
-extern int irq_remapped(int irq);
  extern struct intel_iommu *map_dev_to_ir(struct pci_dev *dev);
  extern struct intel_iommu *map_ioapic_to_ir(int apic);
  extern struct intel_iommu *map_hpet_to_ir(u8 id);
@@ -177,7 +175,6 @@ static inline int set_msi_sid(struct irte *irte, struct pci_dev *dev)
         return 0;
  }
  
-#define irq_remapped(irq)              (0)
  #define enable_intr_remapping(mode)    (-1)
  #define disable_intr_remapping()       (0)
  #define reenable_intr_remapping(mode)  (0)
@@ -187,8 +184,9 @@ static inline int set_msi_sid(struct irte *irte, struct pci_dev *dev)
  /* Can't use the common MSI interrupt functions
   * since DMAR is not a pci device
   */
-extern void dmar_msi_unmask(unsigned int irq);
-extern void dmar_msi_mask(unsigned int irq);
+struct irq_data;
+extern void dmar_msi_unmask(struct irq_data *data);
+extern void dmar_msi_mask(struct irq_data *data);
  extern void dmar_msi_read(int irq, struct msi_msg *msg);
  extern void dmar_msi_write(int irq, struct msi_msg *msg);
  extern int dmar_set_interrupt(struct intel_iommu *iommu);
diff --git a/include/linux/edac.h b/include/linux/edac.h

index 7cf92e8..36c6644 100644 (file)
--- a/include/linux/edac.h
+++ b/include/linux/edac.h
@@ -13,6 +13,7 @@
  #define _LINUX_EDAC_H_
  
  #include <asm/atomic.h>
+#include <linux/sysdev.h>
  
  #define EDAC_OPSTATE_INVAL     -1
  #define EDAC_OPSTATE_POLL      0
@@ -22,9 +23,12 @@
  extern int edac_op_state;
  extern int edac_err_assert;
  extern atomic_t edac_handlers;
+extern struct sysdev_class edac_class;
  
  extern int edac_handler_set(void);
  extern void edac_atomic_assert_error(void);
+extern struct sysdev_class *edac_get_sysfs_class(void);
+extern void edac_put_sysfs_class(void);
  
  static inline void opstate_init(void)
  {
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h

index f59ed29..133c0ba 100644 (file)
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -31,7 +31,7 @@ struct embedded_fd_set {
  
  struct fdtable {
         unsigned int max_fds;
-       struct file ** fd;      /* current fd array */
+       struct file __rcu **fd;      /* current fd array */
         fd_set *close_on_exec;
         fd_set *open_fds;
         struct rcu_head rcu;
@@ -46,7 +46,7 @@ struct files_struct {
     * read mostly part
     */
         atomic_t count;
-       struct fdtable *fdt;
+       struct fdtable __rcu *fdt;
         struct fdtable fdtab;
    /*
     * written part on a separate cache line in SMP
@@ -55,7 +55,7 @@ struct files_struct {
         int next_fd;
         struct embedded_fd_set close_on_exec_init;
         struct embedded_fd_set open_fds_init;
-       struct file * fd_array[NR_OPEN_DEFAULT];
+       struct file __rcu * fd_array[NR_OPEN_DEFAULT];
  };
  
  #define rcu_dereference_check_fdtable(files, fdtfd) \
diff --git a/include/linux/fs.h b/include/linux/fs.h

index 63d069b..3168dcf 100644 (file)
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1384,7 +1384,7 @@ struct super_block {
          * Saved mount options for lazy filesystems using
          * generic_show_options()
          */
-       char *s_options;
+       char __rcu *s_options;
  };
  
  extern struct timespec current_fs_time(struct super_block *sb);
diff --git a/include/linux/genhd.h b/include/linux/genhd.h

index 5f2f4c4..af3f06b 100644 (file)
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -129,8 +129,8 @@ struct blk_scsi_cmd_filter {
  struct disk_part_tbl {
         struct rcu_head rcu_head;
         int len;
-       struct hd_struct *last_lookup;
-       struct hd_struct *part[];
+       struct hd_struct __rcu *last_lookup;
+       struct hd_struct __rcu *part[];
  };
  
  struct gendisk {
@@ -149,7 +149,7 @@ struct gendisk {
          * non-critical accesses use RCU.  Always access through
          * helpers.
          */
-       struct disk_part_tbl *part_tbl;
+       struct disk_part_tbl __rcu *part_tbl;
         struct hd_struct part0;
  
         const struct block_device_operations *fops;
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h

index d5b3876..96c323a 100644 (file)
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -64,6 +64,8 @@
  #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)
  #define NMI_OFFSET     (1UL << NMI_SHIFT)
  
+#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
+
  #ifndef PREEMPT_ACTIVE
  #define PREEMPT_ACTIVE_BITS    1
  #define PREEMPT_ACTIVE_SHIFT   (NMI_SHIFT + NMI_BITS)
@@ -82,10 +84,13 @@
  /*
   * Are we doing bottom half or hardware interrupt processing?
   * Are we in a softirq context? Interrupt context?
+ * in_softirq - Are we currently processing softirq or have bh disabled?
+ * in_serving_softirq - Are we currently processing softirq?
   */
  #define in_irq()               (hardirq_count())
  #define in_softirq()           (softirq_count())
  #define in_interrupt()         (irq_count())
+#define in_serving_softirq()   (softirq_count() & SOFTIRQ_OFFSET)
  
  /*
   * Are we in NMI context?
@@ -132,14 +137,16 @@ extern void synchronize_irq(unsigned int irq);
  
  struct task_struct;
  
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING
+#if !defined(CONFIG_VIRT_CPU_ACCOUNTING) && !defined(CONFIG_IRQ_TIME_ACCOUNTING)
  static inline void account_system_vtime(struct task_struct *tsk)
  {
  }
+#else
+extern void account_system_vtime(struct task_struct *tsk);
  #endif
  
  #if defined(CONFIG_NO_HZ)
-#if defined(CONFIG_TINY_RCU)
+#if defined(CONFIG_TINY_RCU) || defined(CONFIG_TINY_PREEMPT_RCU)
  extern void rcu_enter_nohz(void);
  extern void rcu_exit_nohz(void);
  
diff --git a/include/linux/htirq.h b/include/linux/htirq.h

index c96ea46..70a1dbb 100644 (file)
--- a/include/linux/htirq.h
+++ b/include/linux/htirq.h
@@ -9,8 +9,9 @@ struct ht_irq_msg {
  /* Helper functions.. */
  void fetch_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg);
  void write_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg);
-void mask_ht_irq(unsigned int irq);
-void unmask_ht_irq(unsigned int irq);
+struct irq_data;
+void mask_ht_irq(struct irq_data *data);
+void unmask_ht_irq(struct irq_data *data);
  
  /* The arch hook for getting things started */
  int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev);
diff --git a/include/linux/idr.h b/include/linux/idr.h

index e968db7..cdb715e 100644 (file)
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -50,14 +50,14 @@
  
  struct idr_layer {
         unsigned long            bitmap; /* A zero bit means "space here" */
-       struct idr_layer        *ary[1<<IDR_BITS];
+       struct idr_layer __rcu  *ary[1<<IDR_BITS];
         int                      count;  /* When zero, we can release it */
         int                      layer;  /* distance from leaf */
         struct rcu_head          rcu_head;
  };
  
  struct idr {
-       struct idr_layer *top;
+       struct idr_layer __rcu *top;
         struct idr_layer *id_free;
         int               layers; /* only valid without concurrent changes */
         int               id_free_cnt;
diff --git a/include/linux/init_task.h b/include/linux/init_task.h

index 1f43fa5..2fea6c8 100644 (file)
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -82,11 +82,17 @@ extern struct group_info init_groups;
  # define CAP_INIT_BSET  CAP_FULL_SET
  
  #ifdef CONFIG_TREE_PREEMPT_RCU
+#define INIT_TASK_RCU_TREE_PREEMPT()                                   \
+       .rcu_blocked_node = NULL,
+#else
+#define INIT_TASK_RCU_TREE_PREEMPT(tsk)
+#endif
+#ifdef CONFIG_PREEMPT_RCU
  #define INIT_TASK_RCU_PREEMPT(tsk)                                     \
         .rcu_read_lock_nesting = 0,                                     \
         .rcu_read_unlock_special = 0,                                   \
-       .rcu_blocked_node = NULL,                                       \
-       .rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry),
+       .rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry),           \
+       INIT_TASK_RCU_TREE_PREEMPT()
  #else
  #define INIT_TASK_RCU_PREEMPT(tsk)
  #endif
@@ -137,8 +143,8 @@ extern struct cred init_cred;
         .children       = LIST_HEAD_INIT(tsk.children),                 \
         .sibling        = LIST_HEAD_INIT(tsk.sibling),                  \
         .group_leader   = &tsk,                                         \
-       .real_cred      = &init_cred,                                   \
-       .cred           = &init_cred,                                   \
+       RCU_INIT_POINTER(.real_cred, &init_cred),                       \
+       RCU_INIT_POINTER(.cred, &init_cred),                            \
         .cred_guard_mutex =                                             \
                  __MUTEX_INITIALIZER(tsk.cred_guard_mutex),             \
         .comm           = "swapper",                                    \
diff --git a/include/linux/input.h b/include/linux/input.h

index 896a922..d6ae176 100644 (file)
--- a/include/linux/input.h
+++ b/include/linux/input.h
@@ -1196,7 +1196,7 @@ struct input_dev {
         int (*flush)(struct input_dev *dev, struct file *file);
         int (*event)(struct input_dev *dev, unsigned int type, unsigned int code, int value);
  
-       struct input_handle *grab;
+       struct input_handle __rcu *grab;
  
         spinlock_t event_lock;
         struct mutex mutex;
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h

index 531495d..4143285 100644 (file)
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -647,11 +647,8 @@ static inline void init_irq_proc(void)
  struct seq_file;
  int show_interrupts(struct seq_file *p, void *v);
  
-struct irq_desc;
-
  extern int early_irq_init(void);
  extern int arch_probe_nr_irqs(void);
  extern int arch_early_irq_init(void);
-extern int arch_init_chip_data(struct irq_desc *desc, int node);
  
  #endif
diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h

index 64d5291..3e70b21 100644 (file)
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
@@ -53,7 +53,7 @@ struct io_context {
  
         struct radix_tree_root radix_root;
         struct hlist_head cic_list;
-       void *ioc_data;
+       void __rcu *ioc_data;
  };
  
  static inline struct io_context *ioc_task_link(struct io_context *ioc)
diff --git a/include/linux/irq.h b/include/linux/irq.h

index c03243a..e963911 100644 (file)
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -72,6 +72,10 @@ typedef      void (*irq_flow_handler_t)(unsigned int irq,
  #define IRQ_ONESHOT            0x08000000      /* IRQ is not unmasked after hardirq */
  #define IRQ_NESTED_THREAD      0x10000000      /* IRQ is nested into another, no own handler thread */
  
+#define IRQF_MODIFY_MASK       \
+       (IRQ_TYPE_SENSE_MASK | IRQ_NOPROBE | IRQ_NOREQUEST | \
+        IRQ_NOAUTOEN | IRQ_MOVE_PCNTXT | IRQ_LEVEL)
+
  #ifdef CONFIG_IRQ_PER_CPU
  # define CHECK_IRQ_PER_CPU(var) ((var) & IRQ_PER_CPU)
  # define IRQ_NO_BALANCING_MASK (IRQ_PER_CPU | IRQ_NO_BALANCING)
@@ -80,36 +84,77 @@ typedef     void (*irq_flow_handler_t)(unsigned int irq,
  # define IRQ_NO_BALANCING_MASK IRQ_NO_BALANCING
  #endif
  
-struct proc_dir_entry;
  struct msi_desc;
  
+/**
+ * struct irq_data - per irq and irq chip data passed down to chip functions
+ * @irq:               interrupt number
+ * @node:              node index useful for balancing
+ * @chip:              low level interrupt hardware access
+ * @handler_data:      per-IRQ data for the irq_chip methods
+ * @chip_data:         platform-specific per-chip private data for the chip
+ *                     methods, to allow shared chip implementations
+ * @msi_desc:          MSI descriptor
+ * @affinity:          IRQ affinity on SMP
+ *
+ * The fields here need to overlay the ones in irq_desc until we
+ * have cleaned up the direct references and switched everything over
+ * to irq_data.
+ */
+struct irq_data {
+       unsigned int            irq;
+       unsigned int            node;
+       struct irq_chip         *chip;
+       void                    *handler_data;
+       void                    *chip_data;
+       struct msi_desc         *msi_desc;
+#ifdef CONFIG_SMP
+       cpumask_var_t           affinity;
+#endif
+};
+
  /**
   * struct irq_chip - hardware interrupt chip descriptor
   *
   * @name:              name for /proc/interrupts
- * @startup:           start up the interrupt (defaults to ->enable if NULL)
- * @shutdown:          shut down the interrupt (defaults to ->disable if NULL)
- * @enable:            enable the interrupt (defaults to chip->unmask if NULL)
- * @disable:           disable the interrupt
- * @ack:               start of a new interrupt
- * @mask:              mask an interrupt source
- * @mask_ack:          ack and mask an interrupt source
- * @unmask:            unmask an interrupt source
- * @eoi:               end of interrupt - chip level
- * @end:               end of interrupt - flow level
- * @set_affinity:      set the CPU affinity on SMP machines
- * @retrigger:         resend an IRQ to the CPU
- * @set_type:          set the flow type (IRQ_TYPE_LEVEL/etc.) of an IRQ
- * @set_wake:          enable/disable power-management wake-on of an IRQ
+ * @startup:           deprecated, replaced by irq_startup
+ * @shutdown:          deprecated, replaced by irq_shutdown
+ * @enable:            deprecated, replaced by irq_enable
+ * @disable:           deprecated, replaced by irq_disable
+ * @ack:               deprecated, replaced by irq_ack
+ * @mask:              deprecated, replaced by irq_mask
+ * @mask_ack:          deprecated, replaced by irq_mask_ack
+ * @unmask:            deprecated, replaced by irq_unmask
+ * @eoi:               deprecated, replaced by irq_eoi
+ * @end:               deprecated, will go away with __do_IRQ()
+ * @set_affinity:      deprecated, replaced by irq_set_affinity
+ * @retrigger:         deprecated, replaced by irq_retrigger
+ * @set_type:          deprecated, replaced by irq_set_type
+ * @set_wake:          deprecated, replaced by irq_set_wake
+ * @bus_lock:          deprecated, replaced by irq_bus_lock
+ * @bus_sync_unlock:   deprecated, replaced by irq_bus_sync_unlock
   *
- * @bus_lock:          function to lock access to slow bus (i2c) chips
- * @bus_sync_unlock:   function to sync and unlock slow bus (i2c) chips
+ * @irq_startup:       start up the interrupt (defaults to ->enable if NULL)
+ * @irq_shutdown:      shut down the interrupt (defaults to ->disable if NULL)
+ * @irq_enable:                enable the interrupt (defaults to chip->unmask if NULL)
+ * @irq_disable:       disable the interrupt
+ * @irq_ack:           start of a new interrupt
+ * @irq_mask:          mask an interrupt source
+ * @irq_mask_ack:      ack and mask an interrupt source
+ * @irq_unmask:                unmask an interrupt source
+ * @irq_eoi:           end of interrupt
+ * @irq_set_affinity:  set the CPU affinity on SMP machines
+ * @irq_retrigger:     resend an IRQ to the CPU
+ * @irq_set_type:      set the flow type (IRQ_TYPE_LEVEL/etc.) of an IRQ
+ * @irq_set_wake:      enable/disable power-management wake-on of an IRQ
+ * @irq_bus_lock:      function to lock access to slow bus (i2c) chips
+ * @irq_bus_sync_unlock:function to sync and unlock slow bus (i2c) chips
   *
   * @release:           release function solely used by UML
- * @typename:          obsoleted by name, kept as migration helper
   */
  struct irq_chip {
         const char      *name;
+#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
         unsigned int    (*startup)(unsigned int irq);
         void            (*shutdown)(unsigned int irq);
         void            (*enable)(unsigned int irq);
@@ -130,154 +175,66 @@ struct irq_chip {
  
         void            (*bus_lock)(unsigned int irq);
         void            (*bus_sync_unlock)(unsigned int irq);
+#endif
+       unsigned int    (*irq_startup)(struct irq_data *data);
+       void            (*irq_shutdown)(struct irq_data *data);
+       void            (*irq_enable)(struct irq_data *data);
+       void            (*irq_disable)(struct irq_data *data);
+
+       void            (*irq_ack)(struct irq_data *data);
+       void            (*irq_mask)(struct irq_data *data);
+       void            (*irq_mask_ack)(struct irq_data *data);
+       void            (*irq_unmask)(struct irq_data *data);
+       void            (*irq_eoi)(struct irq_data *data);
+
+       int             (*irq_set_affinity)(struct irq_data *data, const struct cpumask *dest, bool force);
+       int             (*irq_retrigger)(struct irq_data *data);
+       int             (*irq_set_type)(struct irq_data *data, unsigned int flow_type);
+       int             (*irq_set_wake)(struct irq_data *data, unsigned int on);
+
+       void            (*irq_bus_lock)(struct irq_data *data);
+       void            (*irq_bus_sync_unlock)(struct irq_data *data);
  
         /* Currently used only by UML, might disappear one day.*/
  #ifdef CONFIG_IRQ_RELEASE_METHOD
         void            (*release)(unsigned int irq, void *dev_id);
  #endif
-       /*
-        * For compatibility, ->typename is copied into ->name.
-        * Will disappear.
-        */
-       const char      *typename;
  };
  
-struct timer_rand_state;
-struct irq_2_iommu;
-/**
- * struct irq_desc - interrupt descriptor
- * @irq:               interrupt number for this descriptor
- * @timer_rand_state:  pointer to timer rand state struct
- * @kstat_irqs:                irq stats per cpu
- * @irq_2_iommu:       iommu with this irq
- * @handle_irq:                highlevel irq-events handler [if NULL, __do_IRQ()]
- * @chip:              low level interrupt hardware access
- * @msi_desc:          MSI descriptor
- * @handler_data:      per-IRQ data for the irq_chip methods
- * @chip_data:         platform-specific per-chip private data for the chip
- *                     methods, to allow shared chip implementations
- * @action:            the irq action chain
- * @status:            status information
- * @depth:             disable-depth, for nested irq_disable() calls
- * @wake_depth:                enable depth, for multiple set_irq_wake() callers
- * @irq_count:         stats field to detect stalled irqs
- * @last_unhandled:    aging timer for unhandled count
- * @irqs_unhandled:    stats field for spurious unhandled interrupts
- * @lock:              locking for SMP
- * @affinity:          IRQ affinity on SMP
- * @node:              node index useful for balancing
- * @pending_mask:      pending rebalanced interrupts
- * @threads_active:    number of irqaction threads currently running
- * @wait_for_threads:  wait queue for sync_irq to wait for threaded handlers
- * @dir:               /proc/irq/ procfs entry
- * @name:              flow handler name for /proc/interrupts output
- */
-struct irq_desc {
-       unsigned int            irq;
-       struct timer_rand_state *timer_rand_state;
-       unsigned int            *kstat_irqs;
-#ifdef CONFIG_INTR_REMAP
-       struct irq_2_iommu      *irq_2_iommu;
-#endif
-       irq_flow_handler_t      handle_irq;
-       struct irq_chip         *chip;
-       struct msi_desc         *msi_desc;
-       void                    *handler_data;
-       void                    *chip_data;
-       struct irqaction        *action;        /* IRQ action list */
-       unsigned int            status;         /* IRQ status */
-
-       unsigned int            depth;          /* nested irq disables */
-       unsigned int            wake_depth;     /* nested wake enables */
-       unsigned int            irq_count;      /* For detecting broken IRQs */
-       unsigned long           last_unhandled; /* Aging timer for unhandled count */
-       unsigned int            irqs_unhandled;
-       raw_spinlock_t          lock;
-#ifdef CONFIG_SMP
-       cpumask_var_t           affinity;
-       const struct cpumask    *affinity_hint;
-       unsigned int            node;
-#ifdef CONFIG_GENERIC_PENDING_IRQ
-       cpumask_var_t           pending_mask;
-#endif
-#endif
-       atomic_t                threads_active;
-       wait_queue_head_t       wait_for_threads;
-#ifdef CONFIG_PROC_FS
-       struct proc_dir_entry   *dir;
-#endif
-       const char              *name;
-} ____cacheline_internodealigned_in_smp;
+/* This include will go away once irq_desc usage is isolated to core code */
+#include <linux/irqdesc.h>
  
-extern void arch_init_copy_chip_data(struct irq_desc *old_desc,
-                                       struct irq_desc *desc, int node);
-extern void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc);
+/*
+ * Pick up the arch-dependent methods:
+ */
+#include <asm/hw_irq.h>
  
-#ifndef CONFIG_SPARSE_IRQ
-extern struct irq_desc irq_desc[NR_IRQS];
+#ifndef NR_IRQS_LEGACY
+# define NR_IRQS_LEGACY 0
  #endif
  
-#ifdef CONFIG_NUMA_IRQ_DESC
-extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int node);
-#else
-static inline struct irq_desc *move_irq_desc(struct irq_desc *desc, int node)
-{
-       return desc;
-}
+#ifndef ARCH_IRQ_INIT_FLAGS
+# define ARCH_IRQ_INIT_FLAGS   0
  #endif
  
-extern struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node);
-
-/*
- * Pick up the arch-dependent methods:
- */
-#include <asm/hw_irq.h>
+#define IRQ_DEFAULT_INIT_FLAGS (IRQ_DISABLED | ARCH_IRQ_INIT_FLAGS)
  
+struct irqaction;
  extern int setup_irq(unsigned int irq, struct irqaction *new);
  extern void remove_irq(unsigned int irq, struct irqaction *act);
  
  #ifdef CONFIG_GENERIC_HARDIRQS
  
-#ifdef CONFIG_SMP
-
-#ifdef CONFIG_GENERIC_PENDING_IRQ
-
+#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_PENDING_IRQ)
  void move_native_irq(int irq);
  void move_masked_irq(int irq);
-
-#else /* CONFIG_GENERIC_PENDING_IRQ */
-
-static inline void move_irq(int irq)
-{
-}
-
-static inline void move_native_irq(int irq)
-{
-}
-
-static inline void move_masked_irq(int irq)
-{
-}
-
-#endif /* CONFIG_GENERIC_PENDING_IRQ */
-
-#else /* CONFIG_SMP */
-
-#define move_native_irq(x)
-#define move_masked_irq(x)
-
-#endif /* CONFIG_SMP */
+#else
+static inline void move_native_irq(int irq) { }
+static inline void move_masked_irq(int irq) { }
+#endif
  
  extern int no_irq_affinity;
  
-static inline int irq_balancing_disabled(unsigned int irq)
-{
-       struct irq_desc *desc;
-
-       desc = irq_to_desc(irq);
-       return desc->status & IRQ_NO_BALANCING_MASK;
-}
-
  /* Handle irq action chains: */
  extern irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action);
  
@@ -293,42 +250,10 @@ extern void handle_percpu_irq(unsigned int irq, struct irq_desc *desc);
  extern void handle_bad_irq(unsigned int irq, struct irq_desc *desc);
  extern void handle_nested_irq(unsigned int irq);
  
-/*
- * Monolithic do_IRQ implementation.
- */
-#ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ
-extern unsigned int __do_IRQ(unsigned int irq);
-#endif
-
-/*
- * Architectures call this to let the generic IRQ layer
- * handle an interrupt. If the descriptor is attached to an
- * irqchip-style controller then we call the ->handle_irq() handler,
- * and it calls __do_IRQ() if it's attached to an irqtype-style controller.
- */
-static inline void generic_handle_irq_desc(unsigned int irq, struct irq_desc *desc)
-{
-#ifdef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ
-       desc->handle_irq(irq, desc);
-#else
-       if (likely(desc->handle_irq))
-               desc->handle_irq(irq, desc);
-       else
-               __do_IRQ(irq);
-#endif
-}
-
-static inline void generic_handle_irq(unsigned int irq)
-{
-       generic_handle_irq_desc(irq, irq_to_desc(irq));
-}
-
  /* Handling of unhandled and spurious interrupts: */
  extern void note_interrupt(unsigned int irq, struct irq_desc *desc,
                            irqreturn_t action_ret);
  
-/* Resending of interrupts :*/
-void check_irq_resend(struct irq_desc *desc, unsigned int irq);
  
  /* Enable/disable irq debugging output: */
  extern int noirqdebug_setup(char *str);
@@ -351,16 +276,6 @@ extern void
  __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
                   const char *name);
  
-/* caller has locked the irq_desc and both params are valid */
-static inline void __set_irq_handler_unlocked(int irq,
-                                             irq_flow_handler_t handler)
-{
-       struct irq_desc *desc;
-
-       desc = irq_to_desc(irq);
-       desc->handle_irq = handler;
-}
-
  /*
   * Set a highlevel flow handler for a given IRQ:
   */
@@ -384,141 +299,121 @@ set_irq_chained_handler(unsigned int irq,
  
  extern void set_irq_nested_thread(unsigned int irq, int nest);
  
-extern void set_irq_noprobe(unsigned int irq);
-extern void set_irq_probe(unsigned int irq);
+void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set);
+
+static inline void irq_set_status_flags(unsigned int irq, unsigned long set)
+{
+       irq_modify_status(irq, 0, set);
+}
+
+static inline void irq_clear_status_flags(unsigned int irq, unsigned long clr)
+{
+       irq_modify_status(irq, clr, 0);
+}
+
+static inline void set_irq_noprobe(unsigned int irq)
+{
+       irq_modify_status(irq, 0, IRQ_NOPROBE);
+}
+
+static inline void set_irq_probe(unsigned int irq)
+{
+       irq_modify_status(irq, IRQ_NOPROBE, 0);
+}
  
  /* Handle dynamic irq creation and destruction */
  extern unsigned int create_irq_nr(unsigned int irq_want, int node);
  extern int create_irq(void);
  extern void destroy_irq(unsigned int irq);
  
-/* Test to see if a driver has successfully requested an irq */
-static inline int irq_has_action(unsigned int irq)
+/*
+ * Dynamic irq helper functions. Obsolete. Use irq_alloc_desc* and
+ * irq_free_desc instead.
+ */
+extern void dynamic_irq_cleanup(unsigned int irq);
+static inline void dynamic_irq_init(unsigned int irq)
  {
-       struct irq_desc *desc = irq_to_desc(irq);
-       return desc->action != NULL;
+       dynamic_irq_cleanup(irq);
  }
  
-/* Dynamic irq helper functions */
-extern void dynamic_irq_init(unsigned int irq);
-void dynamic_irq_init_keep_chip_data(unsigned int irq);
-extern void dynamic_irq_cleanup(unsigned int irq);
-void dynamic_irq_cleanup_keep_chip_data(unsigned int irq);
-
  /* Set/get chip/data for an IRQ: */
  extern int set_irq_chip(unsigned int irq, struct irq_chip *chip);
  extern int set_irq_data(unsigned int irq, void *data);
  extern int set_irq_chip_data(unsigned int irq, void *data);
  extern int set_irq_type(unsigned int irq, unsigned int type);
  extern int set_irq_msi(unsigned int irq, struct msi_desc *entry);
+extern struct irq_data *irq_get_irq_data(unsigned int irq);
  
-#define get_irq_chip(irq)      (irq_to_desc(irq)->chip)
-#define get_irq_chip_data(irq) (irq_to_desc(irq)->chip_data)
-#define get_irq_data(irq)      (irq_to_desc(irq)->handler_data)
-#define get_irq_msi(irq)       (irq_to_desc(irq)->msi_desc)
-
-#define get_irq_desc_chip(desc)                ((desc)->chip)
-#define get_irq_desc_chip_data(desc)   ((desc)->chip_data)
-#define get_irq_desc_data(desc)                ((desc)->handler_data)
-#define get_irq_desc_msi(desc)         ((desc)->msi_desc)
-
-#endif /* CONFIG_GENERIC_HARDIRQS */
-
-#endif /* !CONFIG_S390 */
-
-#ifdef CONFIG_SMP
-/**
- * alloc_desc_masks - allocate cpumasks for irq_desc
- * @desc:      pointer to irq_desc struct
- * @node:      node which will be handling the cpumasks
- * @boot:      true if need bootmem
- *
- * Allocates affinity and pending_mask cpumask if required.
- * Returns true if successful (or not required).
- */
-static inline bool alloc_desc_masks(struct irq_desc *desc, int node,
-                                                       bool boot)
+static inline struct irq_chip *get_irq_chip(unsigned int irq)
  {
-       gfp_t gfp = GFP_ATOMIC;
-
-       if (boot)
-               gfp = GFP_NOWAIT;
-
-#ifdef CONFIG_CPUMASK_OFFSTACK
-       if (!alloc_cpumask_var_node(&desc->affinity, gfp, node))
-               return false;
+       struct irq_data *d = irq_get_irq_data(irq);
+       return d ? d->chip : NULL;
+}
  
-#ifdef CONFIG_GENERIC_PENDING_IRQ
-       if (!alloc_cpumask_var_node(&desc->pending_mask, gfp, node)) {
-               free_cpumask_var(desc->affinity);
-               return false;
-       }
-#endif
-#endif
-       return true;
+static inline struct irq_chip *irq_data_get_irq_chip(struct irq_data *d)
+{
+       return d->chip;
  }
  
-static inline void init_desc_masks(struct irq_desc *desc)
+static inline void *get_irq_chip_data(unsigned int irq)
  {
-       cpumask_setall(desc->affinity);
-#ifdef CONFIG_GENERIC_PENDING_IRQ
-       cpumask_clear(desc->pending_mask);
-#endif
+       struct irq_data *d = irq_get_irq_data(irq);
+       return d ? d->chip_data : NULL;
  }
  
-/**
- * init_copy_desc_masks - copy cpumasks for irq_desc
- * @old_desc:  pointer to old irq_desc struct
- * @new_desc:  pointer to new irq_desc struct
- *
- * Insures affinity and pending_masks are copied to new irq_desc.
- * If !CONFIG_CPUMASKS_OFFSTACK the cpumasks are embedded in the
- * irq_desc struct so the copy is redundant.
- */
+static inline void *irq_data_get_irq_chip_data(struct irq_data *d)
+{
+       return d->chip_data;
+}
  
-static inline void init_copy_desc_masks(struct irq_desc *old_desc,
-                                       struct irq_desc *new_desc)
+static inline void *get_irq_data(unsigned int irq)
  {
-#ifdef CONFIG_CPUMASK_OFFSTACK
-       cpumask_copy(new_desc->affinity, old_desc->affinity);
+       struct irq_data *d = irq_get_irq_data(irq);
+       return d ? d->handler_data : NULL;
+}
  
-#ifdef CONFIG_GENERIC_PENDING_IRQ
-       cpumask_copy(new_desc->pending_mask, old_desc->pending_mask);
-#endif
-#endif
+static inline void *irq_data_get_irq_data(struct irq_data *d)
+{
+       return d->handler_data;
  }
  
-static inline void free_desc_masks(struct irq_desc *old_desc,
-                                  struct irq_desc *new_desc)
+static inline struct msi_desc *get_irq_msi(unsigned int irq)
  {
-       free_cpumask_var(old_desc->affinity);
+       struct irq_data *d = irq_get_irq_data(irq);
+       return d ? d->msi_desc : NULL;
+}
  
-#ifdef CONFIG_GENERIC_PENDING_IRQ
-       free_cpumask_var(old_desc->pending_mask);
-#endif
+static inline struct msi_desc *irq_data_get_msi(struct irq_data *d)
+{
+       return d->msi_desc;
  }
  
-#else /* !CONFIG_SMP */
+int irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node);
+void irq_free_descs(unsigned int irq, unsigned int cnt);
+int irq_reserve_irqs(unsigned int from, unsigned int cnt);
  
-static inline bool alloc_desc_masks(struct irq_desc *desc, int node,
-                                                               bool boot)
+static inline int irq_alloc_desc(int node)
  {
-       return true;
+       return irq_alloc_descs(-1, 0, 1, node);
  }
  
-static inline void init_desc_masks(struct irq_desc *desc)
+static inline int irq_alloc_desc_at(unsigned int at, int node)
  {
+       return irq_alloc_descs(at, at, 1, node);
  }
  
-static inline void init_copy_desc_masks(struct irq_desc *old_desc,
-                                       struct irq_desc *new_desc)
+static inline int irq_alloc_desc_from(unsigned int from, int node)
  {
+       return irq_alloc_descs(-1, from, 1, node);
  }
  
-static inline void free_desc_masks(struct irq_desc *old_desc,
-                                  struct irq_desc *new_desc)
+static inline void irq_free_desc(unsigned int irq)
  {
+       irq_free_descs(irq, 1);
  }
-#endif /* CONFIG_SMP */
+
+#endif /* CONFIG_GENERIC_HARDIRQS */
+
+#endif /* !CONFIG_S390 */
  
  #endif /* _LINUX_IRQ_H */
diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h

new file mode 100644 (file)

index 0000000..979c68c
--- /dev/null
+++ b/include/linux/irqdesc.h
@@ -0,0 +1,159 @@
+#ifndef _LINUX_IRQDESC_H
+#define _LINUX_IRQDESC_H
+
+/*
+ * Core internal functions to deal with irq descriptors
+ *
+ * This include will move to kernel/irq once we have cleaned up the tree.
+ * For now it's included from <linux/irq.h>
+ */
+
+struct proc_dir_entry;
+struct timer_rand_state;
+/**
+ * struct irq_desc - interrupt descriptor
+ * @irq_data:          per irq and chip data passed down to chip functions
+ * @timer_rand_state:  pointer to timer rand state struct
+ * @kstat_irqs:                irq stats per cpu
+ * @handle_irq:                highlevel irq-events handler [if NULL, __do_IRQ()]
+ * @action:            the irq action chain
+ * @status:            status information
+ * @depth:             disable-depth, for nested irq_disable() calls
+ * @wake_depth:                enable depth, for multiple set_irq_wake() callers
+ * @irq_count:         stats field to detect stalled irqs
+ * @last_unhandled:    aging timer for unhandled count
+ * @irqs_unhandled:    stats field for spurious unhandled interrupts
+ * @lock:              locking for SMP
+ * @pending_mask:      pending rebalanced interrupts
+ * @threads_active:    number of irqaction threads currently running
+ * @wait_for_threads:  wait queue for sync_irq to wait for threaded handlers
+ * @dir:               /proc/irq/ procfs entry
+ * @name:              flow handler name for /proc/interrupts output
+ */
+struct irq_desc {
+
+#ifdef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
+       struct irq_data         irq_data;
+#else
+       /*
+        * This union will go away, once we fixed the direct access to
+        * irq_desc all over the place. The direct fields are a 1:1
+        * overlay of irq_data.
+        */
+       union {
+               struct irq_data         irq_data;
+               struct {
+                       unsigned int            irq;
+                       unsigned int            node;
+                       struct irq_chip         *chip;
+                       void                    *handler_data;
+                       void                    *chip_data;
+                       struct msi_desc         *msi_desc;
+#ifdef CONFIG_SMP
+                       cpumask_var_t           affinity;
+#endif
+               };
+       };
+#endif
+
+       struct timer_rand_state *timer_rand_state;
+       unsigned int            *kstat_irqs;
+       irq_flow_handler_t      handle_irq;
+       struct irqaction        *action;        /* IRQ action list */
+       unsigned int            status;         /* IRQ status */
+
+       unsigned int            depth;          /* nested irq disables */
+       unsigned int            wake_depth;     /* nested wake enables */
+       unsigned int            irq_count;      /* For detecting broken IRQs */
+       unsigned long           last_unhandled; /* Aging timer for unhandled count */
+       unsigned int            irqs_unhandled;
+       raw_spinlock_t          lock;
+#ifdef CONFIG_SMP
+       const struct cpumask    *affinity_hint;
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+       cpumask_var_t           pending_mask;
+#endif
+#endif
+       atomic_t                threads_active;
+       wait_queue_head_t       wait_for_threads;
+#ifdef CONFIG_PROC_FS
+       struct proc_dir_entry   *dir;
+#endif
+       const char              *name;
+} ____cacheline_internodealigned_in_smp;
+
+#ifndef CONFIG_SPARSE_IRQ
+extern struct irq_desc irq_desc[NR_IRQS];
+#endif
+
+/* Will be removed once the last users in power and sh are gone */
+extern struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node);
+static inline struct irq_desc *move_irq_desc(struct irq_desc *desc, int node)
+{
+       return desc;
+}
+
+#ifdef CONFIG_GENERIC_HARDIRQS
+
+#define get_irq_desc_chip(desc)                ((desc)->irq_data.chip)
+#define get_irq_desc_chip_data(desc)   ((desc)->irq_data.chip_data)
+#define get_irq_desc_data(desc)                ((desc)->irq_data.handler_data)
+#define get_irq_desc_msi(desc)         ((desc)->irq_data.msi_desc)
+
+/*
+ * Monolithic do_IRQ implementation.
+ */
+#ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ
+extern unsigned int __do_IRQ(unsigned int irq);
+#endif
+
+/*
+ * Architectures call this to let the generic IRQ layer
+ * handle an interrupt. If the descriptor is attached to an
+ * irqchip-style controller then we call the ->handle_irq() handler,
+ * and it calls __do_IRQ() if it's attached to an irqtype-style controller.
+ */
+static inline void generic_handle_irq_desc(unsigned int irq, struct irq_desc *desc)
+{
+#ifdef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ
+       desc->handle_irq(irq, desc);
+#else
+       if (likely(desc->handle_irq))
+               desc->handle_irq(irq, desc);
+       else
+               __do_IRQ(irq);
+#endif
+}
+
+static inline void generic_handle_irq(unsigned int irq)
+{
+       generic_handle_irq_desc(irq, irq_to_desc(irq));
+}
+
+/* Test to see if a driver has successfully requested an irq */
+static inline int irq_has_action(unsigned int irq)
+{
+       struct irq_desc *desc = irq_to_desc(irq);
+       return desc->action != NULL;
+}
+
+static inline int irq_balancing_disabled(unsigned int irq)
+{
+       struct irq_desc *desc;
+
+       desc = irq_to_desc(irq);
+       return desc->status & IRQ_NO_BALANCING_MASK;
+}
+
+/* caller has locked the irq_desc and both params are valid */
+static inline void __set_irq_handler_unlocked(int irq,
+                                             irq_flow_handler_t handler)
+{
+       struct irq_desc *desc;
+
+       desc = irq_to_desc(irq);
+       desc->handle_irq = handler;
+}
+#endif
+
+#endif
diff --git a/include/linux/irqnr.h b/include/linux/irqnr.h

index 7bf89bc..05aa8c2 100644 (file)
--- a/include/linux/irqnr.h
+++ b/include/linux/irqnr.h
@@ -25,6 +25,7 @@
  
  extern int nr_irqs;
  extern struct irq_desc *irq_to_desc(unsigned int irq);
+unsigned int irq_get_next_irq(unsigned int offset);
  
  # define for_each_irq_desc(irq, desc)                                  \
         for (irq = 0, desc = irq_to_desc(irq); irq < nr_irqs;           \
@@ -47,6 +48,10 @@ extern struct irq_desc *irq_to_desc(unsigned int irq);
  #define irq_node(irq)  0
  #endif
  
+# define for_each_active_irq(irq)                      \
+       for (irq = irq_get_next_irq(0); irq < nr_irqs;  \
+            irq = irq_get_next_irq(irq + 1))
+
  #endif /* CONFIG_GENERIC_HARDIRQS */
  
  #define for_each_irq_nr(irq)                   \
diff --git a/include/linux/kernel.h b/include/linux/kernel.h

index 2b0a35e..1759ba5 100644 (file)
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -58,7 +58,18 @@ extern const char linux_proc_banner[];
  
  #define FIELD_SIZEOF(t, f) (sizeof(((t*)0)->f))
  #define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
-#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y))
+#define roundup(x, y) (                                        \
+{                                                      \
+       typeof(y) __y = y;                              \
+       (((x) + (__y - 1)) / __y) * __y;                \
+}                                                      \
+)
+#define rounddown(x, y) (                              \
+{                                                      \
+       typeof(x) __x = (x);                            \
+       __x - (__x % (y));                              \
+}                                                      \
+)
  #define DIV_ROUND_CLOSEST(x, divisor)(                 \
  {                                                      \
         typeof(divisor) __divisor = divisor;            \
diff --git a/include/linux/key.h b/include/linux/key.h

index cd50dfa..3db0adc 100644 (file)
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -178,8 +178,9 @@ struct key {
          */
         union {
                 unsigned long           value;
+               void __rcu              *rcudata;
                 void                    *data;
-               struct keyring_list     *subscriptions;
+               struct keyring_list __rcu *subscriptions;
         } payload;
  };
  
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h

index c13cc48..ac740b2 100644 (file)
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -205,7 +205,7 @@ struct kvm {
  
         struct mutex irq_lock;
  #ifdef CONFIG_HAVE_KVM_IRQCHIP
-       struct kvm_irq_routing_table *irq_routing;
+       struct kvm_irq_routing_table __rcu *irq_routing;
         struct hlist_head mask_notifier_list;
         struct hlist_head irq_ack_notifier_list;
  #endif
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h

index 06aed83..71c09b2 100644 (file)
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -31,6 +31,17 @@ extern int lock_stat;
  
  #define MAX_LOCKDEP_SUBCLASSES         8UL
  
+/*
+ * NR_LOCKDEP_CACHING_CLASSES ... Number of classes
+ * cached in the instance of lockdep_map
+ *
+ * Currently the main class (subclass == 0) and the single-depth subclass
+ * are cached in lockdep_map. This optimization mainly targets
+ * rq->lock: double_rq_lock() acquires this highly contended lock with
+ * single depth.
+ */
+#define NR_LOCKDEP_CACHING_CLASSES     2
+
  /*
   * Lock-classes are keyed via unique addresses, by embedding the
   * lockclass-key into the kernel (or module) .data section. (For
@@ -138,7 +149,7 @@ void clear_lock_stats(struct lock_class *class);
   */
  struct lockdep_map {
         struct lock_class_key           *key;
-       struct lock_class               *class_cache;
+       struct lock_class               *class_cache[NR_LOCKDEP_CACHING_CLASSES];
         const char                      *name;
  #ifdef CONFIG_LOCK_STAT
         int                             cpu;
@@ -424,14 +435,6 @@ do {                                                               \
  
  #endif /* CONFIG_LOCKDEP */
  
-#ifdef CONFIG_GENERIC_HARDIRQS
-extern void early_init_irq_lock_class(void);
-#else
-static inline void early_init_irq_lock_class(void)
-{
-}
-#endif
-
  #ifdef CONFIG_TRACE_IRQFLAGS
  extern void early_boot_irqs_off(void);
  extern void early_boot_irqs_on(void);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h

index ee7e258..cb57d65 100644 (file)
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -299,7 +299,7 @@ struct mm_struct {
          * new_owner->mm == mm
          * new_owner->alloc_lock is held
          */
-       struct task_struct *owner;
+       struct task_struct __rcu *owner;
  #endif
  
  #ifdef CONFIG_PROC_FS
diff --git a/include/linux/msi.h b/include/linux/msi.h

index 91b05c1..05acced 100644 (file)
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -10,12 +10,13 @@ struct msi_msg {
  };
  
  /* Helper functions */
-struct irq_desc;
-extern void mask_msi_irq(unsigned int irq);
-extern void unmask_msi_irq(unsigned int irq);
-extern void read_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg);
-extern void get_cached_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg);
-extern void write_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg);
+struct irq_data;
+struct msi_desc;
+extern void mask_msi_irq(struct irq_data *data);
+extern void unmask_msi_irq(struct irq_data *data);
+extern void __read_msi_msg(struct msi_desc *entry, struct msi_msg *msg);
+extern void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg);
+extern void __write_msi_msg(struct msi_desc *entry, struct msi_msg *msg);
  extern void read_msi_msg(unsigned int irq, struct msi_msg *msg);
  extern void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg);
  extern void write_msi_msg(unsigned int irq, struct msi_msg *msg);
diff --git a/include/linux/netfilter/nfnetlink_conntrack.h b/include/linux/netfilter/nfnetlink_conntrack.h

index 9ed534c..70cd060 100644 (file)
--- a/include/linux/netfilter/nfnetlink_conntrack.h
+++ b/include/linux/netfilter/nfnetlink_conntrack.h
@@ -39,8 +39,9 @@ enum ctattr_type {
         CTA_TUPLE_MASTER,
         CTA_NAT_SEQ_ADJ_ORIG,
         CTA_NAT_SEQ_ADJ_REPLY,
-       CTA_SECMARK,
+       CTA_SECMARK,            /* obsolete */
         CTA_ZONE,
+       CTA_SECCTX,
         __CTA_MAX
  };
  #define CTA_MAX (__CTA_MAX - 1)
@@ -172,4 +173,11 @@ enum ctattr_help {
  };
  #define CTA_HELP_MAX (__CTA_HELP_MAX - 1)
  
+enum ctattr_secctx {
+       CTA_SECCTX_UNSPEC,
+       CTA_SECCTX_NAME,
+       __CTA_SECCTX_MAX
+};
+#define CTA_SECCTX_MAX (__CTA_SECCTX_MAX - 1)
+
  #endif /* _IPCONNTRACK_NETLINK_H */
diff --git a/include/linux/netfilter/xt_SECMARK.h b/include/linux/netfilter/xt_SECMARK.h

index 6fcd344..989092b 100644 (file)
--- a/include/linux/netfilter/xt_SECMARK.h
+++ b/include/linux/netfilter/xt_SECMARK.h
@@ -11,18 +11,12 @@
   * packets are being marked for.
   */
  #define SECMARK_MODE_SEL       0x01            /* SELinux */
-#define SECMARK_SELCTX_MAX     256
-
-struct xt_secmark_target_selinux_info {
-       __u32 selsid;
-       char selctx[SECMARK_SELCTX_MAX];
-};
+#define SECMARK_SECCTX_MAX     256
  
  struct xt_secmark_target_info {
         __u8 mode;
-       union {
-               struct xt_secmark_target_selinux_info sel;
-       } u;
+       __u32 secid;
+       char secctx[SECMARK_SECCTX_MAX];
  };
  
  #endif /*_XT_SECMARK_H_target */
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h

index 508f8cf..d0edf7d 100644 (file)
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -185,7 +185,7 @@ struct nfs_inode {
         struct nfs4_cached_acl  *nfs4_acl;
          /* NFSv4 state */
         struct list_head        open_states;
-       struct nfs_delegation   *delegation;
+       struct nfs_delegation __rcu *delegation;
         fmode_t                  delegation_state;
         struct rw_semaphore     rwsem;
  #endif /* CONFIG_NFS_V4*/
diff --git a/include/linux/notifier.h b/include/linux/notifier.h

index b2f1a4d..2026f9e 100644 (file)
--- a/include/linux/notifier.h
+++ b/include/linux/notifier.h
@@ -49,28 +49,28 @@
  
  struct notifier_block {
         int (*notifier_call)(struct notifier_block *, unsigned long, void *);
-       struct notifier_block *next;
+       struct notifier_block __rcu *next;
         int priority;
  };
  
  struct atomic_notifier_head {
         spinlock_t lock;
-       struct notifier_block *head;
+       struct notifier_block __rcu *head;
  };
  
  struct blocking_notifier_head {
         struct rw_semaphore rwsem;
-       struct notifier_block *head;
+       struct notifier_block __rcu *head;
  };
  
  struct raw_notifier_head {
-       struct notifier_block *head;
+       struct notifier_block __rcu *head;
  };
  
  struct srcu_notifier_head {
         struct mutex mutex;
         struct srcu_struct srcu;
-       struct notifier_block *head;
+       struct notifier_block __rcu *head;
  };
  
  #define ATOMIC_INIT_NOTIFIER_HEAD(name) do {   \
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h

index 570fdde..2615c37 100644 (file)
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -517,6 +517,7 @@
  #define PCI_DEVICE_ID_AMD_11H_NB_DRAM  0x1302
  #define PCI_DEVICE_ID_AMD_11H_NB_MISC  0x1303
  #define PCI_DEVICE_ID_AMD_11H_NB_LINK  0x1304
+#define PCI_DEVICE_ID_AMD_15H_NB_MISC  0x1603
  #define PCI_DEVICE_ID_AMD_LANCE                0x2000
  #define PCI_DEVICE_ID_AMD_LANCE_HOME   0x2001
  #define PCI_DEVICE_ID_AMD_SCSI         0x2020
diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h

index ce2dc65..27ef6b1 100644 (file)
--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -138,6 +138,15 @@
         DEFINE_PER_CPU_SECTION(type, name, "..page_aligned")            \
         __aligned(PAGE_SIZE)
  
+/*
+ * Declaration/definition used for per-CPU variables that must be read mostly.
+ */
+#define DECLARE_PER_CPU_READ_MOSTLY(type, name)                        \
+       DECLARE_PER_CPU_SECTION(type, name, "..readmostly")
+
+#define DEFINE_PER_CPU_READ_MOSTLY(type, name)                         \
+       DEFINE_PER_CPU_SECTION(type, name, "..readmostly")
+
  /*
   * Intermodule exports for per-CPU variables.  sparse forgets about
   * address space across EXPORT_SYMBOL(), change EXPORT_SYMBOL() to
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h

index 634b8e6..a39cbed 100644 (file)
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -47,6 +47,8 @@ static inline void *radix_tree_indirect_to_ptr(void *ptr)
  {
         return (void *)((unsigned long)ptr & ~RADIX_TREE_INDIRECT_PTR);
  }
+#define radix_tree_indirect_to_ptr(ptr) \
+       radix_tree_indirect_to_ptr((void __force *)(ptr))
  
  static inline int radix_tree_is_indirect_ptr(void *ptr)
  {
@@ -61,7 +63,7 @@ static inline int radix_tree_is_indirect_ptr(void *ptr)
  struct radix_tree_root {
         unsigned int            height;
         gfp_t                   gfp_mask;
-       struct radix_tree_node  *rnode;
+       struct radix_tree_node  __rcu *rnode;
  };
  
  #define RADIX_TREE_INIT(mask)  {                                       \
diff --git a/include/linux/rculist.h b/include/linux/rculist.h

index 4ec3b38..f31ef61 100644 (file)
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -9,6 +9,21 @@
  #include <linux/list.h>
  #include <linux/rcupdate.h>
  
+/*
+ * Why is there no list_empty_rcu()?  Because list_empty() serves this
+ * purpose.  The list_empty() function fetches the RCU-protected pointer
+ * and compares it to the address of the list head, but neither dereferences
+ * this pointer itself nor provides this pointer to the caller.  Therefore,
+ * it is not necessary to use rcu_dereference(), so that list_empty() can
+ * be used anywhere you would want to use a list_empty_rcu().
+ */
+
+/*
+ * Return the ->next pointer of a list_head in an RCU-safe
+ * way; we must not access it directly.
+ */
+#define list_next_rcu(list)    (*((struct list_head __rcu **)(&(list)->next)))
+
  /*
   * Insert a new entry between two known consecutive entries.
   *
@@ -20,7 +35,7 @@ static inline void __list_add_rcu(struct list_head *new,
  {
         new->next = next;
         new->prev = prev;
-       rcu_assign_pointer(prev->next, new);
+       rcu_assign_pointer(list_next_rcu(prev), new);
         next->prev = new;
  }
  
@@ -138,7 +153,7 @@ static inline void list_replace_rcu(struct list_head *old,
  {
         new->next = old->next;
         new->prev = old->prev;
-       rcu_assign_pointer(new->prev->next, new);
+       rcu_assign_pointer(list_next_rcu(new->prev), new);
         new->next->prev = new;
         old->prev = LIST_POISON2;
  }
@@ -193,7 +208,7 @@ static inline void list_splice_init_rcu(struct list_head *list,
          */
  
         last->next = at;
-       rcu_assign_pointer(head->next, first);
+       rcu_assign_pointer(list_next_rcu(head), first);
         first->prev = head;
         at->prev = last;
  }
@@ -208,7 +223,9 @@ static inline void list_splice_init_rcu(struct list_head *list,
   * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock().
   */
  #define list_entry_rcu(ptr, type, member) \
-       container_of(rcu_dereference_raw(ptr), type, member)
+       ({typeof (*ptr) __rcu *__ptr = (typeof (*ptr) __rcu __force *)ptr; \
+        container_of((typeof(ptr))rcu_dereference_raw(__ptr), type, member); \
+       })
  
  /**
   * list_first_entry_rcu - get the first element from a list
@@ -225,9 +242,9 @@ static inline void list_splice_init_rcu(struct list_head *list,
         list_entry_rcu((ptr)->next, type, member)
  
  #define __list_for_each_rcu(pos, head) \
-       for (pos = rcu_dereference_raw((head)->next); \
+       for (pos = rcu_dereference_raw(list_next_rcu(head)); \
                 pos != (head); \
-               pos = rcu_dereference_raw(pos->next))
+               pos = rcu_dereference_raw(list_next_rcu((pos)))
  
  /**
   * list_for_each_entry_rcu     -       iterate over rcu list of given type
@@ -257,9 +274,9 @@ static inline void list_splice_init_rcu(struct list_head *list,
   * as long as the traversal is guarded by rcu_read_lock().
   */
  #define list_for_each_continue_rcu(pos, head) \
-       for ((pos) = rcu_dereference_raw((pos)->next); \
+       for ((pos) = rcu_dereference_raw(list_next_rcu(pos)); \
                 prefetch((pos)->next), (pos) != (head); \
-               (pos) = rcu_dereference_raw((pos)->next))
+               (pos) = rcu_dereference_raw(list_next_rcu(pos)))
  
  /**
   * list_for_each_entry_continue_rcu - continue iteration over list of given type
@@ -314,12 +331,19 @@ static inline void hlist_replace_rcu(struct hlist_node *old,
  
         new->next = next;
         new->pprev = old->pprev;
-       rcu_assign_pointer(*new->pprev, new);
+       rcu_assign_pointer(*(struct hlist_node __rcu **)new->pprev, new);
         if (next)
                 new->next->pprev = &new->next;
         old->pprev = LIST_POISON2;
  }
  
+/*
+ * return the first or the next element in an RCU protected hlist
+ */
+#define hlist_first_rcu(head)  (*((struct hlist_node __rcu **)(&(head)->first)))
+#define hlist_next_rcu(node)   (*((struct hlist_node __rcu **)(&(node)->next)))
+#define hlist_pprev_rcu(node)  (*((struct hlist_node __rcu **)((node)->pprev)))
+
  /**
   * hlist_add_head_rcu
   * @n: the element to add to the hash list.
@@ -346,7 +370,7 @@ static inline void hlist_add_head_rcu(struct hlist_node *n,
  
         n->next = first;
         n->pprev = &h->first;
-       rcu_assign_pointer(h->first, n);
+       rcu_assign_pointer(hlist_first_rcu(h), n);
         if (first)
                 first->pprev = &n->next;
  }
@@ -374,7 +398,7 @@ static inline void hlist_add_before_rcu(struct hlist_node *n,
  {
         n->pprev = next->pprev;
         n->next = next;
-       rcu_assign_pointer(*(n->pprev), n);
+       rcu_assign_pointer(hlist_pprev_rcu(n), n);
         next->pprev = &n->next;
  }
  
@@ -401,15 +425,15 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev,
  {
         n->next = prev->next;
         n->pprev = &prev->next;
-       rcu_assign_pointer(prev->next, n);
+       rcu_assign_pointer(hlist_next_rcu(prev), n);
         if (n->next)
                 n->next->pprev = &n->next;
  }
  
-#define __hlist_for_each_rcu(pos, head)                        \
-       for (pos = rcu_dereference((head)->first);      \
-            pos && ({ prefetch(pos->next); 1; });      \
-            pos = rcu_dereference(pos->next))
+#define __hlist_for_each_rcu(pos, head)                                \
+       for (pos = rcu_dereference(hlist_first_rcu(head));      \
+            pos && ({ prefetch(pos->next); 1; });              \
+            pos = rcu_dereference(hlist_next_rcu(pos)))
  
  /**
   * hlist_for_each_entry_rcu - iterate over rcu list of given type
@@ -422,11 +446,11 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev,
   * the _rcu list-mutation primitives such as hlist_add_head_rcu()
   * as long as the traversal is guarded by rcu_read_lock().
   */
-#define hlist_for_each_entry_rcu(tpos, pos, head, member)               \
-       for (pos = rcu_dereference_raw((head)->first);                   \
+#define hlist_for_each_entry_rcu(tpos, pos, head, member)              \
+       for (pos = rcu_dereference_raw(hlist_first_rcu(head));          \
                 pos && ({ prefetch(pos->next); 1; }) &&                  \
                 ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); \
-               pos = rcu_dereference_raw(pos->next))
+               pos = rcu_dereference_raw(hlist_next_rcu(pos)))
  
  /**
   * hlist_for_each_entry_rcu_bh - iterate over rcu list of given type
diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h

index b70ffe5..2ae1371 100644 (file)
--- a/include/linux/rculist_nulls.h
+++ b/include/linux/rculist_nulls.h
@@ -37,6 +37,12 @@ static inline void hlist_nulls_del_init_rcu(struct hlist_nulls_node *n)
         }
  }
  
+#define hlist_nulls_first_rcu(head) \
+       (*((struct hlist_nulls_node __rcu __force **)&(head)->first))
+
+#define hlist_nulls_next_rcu(node) \
+       (*((struct hlist_nulls_node __rcu __force **)&(node)->next))
+
  /**
   * hlist_nulls_del_rcu - deletes entry from hash list without re-initialization
   * @n: the element to delete from the hash list.
@@ -88,7 +94,7 @@ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n,
  
         n->next = first;
         n->pprev = &h->first;
-       rcu_assign_pointer(h->first, n);
+       rcu_assign_pointer(hlist_nulls_first_rcu(h), n);
         if (!is_a_nulls(first))
                 first->pprev = &n->next;
  }
@@ -100,11 +106,11 @@ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n,
   * @member:    the name of the hlist_nulls_node within the struct.
   *
   */
-#define hlist_nulls_for_each_entry_rcu(tpos, pos, head, member) \
-       for (pos = rcu_dereference_raw((head)->first);                   \
-               (!is_a_nulls(pos)) &&                   \
+#define hlist_nulls_for_each_entry_rcu(tpos, pos, head, member)                        \
+       for (pos = rcu_dereference_raw(hlist_nulls_first_rcu(head));            \
+               (!is_a_nulls(pos)) &&                                           \
                 ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1; }); \
-               pos = rcu_dereference_raw(pos->next))
+               pos = rcu_dereference_raw(hlist_nulls_next_rcu(pos)))
  
  #endif
  #endif
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h

index 83af1f8..03cda7b 100644 (file)
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -41,11 +41,15 @@
  #include <linux/lockdep.h>
  #include <linux/completion.h>
  #include <linux/debugobjects.h>
+#include <linux/compiler.h>
  
  #ifdef CONFIG_RCU_TORTURE_TEST
  extern int rcutorture_runnable; /* for sysctl */
  #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
  
+#define ULONG_CMP_GE(a, b)     (ULONG_MAX / 2 >= (a) - (b))
+#define ULONG_CMP_LT(a, b)     (ULONG_MAX / 2 < (a) - (b))
+
  /**
   * struct rcu_head - callback structure for use with RCU
   * @next: next update requests in a list
@@ -57,29 +61,94 @@ struct rcu_head {
  };
  
  /* Exported common interfaces */
-extern void rcu_barrier(void);
+extern void call_rcu_sched(struct rcu_head *head,
+                          void (*func)(struct rcu_head *rcu));
+extern void synchronize_sched(void);
  extern void rcu_barrier_bh(void);
  extern void rcu_barrier_sched(void);
  extern void synchronize_sched_expedited(void);
  extern int sched_expedited_torture_stats(char *page);
  
+static inline void __rcu_read_lock_bh(void)
+{
+       local_bh_disable();
+}
+
+static inline void __rcu_read_unlock_bh(void)
+{
+       local_bh_enable();
+}
+
+#ifdef CONFIG_PREEMPT_RCU
+
+extern void __rcu_read_lock(void);
+extern void __rcu_read_unlock(void);
+void synchronize_rcu(void);
+
+/*
+ * Defined as a macro as it is a very low level header included from
+ * areas that don't even know about current.  This gives the rcu_read_lock()
+ * nesting depth, but makes sense only if CONFIG_PREEMPT_RCU -- in other
+ * types of kernel builds, the rcu_read_lock() nesting depth is unknowable.
+ */
+#define rcu_preempt_depth() (current->rcu_read_lock_nesting)
+
+#else /* #ifdef CONFIG_PREEMPT_RCU */
+
+static inline void __rcu_read_lock(void)
+{
+       preempt_disable();
+}
+
+static inline void __rcu_read_unlock(void)
+{
+       preempt_enable();
+}
+
+static inline void synchronize_rcu(void)
+{
+       synchronize_sched();
+}
+
+static inline int rcu_preempt_depth(void)
+{
+       return 0;
+}
+
+#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
+
  /* Internal to kernel */
  extern void rcu_init(void);
+extern void rcu_sched_qs(int cpu);
+extern void rcu_bh_qs(int cpu);
+extern void rcu_check_callbacks(int cpu, int user);
+struct notifier_block;
+
+#ifdef CONFIG_NO_HZ
+
+extern void rcu_enter_nohz(void);
+extern void rcu_exit_nohz(void);
+
+#else /* #ifdef CONFIG_NO_HZ */
+
+static inline void rcu_enter_nohz(void)
+{
+}
+
+static inline void rcu_exit_nohz(void)
+{
+}
+
+#endif /* #else #ifdef CONFIG_NO_HZ */
  
  #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU)
  #include <linux/rcutree.h>
-#elif defined(CONFIG_TINY_RCU)
+#elif defined(CONFIG_TINY_RCU) || defined(CONFIG_TINY_PREEMPT_RCU)
  #include <linux/rcutiny.h>
  #else
  #error "Unknown RCU implementation specified to kernel configuration"
  #endif
  
-#define RCU_HEAD_INIT  { .next = NULL, .func = NULL }
-#define RCU_HEAD(head) struct rcu_head head = RCU_HEAD_INIT
-#define INIT_RCU_HEAD(ptr) do { \
-       (ptr)->next = NULL; (ptr)->func = NULL; \
-} while (0)
-
  /*
   * init_rcu_head_on_stack()/destroy_rcu_head_on_stack() are needed for dynamic
   * initialization and destruction of rcu_head on the stack. rcu_head structures
@@ -120,14 +189,15 @@ extern struct lockdep_map rcu_sched_lock_map;
  extern int debug_lockdep_rcu_enabled(void);
  
  /**
- * rcu_read_lock_held - might we be in RCU read-side critical section?
+ * rcu_read_lock_held() - might we be in RCU read-side critical section?
   *
   * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an RCU
   * read-side critical section.  In absence of CONFIG_DEBUG_LOCK_ALLOC,
   * this assumes we are in an RCU read-side critical section unless it can
- * prove otherwise.
+ * prove otherwise.  This is useful for debug checks in functions that
+ * require that they be called within an RCU read-side critical section.
   *
- * Check debug_lockdep_rcu_enabled() to prevent false positives during boot
+ * Checks debug_lockdep_rcu_enabled() to prevent false positives during boot
   * and while lockdep is disabled.
   */
  static inline int rcu_read_lock_held(void)
@@ -144,14 +214,16 @@ static inline int rcu_read_lock_held(void)
  extern int rcu_read_lock_bh_held(void);
  
  /**
- * rcu_read_lock_sched_held - might we be in RCU-sched read-side critical section?
+ * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section?
   *
   * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an
   * RCU-sched read-side critical section.  In absence of
   * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side
   * critical section unless it can prove otherwise.  Note that disabling
   * of preemption (including disabling irqs) counts as an RCU-sched
- * read-side critical section.
+ * read-side critical section.  This is useful for debug checks in functions
+ * that require that they be called within an RCU-sched read-side
+ * critical section.
   *
   * Check debug_lockdep_rcu_enabled() to prevent false positives during boot
   * and while lockdep is disabled.
@@ -211,7 +283,11 @@ static inline int rcu_read_lock_sched_held(void)
  
  extern int rcu_my_thread_group_empty(void);
  
-#define __do_rcu_dereference_check(c)                                  \
+/**
+ * rcu_lockdep_assert - emit lockdep splat if specified condition not met
+ * @c: condition to check
+ */
+#define rcu_lockdep_assert(c)                                          \
         do {                                                            \
                 static bool __warned;                                   \
                 if (debug_lockdep_rcu_enabled() && !__warned && !(c)) { \
@@ -220,41 +296,163 @@ extern int rcu_my_thread_group_empty(void);
                 }                                                       \
         } while (0)
  
+#else /* #ifdef CONFIG_PROVE_RCU */
+
+#define rcu_lockdep_assert(c) do { } while (0)
+
+#endif /* #else #ifdef CONFIG_PROVE_RCU */
+
+/*
+ * Helper functions for rcu_dereference_check(), rcu_dereference_protected()
+ * and rcu_assign_pointer().  Some of these could be folded into their
+ * callers, but they are left separate in order to ease introduction of
+ * multiple flavors of pointers to match the multiple flavors of RCU
+ * (e.g., __rcu_bh, __rcu_sched, and __srcu), should this make sense in
+ * the future.
+ */
+
+#ifdef __CHECKER__
+#define rcu_dereference_sparse(p, space) \
+       ((void)(((typeof(*p) space *)p) == p))
+#else /* #ifdef __CHECKER__ */
+#define rcu_dereference_sparse(p, space)
+#endif /* #else #ifdef __CHECKER__ */
+
+#define __rcu_access_pointer(p, space) \
+       ({ \
+               typeof(*p) *_________p1 = (typeof(*p)*__force )ACCESS_ONCE(p); \
+               rcu_dereference_sparse(p, space); \
+               ((typeof(*p) __force __kernel *)(_________p1)); \
+       })
+#define __rcu_dereference_check(p, c, space) \
+       ({ \
+               typeof(*p) *_________p1 = (typeof(*p)*__force )ACCESS_ONCE(p); \
+               rcu_lockdep_assert(c); \
+               rcu_dereference_sparse(p, space); \
+               smp_read_barrier_depends(); \
+               ((typeof(*p) __force __kernel *)(_________p1)); \
+       })
+#define __rcu_dereference_protected(p, c, space) \
+       ({ \
+               rcu_lockdep_assert(c); \
+               rcu_dereference_sparse(p, space); \
+               ((typeof(*p) __force __kernel *)(p)); \
+       })
+
+#define __rcu_dereference_index_check(p, c) \
+       ({ \
+               typeof(p) _________p1 = ACCESS_ONCE(p); \
+               rcu_lockdep_assert(c); \
+               smp_read_barrier_depends(); \
+               (_________p1); \
+       })
+#define __rcu_assign_pointer(p, v, space) \
+       ({ \
+               if (!__builtin_constant_p(v) || \
+                   ((v) != NULL)) \
+                       smp_wmb(); \
+               (p) = (typeof(*v) __force space *)(v); \
+       })
+
+
+/**
+ * rcu_access_pointer() - fetch RCU pointer with no dereferencing
+ * @p: The pointer to read
+ *
+ * Return the value of the specified RCU-protected pointer, but omit the
+ * smp_read_barrier_depends() and keep the ACCESS_ONCE().  This is useful
+ * when the value of this pointer is accessed, but the pointer is not
+ * dereferenced, for example, when testing an RCU-protected pointer against
+ * NULL.  Although rcu_access_pointer() may also be used in cases where
+ * update-side locks prevent the value of the pointer from changing, you
+ * should instead use rcu_dereference_protected() for this use case.
+ */
+#define rcu_access_pointer(p) __rcu_access_pointer((p), __rcu)
+
  /**
- * rcu_dereference_check - rcu_dereference with debug checking
+ * rcu_dereference_check() - rcu_dereference with debug checking
   * @p: The pointer to read, prior to dereferencing
   * @c: The conditions under which the dereference will take place
   *
   * Do an rcu_dereference(), but check that the conditions under which the
- * dereference will take place are correct.  Typically the conditions indicate
- * the various locking conditions that should be held at that point.  The check
- * should return true if the conditions are satisfied.
+ * dereference will take place are correct.  Typically the conditions
+ * indicate the various locking conditions that should be held at that
+ * point.  The check should return true if the conditions are satisfied.
+ * An implicit check for being in an RCU read-side critical section
+ * (rcu_read_lock()) is included.
   *
   * For example:
   *
- *     bar = rcu_dereference_check(foo->bar, rcu_read_lock_held() ||
- *                                           lockdep_is_held(&foo->lock));
+ *     bar = rcu_dereference_check(foo->bar, lockdep_is_held(&foo->lock));
   *
   * could be used to indicate to lockdep that foo->bar may only be dereferenced
- * if either the RCU read lock is held, or that the lock required to replace
+ * if either rcu_read_lock() is held, or the lock required to replace
   * the bar struct at foo->bar is held.
   *
   * Note that the list of conditions may also include indications of when a lock
   * need not be held, for example during initialisation or destruction of the
   * target struct:
   *
- *     bar = rcu_dereference_check(foo->bar, rcu_read_lock_held() ||
- *                                           lockdep_is_held(&foo->lock) ||
+ *     bar = rcu_dereference_check(foo->bar, lockdep_is_held(&foo->lock) ||
   *                                           atomic_read(&foo->usage) == 0);
+ *
+ * Inserts memory barriers on architectures that require them
+ * (currently only the Alpha), prevents the compiler from refetching
+ * (and from merging fetches), and, more importantly, documents exactly
+ * which pointers are protected by RCU and checks that the pointer is
+ * annotated as __rcu.
   */
  #define rcu_dereference_check(p, c) \
-       ({ \
-               __do_rcu_dereference_check(c); \
-               rcu_dereference_raw(p); \
-       })
+       __rcu_dereference_check((p), rcu_read_lock_held() || (c), __rcu)
+
+/**
+ * rcu_dereference_bh_check() - rcu_dereference_bh with debug checking
+ * @p: The pointer to read, prior to dereferencing
+ * @c: The conditions under which the dereference will take place
+ *
+ * This is the RCU-bh counterpart to rcu_dereference_check().
+ */
+#define rcu_dereference_bh_check(p, c) \
+       __rcu_dereference_check((p), rcu_read_lock_bh_held() || (c), __rcu)
  
  /**
- * rcu_dereference_protected - fetch RCU pointer when updates prevented
+ * rcu_dereference_sched_check() - rcu_dereference_sched with debug checking
+ * @p: The pointer to read, prior to dereferencing
+ * @c: The conditions under which the dereference will take place
+ *
+ * This is the RCU-sched counterpart to rcu_dereference_check().
+ */
+#define rcu_dereference_sched_check(p, c) \
+       __rcu_dereference_check((p), rcu_read_lock_sched_held() || (c), \
+                               __rcu)
+
+#define rcu_dereference_raw(p) rcu_dereference_check(p, 1) /*@@@ needed? @@@*/
+
+/**
+ * rcu_dereference_index_check() - rcu_dereference for indices with debug checking
+ * @p: The pointer to read, prior to dereferencing
+ * @c: The conditions under which the dereference will take place
+ *
+ * Similar to rcu_dereference_check(), but omits the sparse checking.
+ * This allows rcu_dereference_index_check() to be used on integers,
+ * which can then be used as array indices.  Attempting to use
+ * rcu_dereference_check() on an integer will give compiler warnings
+ * because the sparse address-space mechanism relies on dereferencing
+ * the RCU-protected pointer.  Dereferencing integers is not something
+ * that even gcc will put up with.
+ *
+ * Note that this function does not implicitly check for RCU read-side
+ * critical sections.  If this function gains lots of uses, it might
+ * make sense to provide versions for each flavor of RCU, but it does
+ * not make sense as of early 2010.
+ */
+#define rcu_dereference_index_check(p, c) \
+       __rcu_dereference_index_check((p), (c))
+
+/**
+ * rcu_dereference_protected() - fetch RCU pointer when updates prevented
+ * @p: The pointer to read, prior to dereferencing
+ * @c: The conditions under which the dereference will take place
   *
   * Return the value of the specified RCU-protected pointer, but omit
   * both the smp_read_barrier_depends() and the ACCESS_ONCE().  This
@@ -263,35 +461,61 @@ extern int rcu_my_thread_group_empty(void);
   * prevent the compiler from repeating this reference or combining it
   * with other references, so it should not be used without protection
   * of appropriate locks.
+ *
+ * This function is only for update-side use.  Using this function
+ * when protected only by rcu_read_lock() will result in infrequent
+ * but very ugly failures.
   */
  #define rcu_dereference_protected(p, c) \
-       ({ \
-               __do_rcu_dereference_check(c); \
-               (p); \
-       })
+       __rcu_dereference_protected((p), (c), __rcu)
  
-#else /* #ifdef CONFIG_PROVE_RCU */
+/**
+ * rcu_dereference_bh_protected() - fetch RCU-bh pointer when updates prevented
+ * @p: The pointer to read, prior to dereferencing
+ * @c: The conditions under which the dereference will take place
+ *
+ * This is the RCU-bh counterpart to rcu_dereference_protected().
+ */
+#define rcu_dereference_bh_protected(p, c) \
+       __rcu_dereference_protected((p), (c), __rcu)
  
-#define rcu_dereference_check(p, c)    rcu_dereference_raw(p)
-#define rcu_dereference_protected(p, c) (p)
+/**
+ * rcu_dereference_sched_protected() - fetch RCU-sched pointer when updates prevented
+ * @p: The pointer to read, prior to dereferencing
+ * @c: The conditions under which the dereference will take place
+ *
+ * This is the RCU-sched counterpart to rcu_dereference_protected().
+ */
+#define rcu_dereference_sched_protected(p, c) \
+       __rcu_dereference_protected((p), (c), __rcu)
  
-#endif /* #else #ifdef CONFIG_PROVE_RCU */
  
  /**
- * rcu_access_pointer - fetch RCU pointer with no dereferencing
+ * rcu_dereference() - fetch RCU-protected pointer for dereferencing
+ * @p: The pointer to read, prior to dereferencing
   *
- * Return the value of the specified RCU-protected pointer, but omit the
- * smp_read_barrier_depends() and keep the ACCESS_ONCE().  This is useful
- * when the value of this pointer is accessed, but the pointer is not
- * dereferenced, for example, when testing an RCU-protected pointer against
- * NULL.  This may also be used in cases where update-side locks prevent
- * the value of the pointer from changing, but rcu_dereference_protected()
- * is a lighter-weight primitive for this use case.
+ * This is a simple wrapper around rcu_dereference_check().
+ */
+#define rcu_dereference(p) rcu_dereference_check(p, 0)
+
+/**
+ * rcu_dereference_bh() - fetch an RCU-bh-protected pointer for dereferencing
+ * @p: The pointer to read, prior to dereferencing
+ *
+ * Makes rcu_dereference_bh_check() do the dirty work.
+ */
+#define rcu_dereference_bh(p) rcu_dereference_bh_check(p, 0)
+
+/**
+ * rcu_dereference_sched() - fetch RCU-sched-protected pointer for dereferencing
+ * @p: The pointer to read, prior to dereferencing
+ *
+ * Makes rcu_dereference_sched_check() do the dirty work.
   */
-#define rcu_access_pointer(p)  ACCESS_ONCE(p)
+#define rcu_dereference_sched(p) rcu_dereference_sched_check(p, 0)
  
  /**
- * rcu_read_lock - mark the beginning of an RCU read-side critical section.
+ * rcu_read_lock() - mark the beginning of an RCU read-side critical section
   *
   * When synchronize_rcu() is invoked on one CPU while other CPUs
   * are within RCU read-side critical sections, then the
@@ -302,7 +526,7 @@ extern int rcu_my_thread_group_empty(void);
   * until after the all the other CPUs exit their critical sections.
   *
   * Note, however, that RCU callbacks are permitted to run concurrently
- * with RCU read-side critical sections.  One way that this can happen
+ * with new RCU read-side critical sections.  One way that this can happen
   * is via the following sequence of events: (1) CPU 0 enters an RCU
   * read-side critical section, (2) CPU 1 invokes call_rcu() to register
   * an RCU callback, (3) CPU 0 exits the RCU read-side critical section,
@@ -317,7 +541,20 @@ extern int rcu_my_thread_group_empty(void);
   * will be deferred until the outermost RCU read-side critical section
   * completes.
   *
- * It is illegal to block while in an RCU read-side critical section.
+ * You can avoid reading and understanding the next paragraph by
+ * following this rule: don't put anything in an rcu_read_lock() RCU
+ * read-side critical section that would block in a !PREEMPT kernel.
+ * But if you want the full story, read on!
+ *
+ * In non-preemptible RCU implementations (TREE_RCU and TINY_RCU), it
+ * is illegal to block while in an RCU read-side critical section.  In
+ * preemptible RCU implementations (TREE_PREEMPT_RCU and TINY_PREEMPT_RCU)
+ * in CONFIG_PREEMPT kernel builds, RCU read-side critical sections may
+ * be preempted, but explicit blocking is illegal.  Finally, in preemptible
+ * RCU implementations in real-time (CONFIG_PREEMPT_RT) kernel builds,
+ * RCU read-side critical sections may be preempted and they may also
+ * block, but only when acquiring spinlocks that are subject to priority
+ * inheritance.
   */
  static inline void rcu_read_lock(void)
  {
@@ -337,7 +574,7 @@ static inline void rcu_read_lock(void)
   */
  
  /**
- * rcu_read_unlock - marks the end of an RCU read-side critical section.
+ * rcu_read_unlock() - marks the end of an RCU read-side critical section.
   *
   * See rcu_read_lock() for more information.
   */
@@ -349,15 +586,16 @@ static inline void rcu_read_unlock(void)
  }
  
  /**
- * rcu_read_lock_bh - mark the beginning of a softirq-only RCU critical section
+ * rcu_read_lock_bh() - mark the beginning of an RCU-bh critical section
   *
   * This is equivalent of rcu_read_lock(), but to be used when updates
- * are being done using call_rcu_bh(). Since call_rcu_bh() callbacks
- * consider completion of a softirq handler to be a quiescent state,
- * a process in RCU read-side critical section must be protected by
- * disabling softirqs. Read-side critical sections in interrupt context
- * can use just rcu_read_lock().
- *
+ * are being done using call_rcu_bh() or synchronize_rcu_bh(). Since
+ * both call_rcu_bh() and synchronize_rcu_bh() consider completion of a
+ * softirq handler to be a quiescent state, a process in RCU read-side
+ * critical section must be protected by disabling softirqs. Read-side
+ * critical sections in interrupt context can use just rcu_read_lock(),
+ * though this should at least be commented to avoid confusing people
+ * reading the code.
   */
  static inline void rcu_read_lock_bh(void)
  {
@@ -379,13 +617,12 @@ static inline void rcu_read_unlock_bh(void)
  }
  
  /**
- * rcu_read_lock_sched - mark the beginning of a RCU-classic critical section
+ * rcu_read_lock_sched() - mark the beginning of a RCU-sched critical section
   *
- * Should be used with either
- * - synchronize_sched()
- * or
- * - call_rcu_sched() and rcu_barrier_sched()
- * on the write-side to insure proper synchronization.
+ * This is the equivalent of rcu_read_lock(), but to be used when updates
+ * are being done using call_rcu_sched() or synchronize_sched().
+ * Read-side critical sections can also be introduced by anything that
+ * disables preemption, including local_irq_disable() and friends.
   */
  static inline void rcu_read_lock_sched(void)
  {
@@ -420,54 +657,14 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
         preempt_enable_notrace();
  }
  
-
  /**
- * rcu_dereference_raw - fetch an RCU-protected pointer
+ * rcu_assign_pointer() - assign to RCU-protected pointer
+ * @p: pointer to assign to
+ * @v: value to assign (publish)
   *
- * The caller must be within some flavor of RCU read-side critical
- * section, or must be otherwise preventing the pointer from changing,
- * for example, by holding an appropriate lock.  This pointer may later
- * be safely dereferenced.  It is the caller's responsibility to have
- * done the right thing, as this primitive does no checking of any kind.
- *
- * Inserts memory barriers on architectures that require them
- * (currently only the Alpha), and, more importantly, documents
- * exactly which pointers are protected by RCU.
- */
-#define rcu_dereference_raw(p) ({ \
-                               typeof(p) _________p1 = ACCESS_ONCE(p); \
-                               smp_read_barrier_depends(); \
-                               (_________p1); \
-                               })
-
-/**
- * rcu_dereference - fetch an RCU-protected pointer, checking for RCU
- *
- * Makes rcu_dereference_check() do the dirty work.
- */
-#define rcu_dereference(p) \
-       rcu_dereference_check(p, rcu_read_lock_held())
-
-/**
- * rcu_dereference_bh - fetch an RCU-protected pointer, checking for RCU-bh
- *
- * Makes rcu_dereference_check() do the dirty work.
- */
-#define rcu_dereference_bh(p) \
-               rcu_dereference_check(p, rcu_read_lock_bh_held() || irqs_disabled())
-
-/**
- * rcu_dereference_sched - fetch RCU-protected pointer, checking for RCU-sched
- *
- * Makes rcu_dereference_check() do the dirty work.
- */
-#define rcu_dereference_sched(p) \
-               rcu_dereference_check(p, rcu_read_lock_sched_held())
-
-/**
- * rcu_assign_pointer - assign (publicize) a pointer to a newly
- * initialized structure that will be dereferenced by RCU read-side
- * critical sections.  Returns the value assigned.
+ * Assigns the specified value to the specified RCU-protected
+ * pointer, ensuring that any concurrent RCU readers will see
+ * any prior initialization.  Returns the value assigned.
   *
   * Inserts memory barriers on architectures that require them
   * (pretty much all of them other than x86), and also prevents
@@ -476,14 +673,17 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
   * call documents which pointers will be dereferenced by RCU read-side
   * code.
   */
-
  #define rcu_assign_pointer(p, v) \
-       ({ \
-               if (!__builtin_constant_p(v) || \
-                   ((v) != NULL)) \
-                       smp_wmb(); \
-               (p) = (v); \
-       })
+       __rcu_assign_pointer((p), (v), __rcu)
+
+/**
+ * RCU_INIT_POINTER() - initialize an RCU protected pointer
+ *
+ * Initialize an RCU-protected pointer in such a way to avoid RCU-lockdep
+ * splats.
+ */
+#define RCU_INIT_POINTER(p, v) \
+               p = (typeof(*v) __force __rcu *)(v)
  
  /* Infrastructure to implement the synchronize_() primitives. */
  
@@ -494,26 +694,37 @@ struct rcu_synchronize {
  
  extern void wakeme_after_rcu(struct rcu_head  *head);
  
+#ifdef CONFIG_PREEMPT_RCU
+
  /**
- * call_rcu - Queue an RCU callback for invocation after a grace period.
+ * call_rcu() - Queue an RCU callback for invocation after a grace period.
   * @head: structure to be used for queueing the RCU updates.
- * @func: actual update function to be invoked after the grace period
+ * @func: actual callback function to be invoked after the grace period
   *
- * The update function will be invoked some time after a full grace
- * period elapses, in other words after all currently executing RCU
- * read-side critical sections have completed.  RCU read-side critical
+ * The callback function will be invoked some time after a full grace
+ * period elapses, in other words after all pre-existing RCU read-side
+ * critical sections have completed.  However, the callback function
+ * might well execute concurrently with RCU read-side critical sections
+ * that started after call_rcu() was invoked.  RCU read-side critical
   * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
   * and may be nested.
   */
  extern void call_rcu(struct rcu_head *head,
                               void (*func)(struct rcu_head *head));
  
+#else /* #ifdef CONFIG_PREEMPT_RCU */
+
+/* In classic RCU, call_rcu() is just call_rcu_sched(). */
+#define        call_rcu        call_rcu_sched
+
+#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
+
  /**
- * call_rcu_bh - Queue an RCU for invocation after a quicker grace period.
+ * call_rcu_bh() - Queue an RCU callback for invocation after a quicker grace period.
   * @head: structure to be used for queueing the RCU updates.
- * @func: actual update function to be invoked after the grace period
+ * @func: actual callback function to be invoked after the grace period
   *
- * The update function will be invoked some time after a full grace
+ * The callback function will be invoked some time after a full grace
   * period elapses, in other words after all currently executing RCU
   * read-side critical sections have completed. call_rcu_bh() assumes
   * that the read-side critical sections end on completion of a softirq
@@ -566,37 +777,4 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)
  }
  #endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
  
-#ifndef CONFIG_PROVE_RCU
-#define __do_rcu_dereference_check(c) do { } while (0)
-#endif /* #ifdef CONFIG_PROVE_RCU */
-
-#define __rcu_dereference_index_check(p, c) \
-       ({ \
-               typeof(p) _________p1 = ACCESS_ONCE(p); \
-               __do_rcu_dereference_check(c); \
-               smp_read_barrier_depends(); \
-               (_________p1); \
-       })
-
-/**
- * rcu_dereference_index_check() - rcu_dereference for indices with debug checking
- * @p: The pointer to read, prior to dereferencing
- * @c: The conditions under which the dereference will take place
- *
- * Similar to rcu_dereference_check(), but omits the sparse checking.
- * This allows rcu_dereference_index_check() to be used on integers,
- * which can then be used as array indices.  Attempting to use
- * rcu_dereference_check() on an integer will give compiler warnings
- * because the sparse address-space mechanism relies on dereferencing
- * the RCU-protected pointer.  Dereferencing integers is not something
- * that even gcc will put up with.
- *
- * Note that this function does not implicitly check for RCU read-side
- * critical sections.  If this function gains lots of uses, it might
- * make sense to provide versions for each flavor of RCU, but it does
- * not make sense as of early 2010.
- */
-#define rcu_dereference_index_check(p, c) \
-       __rcu_dereference_index_check((p), (c))
-
  #endif /* __LINUX_RCUPDATE_H */
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h

index e2e8931..13877cb 100644 (file)
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -27,103 +27,101 @@
  
  #include <linux/cache.h>
  
-void rcu_sched_qs(int cpu);
-void rcu_bh_qs(int cpu);
-static inline void rcu_note_context_switch(int cpu)
-{
-       rcu_sched_qs(cpu);
-}
+#define rcu_init_sched()       do { } while (0)
  
-#define __rcu_read_lock()      preempt_disable()
-#define __rcu_read_unlock()    preempt_enable()
-#define __rcu_read_lock_bh()   local_bh_disable()
-#define __rcu_read_unlock_bh() local_bh_enable()
-#define call_rcu_sched         call_rcu
+#ifdef CONFIG_TINY_RCU
  
-#define rcu_init_sched()       do { } while (0)
-extern void rcu_check_callbacks(int cpu, int user);
+static inline void synchronize_rcu_expedited(void)
+{
+       synchronize_sched();    /* Only one CPU, so pretty fast anyway!!! */
+}
  
-static inline int rcu_needs_cpu(int cpu)
+static inline void rcu_barrier(void)
  {
-       return 0;
+       rcu_barrier_sched();  /* Only one CPU, so only one list of callbacks! */
  }
  
-/*
- * Return the number of grace periods.
- */
-static inline long rcu_batches_completed(void)
+#else /* #ifdef CONFIG_TINY_RCU */
+
+void rcu_barrier(void);
+void synchronize_rcu_expedited(void);
+
+#endif /* #else #ifdef CONFIG_TINY_RCU */
+
+static inline void synchronize_rcu_bh(void)
  {
-       return 0;
+       synchronize_sched();
  }
  
-/*
- * Return the number of bottom-half grace periods.
- */
-static inline long rcu_batches_completed_bh(void)
+static inline void synchronize_rcu_bh_expedited(void)
  {
-       return 0;
+       synchronize_sched();
  }
  
-static inline void rcu_force_quiescent_state(void)
+#ifdef CONFIG_TINY_RCU
+
+static inline void rcu_preempt_note_context_switch(void)
  {
  }
  
-static inline void rcu_bh_force_quiescent_state(void)
+static inline void exit_rcu(void)
  {
  }
  
-static inline void rcu_sched_force_quiescent_state(void)
+static inline int rcu_needs_cpu(int cpu)
  {
+       return 0;
  }
  
-extern void synchronize_sched(void);
+#else /* #ifdef CONFIG_TINY_RCU */
+
+void rcu_preempt_note_context_switch(void);
+extern void exit_rcu(void);
+int rcu_preempt_needs_cpu(void);
  
-static inline void synchronize_rcu(void)
+static inline int rcu_needs_cpu(int cpu)
  {
-       synchronize_sched();
+       return rcu_preempt_needs_cpu();
  }
  
-static inline void synchronize_rcu_bh(void)
+#endif /* #else #ifdef CONFIG_TINY_RCU */
+
+static inline void rcu_note_context_switch(int cpu)
  {
-       synchronize_sched();
+       rcu_sched_qs(cpu);
+       rcu_preempt_note_context_switch();
  }
  
-static inline void synchronize_rcu_expedited(void)
+/*
+ * Return the number of grace periods.
+ */
+static inline long rcu_batches_completed(void)
  {
-       synchronize_sched();
+       return 0;
  }
  
-static inline void synchronize_rcu_bh_expedited(void)
+/*
+ * Return the number of bottom-half grace periods.
+ */
+static inline long rcu_batches_completed_bh(void)
  {
-       synchronize_sched();
+       return 0;
  }
  
-struct notifier_block;
-
-#ifdef CONFIG_NO_HZ
-
-extern void rcu_enter_nohz(void);
-extern void rcu_exit_nohz(void);
-
-#else /* #ifdef CONFIG_NO_HZ */
-
-static inline void rcu_enter_nohz(void)
+static inline void rcu_force_quiescent_state(void)
  {
  }
  
-static inline void rcu_exit_nohz(void)
+static inline void rcu_bh_force_quiescent_state(void)
  {
  }
  
-#endif /* #else #ifdef CONFIG_NO_HZ */
-
-static inline void exit_rcu(void)
+static inline void rcu_sched_force_quiescent_state(void)
  {
  }
  
-static inline int rcu_preempt_depth(void)
+static inline void rcu_cpu_stall_reset(void)
  {
-       return 0;
  }
  
  #ifdef CONFIG_DEBUG_LOCK_ALLOC
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h

index c0ed1c0..95518e6 100644 (file)
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -30,64 +30,23 @@
  #ifndef __LINUX_RCUTREE_H
  #define __LINUX_RCUTREE_H
  
-struct notifier_block;
-
-extern void rcu_sched_qs(int cpu);
-extern void rcu_bh_qs(int cpu);
  extern void rcu_note_context_switch(int cpu);
  extern int rcu_needs_cpu(int cpu);
+extern void rcu_cpu_stall_reset(void);
  
  #ifdef CONFIG_TREE_PREEMPT_RCU
  
-extern void __rcu_read_lock(void);
-extern void __rcu_read_unlock(void);
-extern void synchronize_rcu(void);
  extern void exit_rcu(void);
  
-/*
- * Defined as macro as it is a very low level header
- * included from areas that don't even know about current
- */
-#define rcu_preempt_depth() (current->rcu_read_lock_nesting)
-
  #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
  
-static inline void __rcu_read_lock(void)
-{
-       preempt_disable();
-}
-
-static inline void __rcu_read_unlock(void)
-{
-       preempt_enable();
-}
-
-#define synchronize_rcu synchronize_sched
-
  static inline void exit_rcu(void)
  {
  }
  
-static inline int rcu_preempt_depth(void)
-{
-       return 0;
-}
-
  #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
  
-static inline void __rcu_read_lock_bh(void)
-{
-       local_bh_disable();
-}
-static inline void __rcu_read_unlock_bh(void)
-{
-       local_bh_enable();
-}
-
-extern void call_rcu_sched(struct rcu_head *head,
-                          void (*func)(struct rcu_head *rcu));
  extern void synchronize_rcu_bh(void);
-extern void synchronize_sched(void);
  extern void synchronize_rcu_expedited(void);
  
  static inline void synchronize_rcu_bh_expedited(void)
@@ -95,7 +54,7 @@ static inline void synchronize_rcu_bh_expedited(void)
         synchronize_sched_expedited();
  }
  
-extern void rcu_check_callbacks(int cpu, int user);
+extern void rcu_barrier(void);
  
  extern long rcu_batches_completed(void);
  extern long rcu_batches_completed_bh(void);
@@ -104,18 +63,6 @@ extern void rcu_force_quiescent_state(void);
  extern void rcu_bh_force_quiescent_state(void);
  extern void rcu_sched_force_quiescent_state(void);
  
-#ifdef CONFIG_NO_HZ
-void rcu_enter_nohz(void);
-void rcu_exit_nohz(void);
-#else /* CONFIG_NO_HZ */
-static inline void rcu_enter_nohz(void)
-{
-}
-static inline void rcu_exit_nohz(void)
-{
-}
-#endif /* CONFIG_NO_HZ */
-
  /* A context switch is a grace period for RCU-sched and RCU-bh. */
  static inline int rcu_blocking_is_gp(void)
  {
diff --git a/include/linux/sched.h b/include/linux/sched.h

index eb3c1ce..0383601 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -875,6 +875,7 @@ enum sched_domain_level {
         SD_LV_NONE = 0,
         SD_LV_SIBLING,
         SD_LV_MC,
+       SD_LV_BOOK,
         SD_LV_CPU,
         SD_LV_NODE,
         SD_LV_ALLNODES,
@@ -1209,11 +1210,13 @@ struct task_struct {
         unsigned int policy;
         cpumask_t cpus_allowed;
  
-#ifdef CONFIG_TREE_PREEMPT_RCU
+#ifdef CONFIG_PREEMPT_RCU
         int rcu_read_lock_nesting;
         char rcu_read_unlock_special;
-       struct rcu_node *rcu_blocked_node;
         struct list_head rcu_node_entry;
+#endif /* #ifdef CONFIG_PREEMPT_RCU */
+#ifdef CONFIG_TREE_PREEMPT_RCU
+       struct rcu_node *rcu_blocked_node;
  #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
  
  #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
@@ -1295,9 +1298,9 @@ struct task_struct {
         struct list_head cpu_timers[3];
  
  /* process credentials */
-       const struct cred *real_cred;   /* objective and real subjective task
+       const struct cred __rcu *real_cred; /* objective and real subjective task
                                          * credentials (COW) */
-       const struct cred *cred;        /* effective (overridable) subjective task
+       const struct cred __rcu *cred;  /* effective (overridable) subjective task
                                          * credentials (COW) */
         struct mutex cred_guard_mutex;  /* guard against foreign influences on
                                          * credential calculations
@@ -1425,7 +1428,7 @@ struct task_struct {
  #endif
  #ifdef CONFIG_CGROUPS
         /* Control Group info protected by css_set_lock */
-       struct css_set *cgroups;
+       struct css_set __rcu *cgroups;
         /* cg_list protected by css_set_lock and tsk->alloc_lock */
         struct list_head cg_list;
  #endif
@@ -1688,8 +1691,7 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
  /*
   * Per process flags
   */
-#define PF_ALIGNWARN   0x00000001      /* Print alignment warning msgs */
-                                       /* Not implemented yet, only for 486*/
+#define PF_KSOFTIRQD   0x00000001      /* I am ksoftirqd */
  #define PF_STARTING    0x00000002      /* being created */
  #define PF_EXITING     0x00000004      /* getting shut down */
  #define PF_EXITPIDONE  0x00000008      /* pi exit done on shut down */
@@ -1747,7 +1749,7 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
  #define tsk_used_math(p) ((p)->flags & PF_USED_MATH)
  #define used_math() tsk_used_math(current)
  
-#ifdef CONFIG_TREE_PREEMPT_RCU
+#ifdef CONFIG_PREEMPT_RCU
  
  #define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */
  #define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */
@@ -1756,7 +1758,9 @@ static inline void rcu_copy_process(struct task_struct *p)
  {
         p->rcu_read_lock_nesting = 0;
         p->rcu_read_unlock_special = 0;
+#ifdef CONFIG_TREE_PREEMPT_RCU
         p->rcu_blocked_node = NULL;
+#endif
         INIT_LIST_HEAD(&p->rcu_node_entry);
  }
  
@@ -1833,6 +1837,19 @@ extern void sched_clock_idle_sleep_event(void);
  extern void sched_clock_idle_wakeup_event(u64 delta_ns);
  #endif
  
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+/*
+ * An interface for opting in, at runtime, to irq time accounting based on
+ * sched_clock.  The opt-in is explicit so that systems with a slow
+ * sched_clock do not pay a performance penalty.
+ */
+extern void enable_sched_clock_irqtime(void);
+extern void disable_sched_clock_irqtime(void);
+#else
+static inline void enable_sched_clock_irqtime(void) {}
+static inline void disable_sched_clock_irqtime(void) {}
+#endif
+
  extern unsigned long long
  task_sched_runtime(struct task_struct *task);
  extern unsigned long long thread_group_sched_runtime(struct task_struct *task);
@@ -2374,9 +2391,9 @@ extern int __cond_resched_lock(spinlock_t *lock);
  
  extern int __cond_resched_softirq(void);
  
-#define cond_resched_softirq() ({                              \
-       __might_sleep(__FILE__, __LINE__, SOFTIRQ_OFFSET);      \
-       __cond_resched_softirq();                               \
+#define cond_resched_softirq() ({                                      \
+       __might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET);      \
+       __cond_resched_softirq();                                       \
  })
  
  /*
diff --git a/include/linux/security.h b/include/linux/security.h

index a22219a..b8246a8 100644 (file)
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -74,7 +74,7 @@ extern int cap_file_mmap(struct file *file, unsigned long reqprot,
  extern int cap_task_fix_setuid(struct cred *new, const struct cred *old, int flags);
  extern int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
                           unsigned long arg4, unsigned long arg5);
-extern int cap_task_setscheduler(struct task_struct *p, int policy, struct sched_param *lp);
+extern int cap_task_setscheduler(struct task_struct *p);
  extern int cap_task_setioprio(struct task_struct *p, int ioprio);
  extern int cap_task_setnice(struct task_struct *p, int nice);
  extern int cap_syslog(int type, bool from_file);
@@ -959,6 +959,12 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
   *     Sets the new child socket's sid to the openreq sid.
   * @inet_conn_established:
   *     Sets the connection's peersid to the secmark on skb.
+ * @secmark_relabel_packet:
+ *     check if the process should be allowed to relabel packets to the given secid
+ * @secmark_refcount_inc:
+ *     tells the LSM to increment the number of secmark labeling rules loaded
+ * @secmark_refcount_dec:
+ *     tells the LSM to decrement the number of secmark labeling rules loaded
   * @req_classify_flow:
   *     Sets the flow's sid to the openreq sid.
   * @tun_dev_create:
@@ -1279,9 +1285,13 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
   *     Return 0 if permission is granted.
   *
   * @secid_to_secctx:
- *     Convert secid to security context.
+ *     Convert secid to security context.  If secdata is NULL the length of
+ *     the result will be returned in seclen, but no secdata will be returned.
+ *     This does mean that the length could change between calls to check the
+ *     length and the next call which actually allocates and returns the secdata.
   *     @secid contains the security ID.
   *     @secdata contains the pointer that stores the converted security context.
+ *     @seclen pointer which contains the length of the data
   * @secctx_to_secid:
   *     Convert security context to secid.
   *     @secid contains the pointer to the generated security ID.
@@ -1501,8 +1511,7 @@ struct security_operations {
         int (*task_getioprio) (struct task_struct *p);
         int (*task_setrlimit) (struct task_struct *p, unsigned int resource,
                         struct rlimit *new_rlim);
-       int (*task_setscheduler) (struct task_struct *p, int policy,
-                                 struct sched_param *lp);
+       int (*task_setscheduler) (struct task_struct *p);
         int (*task_getscheduler) (struct task_struct *p);
         int (*task_movememory) (struct task_struct *p);
         int (*task_kill) (struct task_struct *p,
@@ -1594,6 +1603,9 @@ struct security_operations {
                                   struct request_sock *req);
         void (*inet_csk_clone) (struct sock *newsk, const struct request_sock *req);
         void (*inet_conn_established) (struct sock *sk, struct sk_buff *skb);
+       int (*secmark_relabel_packet) (u32 secid);
+       void (*secmark_refcount_inc) (void);
+       void (*secmark_refcount_dec) (void);
         void (*req_classify_flow) (const struct request_sock *req, struct flowi *fl);
         int (*tun_dev_create)(void);
         void (*tun_dev_post_create)(struct sock *sk);
@@ -1752,8 +1764,7 @@ int security_task_setioprio(struct task_struct *p, int ioprio);
  int security_task_getioprio(struct task_struct *p);
  int security_task_setrlimit(struct task_struct *p, unsigned int resource,
                 struct rlimit *new_rlim);
-int security_task_setscheduler(struct task_struct *p,
-                               int policy, struct sched_param *lp);
+int security_task_setscheduler(struct task_struct *p);
  int security_task_getscheduler(struct task_struct *p);
  int security_task_movememory(struct task_struct *p);
  int security_task_kill(struct task_struct *p, struct siginfo *info,
@@ -2320,11 +2331,9 @@ static inline int security_task_setrlimit(struct task_struct *p,
         return 0;
  }
  
-static inline int security_task_setscheduler(struct task_struct *p,
-                                            int policy,
-                                            struct sched_param *lp)
+static inline int security_task_setscheduler(struct task_struct *p)
  {
-       return cap_task_setscheduler(p, policy, lp);
+       return cap_task_setscheduler(p);
  }
  
  static inline int security_task_getscheduler(struct task_struct *p)
@@ -2551,6 +2560,9 @@ void security_inet_csk_clone(struct sock *newsk,
                         const struct request_sock *req);
  void security_inet_conn_established(struct sock *sk,
                         struct sk_buff *skb);
+int security_secmark_relabel_packet(u32 secid);
+void security_secmark_refcount_inc(void);
+void security_secmark_refcount_dec(void);
  int security_tun_dev_create(void);
  void security_tun_dev_post_create(struct sock *sk);
  int security_tun_dev_attach(struct sock *sk);
@@ -2705,6 +2717,19 @@ static inline void security_inet_conn_established(struct sock *sk,
  {
  }
  
+static inline int security_secmark_relabel_packet(u32 secid)
+{
+       return 0;
+}
+
+static inline void security_secmark_refcount_inc(void)
+{
+}
+
+static inline void security_secmark_refcount_dec(void)
+{
+}
+
  static inline int security_tun_dev_create(void)
  {
         return 0;
diff --git a/include/linux/selinux.h b/include/linux/selinux.h

index 82e0f26..44f4596 100644 (file)
--- a/include/linux/selinux.h
+++ b/include/linux/selinux.h
@@ -20,75 +20,12 @@ struct kern_ipc_perm;
  
  #ifdef CONFIG_SECURITY_SELINUX
  
-/**
- *     selinux_string_to_sid - map a security context string to a security ID
- *     @str: the security context string to be mapped
- *     @sid: ID value returned via this.
- *
- *     Returns 0 if successful, with the SID stored in sid.  A value
- *     of zero for sid indicates no SID could be determined (but no error
- *     occurred).
- */
-int selinux_string_to_sid(char *str, u32 *sid);
-
-/**
- *     selinux_secmark_relabel_packet_permission - secmark permission check
- *     @sid: SECMARK ID value to be applied to network packet
- *
- *     Returns 0 if the current task is allowed to set the SECMARK label of
- *     packets with the supplied security ID.  Note that it is implicit that
- *     the packet is always being relabeled from the default unlabeled value,
- *     and that the access control decision is made in the AVC.
- */
-int selinux_secmark_relabel_packet_permission(u32 sid);
-
-/**
- *     selinux_secmark_refcount_inc - increments the secmark use counter
- *
- *     SELinux keeps track of the current SECMARK targets in use so it knows
- *     when to apply SECMARK label access checks to network packets.  This
- *     function incements this reference count to indicate that a new SECMARK
- *     target has been configured.
- */
-void selinux_secmark_refcount_inc(void);
-
-/**
- *     selinux_secmark_refcount_dec - decrements the secmark use counter
- *
- *     SELinux keeps track of the current SECMARK targets in use so it knows
- *     when to apply SECMARK label access checks to network packets.  This
- *     function decements this reference count to indicate that one of the
- *     existing SECMARK targets has been removed/flushed.
- */
-void selinux_secmark_refcount_dec(void);
-
  /**
   * selinux_is_enabled - is SELinux enabled?
   */
  bool selinux_is_enabled(void);
  #else
  
-static inline int selinux_string_to_sid(const char *str, u32 *sid)
-{
-       *sid = 0;
-       return 0;
-}
-
-static inline int selinux_secmark_relabel_packet_permission(u32 sid)
-{
-       return 0;
-}
-
-static inline void selinux_secmark_refcount_inc(void)
-{
-       return;
-}
-
-static inline void selinux_secmark_refcount_dec(void)
-{
-       return;
-}
-
  static inline bool selinux_is_enabled(void)
  {
         return false;
diff --git a/include/linux/srcu.h b/include/linux/srcu.h

index 4d5d2f5..58971e8 100644 (file)
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -108,19 +108,43 @@ static inline int srcu_read_lock_held(struct srcu_struct *sp)
  #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
  
  /**
- * srcu_dereference - fetch SRCU-protected pointer with checking
+ * srcu_dereference_check - fetch SRCU-protected pointer for later dereferencing
+ * @p: the pointer to fetch and protect for later dereferencing
+ * @sp: pointer to the srcu_struct, which is used to check that we
+ *     really are in an SRCU read-side critical section.
+ * @c: condition to check for update-side use
   *
- * Makes rcu_dereference_check() do the dirty work.
+ * If PROVE_RCU is enabled, invoking this outside of an RCU read-side
+ * critical section will result in an RCU-lockdep splat, unless @c evaluates
+ * to 1.  The @c argument will normally be a logical expression containing
+ * lockdep_is_held() calls.
   */
-#define srcu_dereference(p, sp) \
-               rcu_dereference_check(p, srcu_read_lock_held(sp))
+#define srcu_dereference_check(p, sp, c) \
+       __rcu_dereference_check((p), srcu_read_lock_held(sp) || (c), __rcu)
+
+/**
+ * srcu_dereference - fetch SRCU-protected pointer for later dereferencing
+ * @p: the pointer to fetch and protect for later dereferencing
+ * @sp: pointer to the srcu_struct, which is used to check that we
+ *     really are in an SRCU read-side critical section.
+ *
+ * Makes rcu_dereference_check() do the dirty work.  If PROVE_RCU
+ * is enabled, invoking this outside of an RCU read-side critical
+ * section will result in an RCU-lockdep splat.
+ */
+#define srcu_dereference(p, sp) srcu_dereference_check((p), (sp), 0)
  
  /**
   * srcu_read_lock - register a new reader for an SRCU-protected structure.
   * @sp: srcu_struct in which to register the new reader.
   *
   * Enter an SRCU read-side critical section.  Note that SRCU read-side
- * critical sections may be nested.
+ * critical sections may be nested.  However, it is illegal to
+ * call anything that waits on an SRCU grace period for the same
+ * srcu_struct, whether directly or indirectly.  Please note that
+ * one way to indirectly wait on an SRCU grace period is to acquire
+ * a mutex that is held elsewhere while calling synchronize_srcu() or
+ * synchronize_srcu_expedited().
   */
  static inline int srcu_read_lock(struct srcu_struct *sp) __acquires(sp)
  {
diff --git a/include/linux/sunrpc/auth_gss.h b/include/linux/sunrpc/auth_gss.h

index 671538d..8eee9db 100644 (file)
--- a/include/linux/sunrpc/auth_gss.h
+++ b/include/linux/sunrpc/auth_gss.h
@@ -69,7 +69,7 @@ struct gss_cl_ctx {
         enum rpc_gss_proc       gc_proc;
         u32                     gc_seq;
         spinlock_t              gc_seq_lock;
-       struct gss_ctx          *gc_gss_ctx;
+       struct gss_ctx __rcu    *gc_gss_ctx;
         struct xdr_netobj       gc_wire_ctx;
         u32                     gc_win;
         unsigned long           gc_expiry;
@@ -80,7 +80,7 @@ struct gss_upcall_msg;
  struct gss_cred {
         struct rpc_cred         gc_base;
         enum rpc_gss_svc        gc_service;
-       struct gss_cl_ctx       *gc_ctx;
+       struct gss_cl_ctx __rcu *gc_ctx;
         struct gss_upcall_msg   *gc_upcall;
         unsigned long           gc_upcall_timestamp;
         unsigned char           gc_machine_cred : 1;
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h

index a8cc4e1..c906965 100644 (file)
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -23,12 +23,12 @@ struct restart_block {
                 };
                 /* For futex_wait and futex_wait_requeue_pi */
                 struct {
-                       u32 *uaddr;
+                       u32 __user *uaddr;
                         u32 val;
                         u32 flags;
                         u32 bitset;
                         u64 time;
-                       u32 *uaddr2;
+                       u32 __user *uaddr2;
                 } futex;
                 /* For nanosleep */
                 struct {
diff --git a/include/linux/topology.h b/include/linux/topology.h

index 64e084f..b91a40e 100644 (file)
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -201,6 +201,12 @@ int arch_update_cpu_topology(void);
         .balance_interval       = 64,                                   \
  }
  
+#ifdef CONFIG_SCHED_BOOK
+#ifndef SD_BOOK_INIT
+#error Please define an appropriate SD_BOOK_INIT in include/asm/topology.h!!!
+#endif
+#endif /* CONFIG_SCHED_BOOK */
+
  #ifdef CONFIG_NUMA
  #ifndef SD_NODE_INIT
  #error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!!
diff --git a/include/net/cls_cgroup.h b/include/net/cls_cgroup.h

index ef6c24a..a4dc5b0 100644 (file)
--- a/include/net/cls_cgroup.h
+++ b/include/net/cls_cgroup.h
@@ -51,7 +51,8 @@ static inline u32 task_cls_classid(struct task_struct *p)
                 return 0;
  
         rcu_read_lock();
-       id = rcu_dereference(net_cls_subsys_id);
+       id = rcu_dereference_index_check(net_cls_subsys_id,
+                                        rcu_read_lock_held());
         if (id >= 0)
                 classid = container_of(task_subsys_state(p, id),
                                        struct cgroup_cls_state, css)->classid;
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h

index e624dae..caf17db 100644 (file)
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -75,7 +75,7 @@ struct nf_conntrack_helper;
  /* nf_conn feature for connections that have a helper */
  struct nf_conn_help {
         /* Helper. if any */
-       struct nf_conntrack_helper *helper;
+       struct nf_conntrack_helper __rcu *helper;
  
         union nf_conntrack_help help;
  
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h

index 9208c92..f633478 100644 (file)
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -362,6 +362,35 @@ TRACE_EVENT(sched_stat_runtime,
                         (unsigned long long)__entry->vruntime)
  );
  
+/*
+ * Tracepoint for showing priority inheritance modifying a task's
+ * priority.
+ */
+TRACE_EVENT(sched_pi_setprio,
+
+       TP_PROTO(struct task_struct *tsk, int newprio),
+
+       TP_ARGS(tsk, newprio),
+
+       TP_STRUCT__entry(
+               __array( char,  comm,   TASK_COMM_LEN   )
+               __field( pid_t, pid                     )
+               __field( int,   oldprio                 )
+               __field( int,   newprio                 )
+       ),
+
+       TP_fast_assign(
+               memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+               __entry->pid            = tsk->pid;
+               __entry->oldprio        = tsk->prio;
+               __entry->newprio        = newprio;
+       ),
+
+       TP_printk("comm=%s pid=%d oldprio=%d newprio=%d",
+                       __entry->comm, __entry->pid,
+                       __entry->oldprio, __entry->newprio)
+);
+
  #endif /* _TRACE_SCHED_H */
  
  /* This part must be outside protection */
diff --git a/init/Kconfig b/init/Kconfig

index 1ef0b43..36890f0 100644 (file)
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -339,6 +339,8 @@ config AUDIT_TREE
         depends on AUDITSYSCALL
         select FSNOTIFY
  
+source "kernel/irq/Kconfig"
+
  menu "RCU Subsystem"
  
  choice
@@ -347,6 +349,7 @@ choice
  
  config TREE_RCU
         bool "Tree-based hierarchical RCU"
+       depends on !PREEMPT && SMP
         help
           This option selects the RCU implementation that is
           designed for very large SMP system with hundreds or
@@ -354,7 +357,7 @@ config TREE_RCU
           smaller systems.
  
  config TREE_PREEMPT_RCU
-       bool "Preemptable tree-based hierarchical RCU"
+       bool "Preemptible tree-based hierarchical RCU"
         depends on PREEMPT
         help
           This option selects the RCU implementation that is
@@ -372,8 +375,22 @@ config TINY_RCU
           is not required.  This option greatly reduces the
           memory footprint of RCU.
  
+config TINY_PREEMPT_RCU
+       bool "Preemptible UP-only small-memory-footprint RCU"
+       depends on !SMP && PREEMPT
+       help
+         This option selects the RCU implementation that is designed
+         for real-time UP systems.  This option greatly reduces the
+         memory footprint of RCU.
+
  endchoice
  
+config PREEMPT_RCU
+       def_bool ( TREE_PREEMPT_RCU || TINY_PREEMPT_RCU )
+       help
+         This option enables preemptible-RCU code that is common between
+         the TREE_PREEMPT_RCU and TINY_PREEMPT_RCU implementations.
+
  config RCU_TRACE
         bool "Enable tracing for RCU"
         depends on TREE_RCU || TREE_PREEMPT_RCU
@@ -394,9 +411,12 @@ config RCU_FANOUT
         help
           This option controls the fanout of hierarchical implementations
           of RCU, allowing RCU to work efficiently on machines with
-         large numbers of CPUs.  This value must be at least the cube
-         root of NR_CPUS, which allows NR_CPUS up to 32,768 for 32-bit
-         systems and up to 262,144 for 64-bit systems.
+         large numbers of CPUs.  This value must be at least the fourth
+         root of NR_CPUS, which allows NR_CPUS to be insanely large.
+         The default value of RCU_FANOUT should be used for production
+         systems, but if you are stress-testing the RCU implementation
+         itself, small RCU_FANOUT values allow you to test large-system
+         code paths on small(er) systems.
  
           Select a specific number if testing RCU itself.
           Take the default if unsure.
diff --git a/init/main.c b/init/main.c

index 94ab488..9684c96 100644 (file)
--- a/init/main.c
+++ b/init/main.c
@@ -556,7 +556,6 @@ asmlinkage void __init start_kernel(void)
  
         local_irq_disable();
         early_boot_irqs_off();
-       early_init_irq_lock_class();
  
  /*
   * Interrupts are still disabled. Do necessary setups, then
diff --git a/kernel/Makefile b/kernel/Makefile

index 4d9bf5f..e2c9d52 100644 (file)
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -87,6 +87,7 @@ obj-$(CONFIG_TREE_RCU) += rcutree.o
  obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
  obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
  obj-$(CONFIG_TINY_RCU) += rcutiny.o
+obj-$(CONFIG_TINY_PREEMPT_RCU) += rcutiny.o
  obj-$(CONFIG_RELAY) += relay.o
  obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
  obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
diff --git a/kernel/cgroup.c b/kernel/cgroup.c

index c9483d8..291ba3d 100644 (file)
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -138,7 +138,7 @@ struct css_id {
          * is called after synchronize_rcu(). But for safe use, css_is_removed()
          * css_tryget() should be used for avoiding race.
          */
-       struct cgroup_subsys_state *css;
+       struct cgroup_subsys_state __rcu *css;
         /*
          * ID of this css.
          */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c

index b23c097..51b143e 100644 (file)
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1397,7 +1397,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
         if (tsk->flags & PF_THREAD_BOUND)
                 return -EINVAL;
  
-       ret = security_task_setscheduler(tsk, 0, NULL);
+       ret = security_task_setscheduler(tsk);
         if (ret)
                 return ret;
         if (threadgroup) {
@@ -1405,7 +1405,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
  
                 rcu_read_lock();
                 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
-                       ret = security_task_setscheduler(c, 0, NULL);
+                       ret = security_task_setscheduler(c);
                         if (ret) {
                                 rcu_read_unlock();
                                 return ret;
diff --git a/kernel/futex.c b/kernel/futex.c

index 6a3a5fa..a118bf1 100644 (file)
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -91,6 +91,7 @@ struct futex_pi_state {
  
  /**
   * struct futex_q - The hashed futex queue entry, one per waiting task
+ * @list:              priority-sorted list of tasks waiting on this futex
   * @task:              the task waiting on the futex
   * @lock_ptr:          the hash bucket lock
   * @key:               the key the futex is hashed on
@@ -104,7 +105,7 @@ struct futex_pi_state {
   *
   * A futex_q has a woken state, just like tasks have TASK_RUNNING.
   * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
- * The order of wakup is always to make the first condition true, then
+ * The order of wakeup is always to make the first condition true, then
   * the second.
   *
   * PI futexes are typically woken before they are removed from the hash list via
@@ -295,7 +296,7 @@ void put_futex_key(int fshared, union futex_key *key)
   * Slow path to fixup the fault we just took in the atomic write
   * access to @uaddr.
   *
- * We have no generic implementation of a non destructive write to the
+ * We have no generic implementation of a non-destructive write to the
   * user address. We know that we faulted in the atomic pagefault
   * disabled section so we can as well avoid the #PF overhead by
   * calling get_user_pages() right away.
@@ -515,7 +516,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
                          */
                         pi_state = this->pi_state;
                         /*
-                        * Userspace might have messed up non PI and PI futexes
+                        * Userspace might have messed up non-PI and PI futexes
                          */
                         if (unlikely(!pi_state))
                                 return -EINVAL;
@@ -736,8 +737,8 @@ static void wake_futex(struct futex_q *q)
  
         /*
          * We set q->lock_ptr = NULL _before_ we wake up the task. If
-        * a non futex wake up happens on another CPU then the task
-        * might exit and p would dereference a non existing task
+        * a non-futex wake up happens on another CPU then the task
+        * might exit and p would dereference a non-existing task
          * struct. Prevent this by holding a reference on p across the
          * wake up.
          */
@@ -1131,11 +1132,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
  
  /**
   * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
- * uaddr1:     source futex user address
- * uaddr2:     target futex user address
- * nr_wake:    number of waiters to wake (must be 1 for requeue_pi)
- * nr_requeue: number of waiters to requeue (0-INT_MAX)
- * requeue_pi: if we are attempting to requeue from a non-pi futex to a
+ * @uaddr1:    source futex user address
+ * @fshared:   0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
+ * @uaddr2:    target futex user address
+ * @nr_wake:   number of waiters to wake (must be 1 for requeue_pi)
+ * @nr_requeue:        number of waiters to requeue (0-INT_MAX)
+ * @cmpval:    @uaddr1 expected value (or %NULL)
+ * @requeue_pi:        if we are attempting to requeue from a non-pi futex to a
   *             pi futex (pi to pi requeue is not supported)
   *
   * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
@@ -1360,10 +1363,10 @@ out:
  
  /* The key must be already stored in q->key. */
  static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
+       __acquires(&hb->lock)
  {
         struct futex_hash_bucket *hb;
  
-       get_futex_key_refs(&q->key);
         hb = hash_futex(&q->key);
         q->lock_ptr = &hb->lock;
  
@@ -1373,9 +1376,9 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
  
  static inline void
  queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
+       __releases(&hb->lock)
  {
         spin_unlock(&hb->lock);
-       drop_futex_key_refs(&q->key);
  }
  
  /**
@@ -1391,6 +1394,7 @@ queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
   * an example).
   */
  static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
+       __releases(&hb->lock)
  {
         int prio;
  
@@ -1471,6 +1475,7 @@ retry:
   * and dropped here.
   */
  static void unqueue_me_pi(struct futex_q *q)
+       __releases(q->lock_ptr)
  {
         WARN_ON(plist_node_empty(&q->list));
         plist_del(&q->list, &q->list.plist);
@@ -1480,8 +1485,6 @@ static void unqueue_me_pi(struct futex_q *q)
         q->pi_state = NULL;
  
         spin_unlock(q->lock_ptr);
-
-       drop_futex_key_refs(&q->key);
  }
  
  /*
@@ -1812,7 +1815,10 @@ static int futex_wait(u32 __user *uaddr, int fshared,
         }
  
  retry:
-       /* Prepare to wait on uaddr. */
+       /*
+        * Prepare to wait on uaddr. On success, holds hb lock and increments
+        * q.key refs.
+        */
         ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
         if (ret)
                 goto out;
@@ -1822,28 +1828,27 @@ retry:
  
         /* If we were woken (and unqueued), we succeeded, whatever. */
         ret = 0;
+       /* unqueue_me() drops q.key ref */
         if (!unqueue_me(&q))
-               goto out_put_key;
+               goto out;
         ret = -ETIMEDOUT;
         if (to && !to->task)
-               goto out_put_key;
+               goto out;
  
         /*
          * We expect signal_pending(current), but we might be the
          * victim of a spurious wakeup as well.
          */
-       if (!signal_pending(current)) {
-               put_futex_key(fshared, &q.key);
+       if (!signal_pending(current))
                 goto retry;
-       }
  
         ret = -ERESTARTSYS;
         if (!abs_time)
-               goto out_put_key;
+               goto out;
  
         restart = &current_thread_info()->restart_block;
         restart->fn = futex_wait_restart;
-       restart->futex.uaddr = (u32 *)uaddr;
+       restart->futex.uaddr = uaddr;
         restart->futex.val = val;
         restart->futex.time = abs_time->tv64;
         restart->futex.bitset = bitset;
@@ -1856,8 +1861,6 @@ retry:
  
         ret = -ERESTART_RESTARTBLOCK;
  
-out_put_key:
-       put_futex_key(fshared, &q.key);
  out:
         if (to) {
                 hrtimer_cancel(&to->timer);
@@ -1869,7 +1872,7 @@ out:
  
  static long futex_wait_restart(struct restart_block *restart)
  {
-       u32 __user *uaddr = (u32 __user *)restart->futex.uaddr;
+       u32 __user *uaddr = restart->futex.uaddr;
         int fshared = 0;
         ktime_t t, *tp = NULL;
  
@@ -2236,7 +2239,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
         q.rt_waiter = &rt_waiter;
         q.requeue_pi_key = &key2;
  
-       /* Prepare to wait on uaddr. */
+       /*
+        * Prepare to wait on uaddr. On success, increments q.key (key1) ref
+        * count.
+        */
         ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
         if (ret)
                 goto out_key2;
@@ -2254,7 +2260,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
          * In order for us to be here, we know our q.key == key2, and since
          * we took the hb->lock above, we also know that futex_requeue() has
          * completed and we no longer have to concern ourselves with a wakeup
-        * race with the atomic proxy lock acquition by the requeue code.
+        * race with the atomic proxy lock acquisition by the requeue code. The
+        * futex_requeue dropped our key1 reference and incremented our key2
+        * reference count.
          */
  
         /* Check if the requeue code acquired the second futex for us. */
@@ -2458,7 +2466,7 @@ retry:
   */
  static inline int fetch_robust_entry(struct robust_list __user **entry,
                                      struct robust_list __user * __user *head,
-                                    int *pi)
+                                    unsigned int *pi)
  {
         unsigned long uentry;
  
@@ -2647,7 +2655,7 @@ static int __init futex_init(void)
          * of the complex code paths. Also we want to prevent
          * registration of robust lists in that case. NULL is
          * guaranteed to fault and we get -EFAULT on functional
-        * implementation, the non functional ones will return
+        * implementation, the non-functional ones will return
          * -ENOSYS.
          */
         curval = cmpxchg_futex_value_locked(NULL, 0, 0);
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c

index d49afb2..06da4df 100644 (file)
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -19,7 +19,7 @@
   */
  static inline int
  fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
-                  compat_uptr_t __user *head, int *pi)
+                  compat_uptr_t __user *head, unsigned int *pi)
  {
         if (get_user(*uentry, head))
                 return -EFAULT;
diff --git a/kernel/hung_task.c b/kernel/hung_task.c

index 0c642d5..53ead17 100644 (file)
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -98,7 +98,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
         printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
                         " disables this message.\n");
         sched_show_task(t);
-       __debug_show_held_locks(t);
+       debug_show_held_locks(t);
  
         touch_nmi_watchdog();
  
@@ -111,7 +111,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
   * periodically exit the critical section and enter a new one.
   *
   * For preemptible RCU it is sufficient to call rcu_read_unlock in order
- * exit the grace period. For classic RCU, a reschedule is required.
+ * to exit the grace period. For classic RCU, a reschedule is required.
   */
  static void rcu_lock_break(struct task_struct *g, struct task_struct *t)
  {
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig

new file mode 100644 (file)

index 0000000..31d766b
--- /dev/null
+++ b/kernel/irq/Kconfig
@@ -0,0 +1,53 @@
+config HAVE_GENERIC_HARDIRQS
+       def_bool n
+
+if HAVE_GENERIC_HARDIRQS
+menu "IRQ subsystem"
+#
+# Interrupt subsystem related configuration options
+#
+config GENERIC_HARDIRQS
+       def_bool y
+
+config GENERIC_HARDIRQS_NO__DO_IRQ
+       def_bool y
+
+# Select this to disable the deprecated stuff
+config GENERIC_HARDIRQS_NO_DEPRECATED
+       def_bool n
+
+# Options selectable by the architecture code
+config HAVE_SPARSE_IRQ
+       def_bool n
+
+config GENERIC_IRQ_PROBE
+       def_bool n
+
+config GENERIC_PENDING_IRQ
+       def_bool n
+
+config AUTO_IRQ_AFFINITY
+       def_bool n
+
+config IRQ_PER_CPU
+       def_bool n
+
+config HARDIRQS_SW_RESEND
+       def_bool n
+
+config SPARSE_IRQ
+       bool "Support sparse irq numbering"
+       depends on HAVE_SPARSE_IRQ
+       ---help---
+
+         Sparse irq numbering is useful for distro kernels that want
+         to define a high CONFIG_NR_CPUS value but still want to have
+         low kernel memory footprint on smaller machines.
+
+         ( Sparse irqs can also be beneficial on NUMA boxes, as they spread
+           out the interrupt descriptors in a more NUMA-friendly way. )
+
+         If you don't know what to do here, say N.
+
+endmenu
+endif
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile

index 7d04780..54329cd 100644 (file)
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -1,7 +1,6 @@
  
-obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o
+obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o
  obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
  obj-$(CONFIG_PROC_FS) += proc.o
  obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
-obj-$(CONFIG_NUMA_IRQ_DESC) += numa_migrate.o
  obj-$(CONFIG_PM_SLEEP) += pm.o
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c

index 2295a31..505798f 100644 (file)
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -57,9 +57,10 @@ unsigned long probe_irq_on(void)
                          * Some chips need to know about probing in
                          * progress:
                          */
-                       if (desc->chip->set_type)
-                               desc->chip->set_type(i, IRQ_TYPE_PROBE);
-                       desc->chip->startup(i);
+                       if (desc->irq_data.chip->irq_set_type)
+                               desc->irq_data.chip->irq_set_type(&desc->irq_data,
+                                                        IRQ_TYPE_PROBE);
+                       desc->irq_data.chip->irq_startup(&desc->irq_data);
                 }
                 raw_spin_unlock_irq(&desc->lock);
         }
@@ -76,7 +77,7 @@ unsigned long probe_irq_on(void)
                 raw_spin_lock_irq(&desc->lock);
                 if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
                         desc->status |= IRQ_AUTODETECT | IRQ_WAITING;
-                       if (desc->chip->startup(i))
+                       if (desc->irq_data.chip->irq_startup(&desc->irq_data))
                                 desc->status |= IRQ_PENDING;
                 }
                 raw_spin_unlock_irq(&desc->lock);
@@ -98,7 +99,7 @@ unsigned long probe_irq_on(void)
                         /* It triggered already - consider it spurious. */
                         if (!(status & IRQ_WAITING)) {
                                 desc->status = status & ~IRQ_AUTODETECT;
-                               desc->chip->shutdown(i);
+                               desc->irq_data.chip->irq_shutdown(&desc->irq_data);
                         } else
                                 if (i < 32)
                                         mask |= 1 << i;
@@ -137,7 +138,7 @@ unsigned int probe_irq_mask(unsigned long val)
                                 mask |= 1 << i;
  
                         desc->status = status & ~IRQ_AUTODETECT;
-                       desc->chip->shutdown(i);
+                       desc->irq_data.chip->irq_shutdown(&desc->irq_data);
                 }
                 raw_spin_unlock_irq(&desc->lock);
         }
@@ -181,7 +182,7 @@ int probe_irq_off(unsigned long val)
                                 nr_of_irqs++;
                         }
                         desc->status = status & ~IRQ_AUTODETECT;
-                       desc->chip->shutdown(i);
+                       desc->irq_data.chip->irq_shutdown(&desc->irq_data);
                 }
                 raw_spin_unlock_irq(&desc->lock);
         }
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c

index b7091d5..baa5c4a 100644 (file)
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -18,108 +18,6 @@
  
  #include "internals.h"
  
-static void dynamic_irq_init_x(unsigned int irq, bool keep_chip_data)
-{
-       struct irq_desc *desc;
-       unsigned long flags;
-
-       desc = irq_to_desc(irq);
-       if (!desc) {
-               WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq);
-               return;
-       }
-
-       /* Ensure we don't have left over values from a previous use of this irq */
-       raw_spin_lock_irqsave(&desc->lock, flags);
-       desc->status = IRQ_DISABLED;
-       desc->chip = &no_irq_chip;
-       desc->handle_irq = handle_bad_irq;
-       desc->depth = 1;
-       desc->msi_desc = NULL;
-       desc->handler_data = NULL;
-       if (!keep_chip_data)
-               desc->chip_data = NULL;
-       desc->action = NULL;
-       desc->irq_count = 0;
-       desc->irqs_unhandled = 0;
-#ifdef CONFIG_SMP
-       cpumask_setall(desc->affinity);
-#ifdef CONFIG_GENERIC_PENDING_IRQ
-       cpumask_clear(desc->pending_mask);
-#endif
-#endif
-       raw_spin_unlock_irqrestore(&desc->lock, flags);
-}
-
-/**
- *     dynamic_irq_init - initialize a dynamically allocated irq
- *     @irq:   irq number to initialize
- */
-void dynamic_irq_init(unsigned int irq)
-{
-       dynamic_irq_init_x(irq, false);
-}
-
-/**
- *     dynamic_irq_init_keep_chip_data - initialize a dynamically allocated irq
- *     @irq:   irq number to initialize
- *
- *     does not set irq_to_desc(irq)->chip_data to NULL
- */
-void dynamic_irq_init_keep_chip_data(unsigned int irq)
-{
-       dynamic_irq_init_x(irq, true);
-}
-
-static void dynamic_irq_cleanup_x(unsigned int irq, bool keep_chip_data)
-{
-       struct irq_desc *desc = irq_to_desc(irq);
-       unsigned long flags;
-
-       if (!desc) {
-               WARN(1, KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq);
-               return;
-       }
-
-       raw_spin_lock_irqsave(&desc->lock, flags);
-       if (desc->action) {
-               raw_spin_unlock_irqrestore(&desc->lock, flags);
-               WARN(1, KERN_ERR "Destroying IRQ%d without calling free_irq\n",
-                       irq);
-               return;
-       }
-       desc->msi_desc = NULL;
-       desc->handler_data = NULL;
-       if (!keep_chip_data)
-               desc->chip_data = NULL;
-       desc->handle_irq = handle_bad_irq;
-       desc->chip = &no_irq_chip;
-       desc->name = NULL;
-       clear_kstat_irqs(desc);
-       raw_spin_unlock_irqrestore(&desc->lock, flags);
-}
-
-/**
- *     dynamic_irq_cleanup - cleanup a dynamically allocated irq
- *     @irq:   irq number to initialize
- */
-void dynamic_irq_cleanup(unsigned int irq)
-{
-       dynamic_irq_cleanup_x(irq, false);
-}
-
-/**
- *     dynamic_irq_cleanup_keep_chip_data - cleanup a dynamically allocated irq
- *     @irq:   irq number to initialize
- *
- *     does not set irq_to_desc(irq)->chip_data to NULL
- */
-void dynamic_irq_cleanup_keep_chip_data(unsigned int irq)
-{
-       dynamic_irq_cleanup_x(irq, true);
-}
-
-
  /**
   *     set_irq_chip - set the irq chip for an irq
   *     @irq:   irq number
@@ -140,7 +38,7 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip)
  
         raw_spin_lock_irqsave(&desc->lock, flags);
         irq_chip_set_defaults(chip);
-       desc->chip = chip;
+       desc->irq_data.chip = chip;
         raw_spin_unlock_irqrestore(&desc->lock, flags);
  
         return 0;
@@ -193,7 +91,7 @@ int set_irq_data(unsigned int irq, void *data)
         }
  
         raw_spin_lock_irqsave(&desc->lock, flags);
-       desc->handler_data = data;
+       desc->irq_data.handler_data = data;
         raw_spin_unlock_irqrestore(&desc->lock, flags);
         return 0;
  }
@@ -218,7 +116,7 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry)
         }
  
         raw_spin_lock_irqsave(&desc->lock, flags);
-       desc->msi_desc = entry;
+       desc->irq_data.msi_desc = entry;
         if (entry)
                 entry->irq = irq;
         raw_spin_unlock_irqrestore(&desc->lock, flags);
@@ -243,19 +141,27 @@ int set_irq_chip_data(unsigned int irq, void *data)
                 return -EINVAL;
         }
  
-       if (!desc->chip) {
+       if (!desc->irq_data.chip) {
                 printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq);
                 return -EINVAL;
         }
  
         raw_spin_lock_irqsave(&desc->lock, flags);
-       desc->chip_data = data;
+       desc->irq_data.chip_data = data;
         raw_spin_unlock_irqrestore(&desc->lock, flags);
  
         return 0;
  }
  EXPORT_SYMBOL(set_irq_chip_data);
  
+struct irq_data *irq_get_irq_data(unsigned int irq)
+{
+       struct irq_desc *desc = irq_to_desc(irq);
+
+       return desc ? &desc->irq_data : NULL;
+}
+EXPORT_SYMBOL_GPL(irq_get_irq_data);
+
  /**
   *     set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq
   *
@@ -287,93 +193,216 @@ EXPORT_SYMBOL_GPL(set_irq_nested_thread);
  /*
   * default enable function
   */
-static void default_enable(unsigned int irq)
+static void default_enable(struct irq_data *data)
  {
-       struct irq_desc *desc = irq_to_desc(irq);
+       struct irq_desc *desc = irq_data_to_desc(data);
  
-       desc->chip->unmask(irq);
+       desc->irq_data.chip->irq_unmask(&desc->irq_data);
         desc->status &= ~IRQ_MASKED;
  }
  
  /*
   * default disable function
   */
-static void default_disable(unsigned int irq)
+static void default_disable(struct irq_data *data)
  {
  }
  
  /*
   * default startup function
   */
-static unsigned int default_startup(unsigned int irq)
+static unsigned int default_startup(struct irq_data *data)
  {
-       struct irq_desc *desc = irq_to_desc(irq);
+       struct irq_desc *desc = irq_data_to_desc(data);
  
-       desc->chip->enable(irq);
+       desc->irq_data.chip->irq_enable(data);
         return 0;
  }
  
  /*
   * default shutdown function
   */
-static void default_shutdown(unsigned int irq)
+static void default_shutdown(struct irq_data *data)
  {
-       struct irq_desc *desc = irq_to_desc(irq);
+       struct irq_desc *desc = irq_data_to_desc(data);
  
-       desc->chip->mask(irq);
+       desc->irq_data.chip->irq_mask(&desc->irq_data);
         desc->status |= IRQ_MASKED;
  }
  
+#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
+/* Temporary migration helpers: forward the irq_data based callbacks to the old irq-number based ones */
+static void compat_irq_mask(struct irq_data *data)
+{
+       data->chip->mask(data->irq);
+}
+
+static void compat_irq_unmask(struct irq_data *data)
+{
+       data->chip->unmask(data->irq);
+}
+
+static void compat_irq_ack(struct irq_data *data)
+{
+       data->chip->ack(data->irq);
+}
+
+static void compat_irq_mask_ack(struct irq_data *data)
+{
+       data->chip->mask_ack(data->irq);
+}
+
+static void compat_irq_eoi(struct irq_data *data)
+{
+       data->chip->eoi(data->irq);
+}
+
+static void compat_irq_enable(struct irq_data *data)
+{
+       data->chip->enable(data->irq);
+}
+
+static void compat_irq_disable(struct irq_data *data)
+{
+       data->chip->disable(data->irq);
+}
+
+static void compat_irq_shutdown(struct irq_data *data)
+{
+       data->chip->shutdown(data->irq);
+}
+
+static unsigned int compat_irq_startup(struct irq_data *data)
+{
+       return data->chip->startup(data->irq);
+}
+
+static int compat_irq_set_affinity(struct irq_data *data,
+                                  const struct cpumask *dest, bool force)
+{
+       return data->chip->set_affinity(data->irq, dest);
+}
+
+static int compat_irq_set_type(struct irq_data *data, unsigned int type)
+{
+       return data->chip->set_type(data->irq, type);
+}
+
+static int compat_irq_set_wake(struct irq_data *data, unsigned int on)
+{
+       return data->chip->set_wake(data->irq, on);
+}
+
+static int compat_irq_retrigger(struct irq_data *data)
+{
+       return data->chip->retrigger(data->irq);
+}
+
+static void compat_bus_lock(struct irq_data *data)
+{
+       data->chip->bus_lock(data->irq);
+}
+
+static void compat_bus_sync_unlock(struct irq_data *data)
+{
+       data->chip->bus_sync_unlock(data->irq);
+}
+#endif
+
  /*
   * Fixup enable/disable function pointers
   */
  void irq_chip_set_defaults(struct irq_chip *chip)
  {
-       if (!chip->enable)
-               chip->enable = default_enable;
-       if (!chip->disable)
-               chip->disable = default_disable;
-       if (!chip->startup)
-               chip->startup = default_startup;
+#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
         /*
-        * We use chip->disable, when the user provided its own. When
-        * we have default_disable set for chip->disable, then we need
+        * Compat fixup functions need to be installed before we set
+        * the defaults for enable/disable/startup/shutdown
+        */
+       if (chip->enable)
+               chip->irq_enable = compat_irq_enable;
+       if (chip->disable)
+               chip->irq_disable = compat_irq_disable;
+       if (chip->shutdown)
+               chip->irq_shutdown = compat_irq_shutdown;
+       if (chip->startup)
+               chip->irq_startup = compat_irq_startup;
+#endif
+       /*
+        * The real defaults
+        */
+       if (!chip->irq_enable)
+               chip->irq_enable = default_enable;
+       if (!chip->irq_disable)
+               chip->irq_disable = default_disable;
+       if (!chip->irq_startup)
+               chip->irq_startup = default_startup;
+       /*
+        * We use chip->irq_disable, when the user provided its own. When
+        * we have default_disable set for chip->irq_disable, then we need
          * to use default_shutdown, otherwise the irq line is not
          * disabled on free_irq():
          */
-       if (!chip->shutdown)
-               chip->shutdown = chip->disable != default_disable ?
-                       chip->disable : default_shutdown;
-       if (!chip->name)
-               chip->name = chip->typename;
+       if (!chip->irq_shutdown)
+               chip->irq_shutdown = chip->irq_disable != default_disable ?
+                       chip->irq_disable : default_shutdown;
+
+#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
         if (!chip->end)
                 chip->end = dummy_irq_chip.end;
+
+       /*
+        * Now fix up the remaining compat handlers
+        */
+       if (chip->bus_lock)
+               chip->irq_bus_lock = compat_bus_lock;
+       if (chip->bus_sync_unlock)
+               chip->irq_bus_sync_unlock = compat_bus_sync_unlock;
+       if (chip->mask)
+               chip->irq_mask = compat_irq_mask;
+       if (chip->unmask)
+               chip->irq_unmask = compat_irq_unmask;
+       if (chip->ack)
+               chip->irq_ack = compat_irq_ack;
+       if (chip->mask_ack)
+               chip->irq_mask_ack = compat_irq_mask_ack;
+       if (chip->eoi)
+               chip->irq_eoi = compat_irq_eoi;
+       if (chip->set_affinity)
+               chip->irq_set_affinity = compat_irq_set_affinity;
+       if (chip->set_type)
+               chip->irq_set_type = compat_irq_set_type;
+       if (chip->set_wake)
+               chip->irq_set_wake = compat_irq_set_wake;
+       if (chip->retrigger)
+               chip->irq_retrigger = compat_irq_retrigger;
+#endif
  }
  
-static inline void mask_ack_irq(struct irq_desc *desc, int irq)
+static inline void mask_ack_irq(struct irq_desc *desc)
  {
-       if (desc->chip->mask_ack)
-               desc->chip->mask_ack(irq);
+       if (desc->irq_data.chip->irq_mask_ack)
+               desc->irq_data.chip->irq_mask_ack(&desc->irq_data);
         else {
-               desc->chip->mask(irq);
-               if (desc->chip->ack)
-                       desc->chip->ack(irq);
+               desc->irq_data.chip->irq_mask(&desc->irq_data);
+               if (desc->irq_data.chip->irq_ack)
+                       desc->irq_data.chip->irq_ack(&desc->irq_data);
         }
         desc->status |= IRQ_MASKED;
  }
  
-static inline void mask_irq(struct irq_desc *desc, int irq)
+static inline void mask_irq(struct irq_desc *desc)
  {
-       if (desc->chip->mask) {
-               desc->chip->mask(irq);
+       if (desc->irq_data.chip->irq_mask) {
+               desc->irq_data.chip->irq_mask(&desc->irq_data);
                 desc->status |= IRQ_MASKED;
         }
  }
  
-static inline void unmask_irq(struct irq_desc *desc, int irq)
+static inline void unmask_irq(struct irq_desc *desc)
  {
-       if (desc->chip->unmask) {
-               desc->chip->unmask(irq);
+       if (desc->irq_data.chip->irq_unmask) {
+               desc->irq_data.chip->irq_unmask(&desc->irq_data);
                 desc->status &= ~IRQ_MASKED;
         }
  }
@@ -476,7 +505,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
         irqreturn_t action_ret;
  
         raw_spin_lock(&desc->lock);
-       mask_ack_irq(desc, irq);
+       mask_ack_irq(desc);
  
         if (unlikely(desc->status & IRQ_INPROGRESS))
                 goto out_unlock;
@@ -502,7 +531,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
         desc->status &= ~IRQ_INPROGRESS;
  
         if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT)))
-               unmask_irq(desc, irq);
+               unmask_irq(desc);
  out_unlock:
         raw_spin_unlock(&desc->lock);
  }
@@ -539,7 +568,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
         action = desc->action;
         if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
                 desc->status |= IRQ_PENDING;
-               mask_irq(desc, irq);
+               mask_irq(desc);
                 goto out;
         }
  
@@ -554,7 +583,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
         raw_spin_lock(&desc->lock);
         desc->status &= ~IRQ_INPROGRESS;
  out:
-       desc->chip->eoi(irq);
+       desc->irq_data.chip->irq_eoi(&desc->irq_data);
  
         raw_spin_unlock(&desc->lock);
  }
@@ -590,14 +619,13 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
         if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) ||
                     !desc->action)) {
                 desc->status |= (IRQ_PENDING | IRQ_MASKED);
-               mask_ack_irq(desc, irq);
+               mask_ack_irq(desc);
                 goto out_unlock;
         }
         kstat_incr_irqs_this_cpu(irq, desc);
  
         /* Start handling the irq */
-       if (desc->chip->ack)
-               desc->chip->ack(irq);
+       desc->irq_data.chip->irq_ack(&desc->irq_data);
  
         /* Mark the IRQ currently in progress.*/
         desc->status |= IRQ_INPROGRESS;
@@ -607,7 +635,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
                 irqreturn_t action_ret;
  
                 if (unlikely(!action)) {
-                       mask_irq(desc, irq);
+                       mask_irq(desc);
                         goto out_unlock;
                 }
  
@@ -619,7 +647,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
                 if (unlikely((desc->status &
                                (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) ==
                               (IRQ_PENDING | IRQ_MASKED))) {
-                       unmask_irq(desc, irq);
+                       unmask_irq(desc);
                 }
  
                 desc->status &= ~IRQ_PENDING;
@@ -650,15 +678,15 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
  
         kstat_incr_irqs_this_cpu(irq, desc);
  
-       if (desc->chip->ack)
-               desc->chip->ack(irq);
+       if (desc->irq_data.chip->irq_ack)
+               desc->irq_data.chip->irq_ack(&desc->irq_data);
  
         action_ret = handle_IRQ_event(irq, desc->action);
         if (!noirqdebug)
                 note_interrupt(irq, desc, action_ret);
  
-       if (desc->chip->eoi)
-               desc->chip->eoi(irq);
+       if (desc->irq_data.chip->irq_eoi)
+               desc->irq_data.chip->irq_eoi(&desc->irq_data);
  }
  
  void
@@ -676,7 +704,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
  
         if (!handle)
                 handle = handle_bad_irq;
-       else if (desc->chip == &no_irq_chip) {
+       else if (desc->irq_data.chip == &no_irq_chip) {
                 printk(KERN_WARNING "Trying to install %sinterrupt handler "
                        "for IRQ%d\n", is_chained ? "chained " : "", irq);
                 /*
@@ -686,16 +714,16 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
                  * prevent us to setup the interrupt at all. Switch it to
                  * dummy_irq_chip for easy transition.
                  */
-               desc->chip = &dummy_irq_chip;
+               desc->irq_data.chip = &dummy_irq_chip;
         }
  
-       chip_bus_lock(irq, desc);
+       chip_bus_lock(desc);
         raw_spin_lock_irqsave(&desc->lock, flags);
  
         /* Uninstall? */
         if (handle == handle_bad_irq) {
-               if (desc->chip != &no_irq_chip)
-                       mask_ack_irq(desc, irq);
+               if (desc->irq_data.chip != &no_irq_chip)
+                       mask_ack_irq(desc);
                 desc->status |= IRQ_DISABLED;
                 desc->depth = 1;
         }
@@ -706,10 +734,10 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
                 desc->status &= ~IRQ_DISABLED;
                 desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE;
                 desc->depth = 0;
-               desc->chip->startup(irq);
+               desc->irq_data.chip->irq_startup(&desc->irq_data);
         }
         raw_spin_unlock_irqrestore(&desc->lock, flags);
-       chip_bus_sync_unlock(irq, desc);
+       chip_bus_sync_unlock(desc);
  }
  EXPORT_SYMBOL_GPL(__set_irq_handler);
  
@@ -729,32 +757,20 @@ set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
         __set_irq_handler(irq, handle, 0, name);
  }
  
-void set_irq_noprobe(unsigned int irq)
+void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
  {
         struct irq_desc *desc = irq_to_desc(irq);
         unsigned long flags;
  
-       if (!desc) {
-               printk(KERN_ERR "Trying to mark IRQ%d non-probeable\n", irq);
+       if (!desc)
                 return;
-       }
-
-       raw_spin_lock_irqsave(&desc->lock, flags);
-       desc->status |= IRQ_NOPROBE;
-       raw_spin_unlock_irqrestore(&desc->lock, flags);
-}
-
-void set_irq_probe(unsigned int irq)
-{
-       struct irq_desc *desc = irq_to_desc(irq);
-       unsigned long flags;
  
-       if (!desc) {
-               printk(KERN_ERR "Trying to mark IRQ%d probeable\n", irq);
-               return;
-       }
+       /* Sanitize flags */
+       set &= IRQF_MODIFY_MASK;
+       clr &= IRQF_MODIFY_MASK;
  
         raw_spin_lock_irqsave(&desc->lock, flags);
-       desc->status &= ~IRQ_NOPROBE;
+       desc->status &= ~clr;
+       desc->status |= set;
         raw_spin_unlock_irqrestore(&desc->lock, flags);
  }
diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c

new file mode 100644 (file)

index 0000000..20dc547
--- /dev/null
+++ b/kernel/irq/dummychip.c
@@ -0,0 +1,68 @@
+/*
+ * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
+ * Copyright (C) 2005-2006, Thomas Gleixner, Russell King
+ *
+ * This file contains the dummy interrupt chip implementation
+ */
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+
+#include "internals.h"
+
+/*
+ * What should we do if we get a hw irq event on an illegal vector?
+ * Each architecture has to answer this itself.
+ */
+static void ack_bad(struct irq_data *data)
+{
+       struct irq_desc *desc = irq_data_to_desc(data);
+
+       print_irq_desc(data->irq, desc);
+       ack_bad_irq(data->irq);
+}
+
+/*
+ * NOP functions
+ */
+static void noop(struct irq_data *data) { }
+
+static unsigned int noop_ret(struct irq_data *data)
+{
+       return 0;
+}
+
+#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
+static void compat_noop(unsigned int irq) { }
+#define END_INIT .end = compat_noop
+#else
+#define END_INIT
+#endif
+
+/*
+ * Generic no controller implementation
+ */
+struct irq_chip no_irq_chip = {
+       .name           = "none",
+       .irq_startup    = noop_ret,
+       .irq_shutdown   = noop,
+       .irq_enable     = noop,
+       .irq_disable    = noop,
+       .irq_ack        = ack_bad,
+       END_INIT
+};
+
+/*
+ * Generic dummy implementation which can be used for
+ * real dumb interrupt sources
+ */
+struct irq_chip dummy_irq_chip = {
+       .name           = "dummy",
+       .irq_startup    = noop_ret,
+       .irq_shutdown   = noop,
+       .irq_enable     = noop,
+       .irq_disable    = noop,
+       .irq_ack        = noop,
+       .irq_mask       = noop,
+       .irq_unmask     = noop,
+       END_INIT
+};
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c

index 27e5c69..e2347eb 100644 (file)
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -11,24 +11,15 @@
   */
  
  #include <linux/irq.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/module.h>
  #include <linux/random.h>
+#include <linux/sched.h>
  #include <linux/interrupt.h>
  #include <linux/kernel_stat.h>
-#include <linux/rculist.h>
-#include <linux/hash.h>
-#include <linux/radix-tree.h>
+
  #include <trace/events/irq.h>
  
  #include "internals.h"
  
-/*
- * lockdep: we want to handle all irq_desc locks as a single lock-class:
- */
-struct lock_class_key irq_desc_lock_class;
-
  /**
   * handle_bad_irq - handle spurious and unhandled irqs
   * @irq:       the interrupt number
@@ -43,304 +34,6 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
         ack_bad_irq(irq);
  }
  
-#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
-static void __init init_irq_default_affinity(void)
-{
-       alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
-       cpumask_setall(irq_default_affinity);
-}
-#else
-static void __init init_irq_default_affinity(void)
-{
-}
-#endif
-
-/*
- * Linux has a controller-independent interrupt architecture.
- * Every controller has a 'controller-template', that is used
- * by the main code to do the right thing. Each driver-visible
- * interrupt source is transparently wired to the appropriate
- * controller. Thus drivers need not be aware of the
- * interrupt-controller.
- *
- * The code is designed to be easily extended with new/different
- * interrupt controllers, without having to do assembly magic or
- * having to touch the generic code.
- *
- * Controller mappings for all interrupt sources:
- */
-int nr_irqs = NR_IRQS;
-EXPORT_SYMBOL_GPL(nr_irqs);
-
-#ifdef CONFIG_SPARSE_IRQ
-
-static struct irq_desc irq_desc_init = {
-       .irq        = -1,
-       .status     = IRQ_DISABLED,
-       .chip       = &no_irq_chip,
-       .handle_irq = handle_bad_irq,
-       .depth      = 1,
-       .lock       = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
-};
-
-void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr)
-{
-       void *ptr;
-
-       ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs),
-                          GFP_ATOMIC, node);
-
-       /*
-        * don't overwite if can not get new one
-        * init_copy_kstat_irqs() could still use old one
-        */
-       if (ptr) {
-               printk(KERN_DEBUG "  alloc kstat_irqs on node %d\n", node);
-               desc->kstat_irqs = ptr;
-       }
-}
-
-static void init_one_irq_desc(int irq, struct irq_desc *desc, int node)
-{
-       memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
-
-       raw_spin_lock_init(&desc->lock);
-       desc->irq = irq;
-#ifdef CONFIG_SMP
-       desc->node = node;
-#endif
-       lockdep_set_class(&desc->lock, &irq_desc_lock_class);
-       init_kstat_irqs(desc, node, nr_cpu_ids);
-       if (!desc->kstat_irqs) {
-               printk(KERN_ERR "can not alloc kstat_irqs\n");
-               BUG_ON(1);
-       }
-       if (!alloc_desc_masks(desc, node, false)) {
-               printk(KERN_ERR "can not alloc irq_desc cpumasks\n");
-               BUG_ON(1);
-       }
-       init_desc_masks(desc);
-       arch_init_chip_data(desc, node);
-}
-
-/*
- * Protect the sparse_irqs:
- */
-DEFINE_RAW_SPINLOCK(sparse_irq_lock);
-
-static RADIX_TREE(irq_desc_tree, GFP_ATOMIC);
-
-static void set_irq_desc(unsigned int irq, struct irq_desc *desc)
-{
-       radix_tree_insert(&irq_desc_tree, irq, desc);
-}
-
-struct irq_desc *irq_to_desc(unsigned int irq)
-{
-       return radix_tree_lookup(&irq_desc_tree, irq);
-}
-
-void replace_irq_desc(unsigned int irq, struct irq_desc *desc)
-{
-       void **ptr;
-
-       ptr = radix_tree_lookup_slot(&irq_desc_tree, irq);
-       if (ptr)
-               radix_tree_replace_slot(ptr, desc);
-}
-
-static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = {
-       [0 ... NR_IRQS_LEGACY-1] = {
-               .irq        = -1,
-               .status     = IRQ_DISABLED,
-               .chip       = &no_irq_chip,
-               .handle_irq = handle_bad_irq,
-               .depth      = 1,
-               .lock       = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
-       }
-};
-
-static unsigned int *kstat_irqs_legacy;
-
-int __init early_irq_init(void)
-{
-       struct irq_desc *desc;
-       int legacy_count;
-       int node;
-       int i;
-
-       init_irq_default_affinity();
-
-        /* initialize nr_irqs based on nr_cpu_ids */
-       arch_probe_nr_irqs();
-       printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d\n", NR_IRQS, nr_irqs);
-
-       desc = irq_desc_legacy;
-       legacy_count = ARRAY_SIZE(irq_desc_legacy);
-       node = first_online_node;
-
-       /* allocate based on nr_cpu_ids */
-       kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids *
-                                         sizeof(int), GFP_NOWAIT, node);
-
-       for (i = 0; i < legacy_count; i++) {
-               desc[i].irq = i;
-#ifdef CONFIG_SMP
-               desc[i].node = node;
-#endif
-               desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids;
-               lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
-               alloc_desc_masks(&desc[i], node, true);
-               init_desc_masks(&desc[i]);
-               set_irq_desc(i, &desc[i]);
-       }
-
-       return arch_early_irq_init();
-}
-
-struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
-{
-       struct irq_desc *desc;
-       unsigned long flags;
-
-       if (irq >= nr_irqs) {
-               WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n",
-                       irq, nr_irqs);
-               return NULL;
-       }
-
-       desc = irq_to_desc(irq);
-       if (desc)
-               return desc;
-
-       raw_spin_lock_irqsave(&sparse_irq_lock, flags);
-
-       /* We have to check it to avoid races with another CPU */
-       desc = irq_to_desc(irq);
-       if (desc)
-               goto out_unlock;
-
-       desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
-
-       printk(KERN_DEBUG "  alloc irq_desc for %d on node %d\n", irq, node);
-       if (!desc) {
-               printk(KERN_ERR "can not alloc irq_desc\n");
-               BUG_ON(1);
-       }
-       init_one_irq_desc(irq, desc, node);
-
-       set_irq_desc(irq, desc);
-
-out_unlock:
-       raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
-
-       return desc;
-}
-
-#else /* !CONFIG_SPARSE_IRQ */
-
-struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
-       [0 ... NR_IRQS-1] = {
-               .status = IRQ_DISABLED,
-               .chip = &no_irq_chip,
-               .handle_irq = handle_bad_irq,
-               .depth = 1,
-               .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock),
-       }
-};
-
-static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS];
-int __init early_irq_init(void)
-{
-       struct irq_desc *desc;
-       int count;
-       int i;
-
-       init_irq_default_affinity();
-
-       printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS);
-
-       desc = irq_desc;
-       count = ARRAY_SIZE(irq_desc);
-
-       for (i = 0; i < count; i++) {
-               desc[i].irq = i;
-               alloc_desc_masks(&desc[i], 0, true);
-               init_desc_masks(&desc[i]);
-               desc[i].kstat_irqs = kstat_irqs_all[i];
-       }
-       return arch_early_irq_init();
-}
-
-struct irq_desc *irq_to_desc(unsigned int irq)
-{
-       return (irq < NR_IRQS) ? irq_desc + irq : NULL;
-}
-
-struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node)
-{
-       return irq_to_desc(irq);
-}
-#endif /* !CONFIG_SPARSE_IRQ */
-
-void clear_kstat_irqs(struct irq_desc *desc)
-{
-       memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs)));
-}
-
-/*
- * What should we do if we get a hw irq event on an illegal vector?
- * Each architecture has to answer this themself.
- */
-static void ack_bad(unsigned int irq)
-{
-       struct irq_desc *desc = irq_to_desc(irq);
-
-       print_irq_desc(irq, desc);
-       ack_bad_irq(irq);
-}
-
-/*
- * NOP functions
- */
-static void noop(unsigned int irq)
-{
-}
-
-static unsigned int noop_ret(unsigned int irq)
-{
-       return 0;
-}
-
-/*
- * Generic no controller implementation
- */
-struct irq_chip no_irq_chip = {
-       .name           = "none",
-       .startup        = noop_ret,
-       .shutdown       = noop,
-       .enable         = noop,
-       .disable        = noop,
-       .ack            = ack_bad,
-       .end            = noop,
-};
-
-/*
- * Generic dummy implementation which can be used for
- * real dumb interrupt sources
- */
-struct irq_chip dummy_irq_chip = {
-       .name           = "dummy",
-       .startup        = noop_ret,
-       .shutdown       = noop,
-       .enable         = noop,
-       .disable        = noop,
-       .ack            = noop,
-       .mask           = noop,
-       .unmask         = noop,
-       .end            = noop,
-};
-
  /*
   * Special, empty irq handler:
   */
@@ -457,20 +150,20 @@ unsigned int __do_IRQ(unsigned int irq)
                 /*
                  * No locking required for CPU-local interrupts:
                  */
-               if (desc->chip->ack)
-                       desc->chip->ack(irq);
+               if (desc->irq_data.chip->ack)
+                       desc->irq_data.chip->ack(irq);
                 if (likely(!(desc->status & IRQ_DISABLED))) {
                         action_ret = handle_IRQ_event(irq, desc->action);
                         if (!noirqdebug)
                                 note_interrupt(irq, desc, action_ret);
                 }
-               desc->chip->end(irq);
+               desc->irq_data.chip->end(irq);
                 return 1;
         }
  
         raw_spin_lock(&desc->lock);
-       if (desc->chip->ack)
-               desc->chip->ack(irq);
+       if (desc->irq_data.chip->ack)
+               desc->irq_data.chip->ack(irq);
         /*
          * REPLAY is when Linux resends an IRQ that was dropped earlier
          * WAITING is used by probe to mark irqs that are being tested
@@ -530,27 +223,9 @@ out:
          * The ->end() handler has to deal with interrupts which got
          * disabled while the handler was running.
          */
-       desc->chip->end(irq);
+       desc->irq_data.chip->end(irq);
         raw_spin_unlock(&desc->lock);
  
         return 1;
  }
  #endif
-
-void early_init_irq_lock_class(void)
-{
-       struct irq_desc *desc;
-       int i;
-
-       for_each_irq_desc(i, desc) {
-               lockdep_set_class(&desc->lock, &irq_desc_lock_class);
-       }
-}
-
-unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
-{
-       struct irq_desc *desc = irq_to_desc(irq);
-       return desc ? desc->kstat_irqs[cpu] : 0;
-}
-EXPORT_SYMBOL(kstat_irqs_cpu);
-
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h

index c63f3bc..4571ae7 100644 (file)
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -1,9 +1,12 @@
  /*
   * IRQ subsystem internal functions and variables:
   */
+#include <linux/irqdesc.h>
  
  extern int noirqdebug;
  
+#define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data)
+
  /* Set default functions for irq_chip structures: */
  extern void irq_chip_set_defaults(struct irq_chip *chip);
  
@@ -15,21 +18,19 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
  extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);
  extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
  
-extern struct lock_class_key irq_desc_lock_class;
  extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
-extern void clear_kstat_irqs(struct irq_desc *desc);
-extern raw_spinlock_t sparse_irq_lock;
  
-#ifdef CONFIG_SPARSE_IRQ
-void replace_irq_desc(unsigned int irq, struct irq_desc *desc);
-#endif
+/* Resending of interrupts :*/
+void check_irq_resend(struct irq_desc *desc, unsigned int irq);
  
  #ifdef CONFIG_PROC_FS
  extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
+extern void unregister_irq_proc(unsigned int irq, struct irq_desc *desc);
  extern void register_handler_proc(unsigned int irq, struct irqaction *action);
  extern void unregister_handler_proc(unsigned int irq, struct irqaction *action);
  #else
  static inline void register_irq_proc(unsigned int irq, struct irq_desc *desc) { }
+static inline void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) { }
  static inline void register_handler_proc(unsigned int irq,
                                          struct irqaction *action) { }
  static inline void unregister_handler_proc(unsigned int irq,
@@ -40,17 +41,27 @@ extern int irq_select_affinity_usr(unsigned int irq);
  
  extern void irq_set_thread_affinity(struct irq_desc *desc);
  
+#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
+/*
+ * Call the chip's ->end() callback for @irq, but only when a chip is
+ * installed and it actually provides ->end().
+ */
+static inline void irq_end(unsigned int irq, struct irq_desc *desc)
+{
+       if (desc->irq_data.chip && desc->irq_data.chip->end)
+               desc->irq_data.chip->end(irq);
+}
+#else
+/* CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED is set: irq_end() is a no-op. */
+static inline void irq_end(unsigned int irq, struct irq_desc *desc) { }
+#endif
+
  /* Inline functions for support of irq chips on slow busses */
-static inline void chip_bus_lock(unsigned int irq, struct irq_desc *desc)
+static inline void chip_bus_lock(struct irq_desc *desc)
  {
-       if (unlikely(desc->chip->bus_lock))
-               desc->chip->bus_lock(irq);
+       if (unlikely(desc->irq_data.chip->irq_bus_lock))
+               desc->irq_data.chip->irq_bus_lock(&desc->irq_data);
  }
  
-static inline void chip_bus_sync_unlock(unsigned int irq, struct irq_desc *desc)
+static inline void chip_bus_sync_unlock(struct irq_desc *desc)
  {
-       if (unlikely(desc->chip->bus_sync_unlock))
-               desc->chip->bus_sync_unlock(irq);
+       if (unlikely(desc->irq_data.chip->irq_bus_sync_unlock))
+               desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data);
  }
  
  /*
@@ -67,8 +78,8 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
                 irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled);
         printk("->handle_irq():  %p, ", desc->handle_irq);
         print_symbol("%s\n", (unsigned long)desc->handle_irq);
-       printk("->chip(): %p, ", desc->chip);
-       print_symbol("%s\n", (unsigned long)desc->chip);
+       printk("->irq_data.chip(): %p, ", desc->irq_data.chip);
+       print_symbol("%s\n", (unsigned long)desc->irq_data.chip);
         printk("->action(): %p\n", desc->action);
         if (desc->action) {
                 printk("->action->handler(): %p, ", desc->action->handler);
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c

new file mode 100644 (file)

index 0000000..9d917ff
--- /dev/null
+++ b/kernel/irq/irqdesc.c
@@ -0,0 +1,395 @@
+/*
+ * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
+ * Copyright (C) 2005-2006, Thomas Gleixner, Russell King
+ *
+ * This file contains the interrupt descriptor management code
+ *
+ * Detailed information is available in Documentation/DocBook/genericirq
+ *
+ */
+#include <linux/irq.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
+#include <linux/radix-tree.h>
+#include <linux/bitmap.h>
+
+#include "internals.h"
+
+/*
+ * lockdep: we want to handle all irq_desc locks as a single lock-class:
+ */
+static struct lock_class_key irq_desc_lock_class;
+
+#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
+/*
+ * Allocate irq_default_affinity and set every cpu bit in it.
+ *
+ * NOTE(review): the alloc_cpumask_var() result is not checked before
+ * cpumask_setall() uses the mask -- confirm the allocation cannot fail
+ * at this point of boot.
+ */
+static void __init init_irq_default_affinity(void)
+{
+       alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
+       cpumask_setall(irq_default_affinity);
+}
+#else
+/* Nothing to set up in this configuration. */
+static void __init init_irq_default_affinity(void)
+{
+}
+#endif
+
+#ifdef CONFIG_SMP
+/*
+ * Allocate the zeroed cpumasks of @desc on @node: the affinity mask and,
+ * with CONFIG_GENERIC_PENDING_IRQ, the pending mask.
+ *
+ * Returns 0 on success or -ENOMEM; on failure no mask is left allocated.
+ */
+static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node)
+{
+       if (!zalloc_cpumask_var_node(&desc->irq_data.affinity, gfp, node))
+               return -ENOMEM;
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+       if (zalloc_cpumask_var_node(&desc->pending_mask, gfp, node))
+               return 0;
+
+       /* Second mask failed: give the affinity mask back before bailing. */
+       free_cpumask_var(desc->irq_data.affinity);
+       return -ENOMEM;
+#else
+       return 0;
+#endif
+}
+
+/*
+ * Set the SMP related defaults of @desc: record @node, copy
+ * irq_default_affinity into the affinity mask and, with
+ * CONFIG_GENERIC_PENDING_IRQ, clear the pending mask.
+ */
+static void desc_smp_init(struct irq_desc *desc, int node)
+{
+       desc->irq_data.node = node;
+       cpumask_copy(desc->irq_data.affinity, irq_default_affinity);
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+       cpumask_clear(desc->pending_mask);
+#endif
+}
+
+/* Return the node recorded in @desc by desc_smp_init(). */
+static inline int desc_node(struct irq_desc *desc)
+{
+       return desc->irq_data.node;
+}
+
+#else
+/*
+ * !CONFIG_SMP stubs: no masks are allocated or initialized here and the
+ * node is reported as 0.
+ */
+static inline int
+alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { return 0; }
+static inline void desc_smp_init(struct irq_desc *desc, int node) { }
+static inline int desc_node(struct irq_desc *desc) { return 0; }
+#endif
+
+/*
+ * Put @desc into its default state for @irq: no_irq_chip as chip, no
+ * chip/handler/msi data, IRQ_DEFAULT_INIT_FLAGS as status,
+ * handle_bad_irq as flow handler, depth 1, zeroed counters, no name and
+ * cleared per-cpu statistics. The SMP defaults for @node are applied last.
+ *
+ * desc->kstat_irqs must already point to nr_cpu_ids counters.
+ */
+static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
+{
+       desc->irq_data.irq = irq;
+       desc->irq_data.chip = &no_irq_chip;
+       desc->irq_data.chip_data = NULL;
+       desc->irq_data.handler_data = NULL;
+       desc->irq_data.msi_desc = NULL;
+       desc->status = IRQ_DEFAULT_INIT_FLAGS;
+       desc->handle_irq = handle_bad_irq;
+       desc->depth = 1;
+       desc->irq_count = 0;
+       desc->irqs_unhandled = 0;
+       desc->name = NULL;
+       memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs)));
+       desc_smp_init(desc, node);
+}
+
+/*
+ * Upper bound used for all irq number range checks and bitmap searches
+ * in this file. Starts out as NR_IRQS.
+ */
+int nr_irqs = NR_IRQS;
+EXPORT_SYMBOL_GPL(nr_irqs);
+
+/*
+ * sparse_irq_lock is taken around updates of allocated_irqs and, with
+ * CONFIG_SPARSE_IRQ, around insertion into / removal from the descriptor
+ * radix tree. allocated_irqs has one bit per irq number; a set bit marks
+ * the number as allocated or reserved.
+ */
+static DEFINE_MUTEX(sparse_irq_lock);
+static DECLARE_BITMAP(allocated_irqs, NR_IRQS);
+
+#ifdef CONFIG_SPARSE_IRQ
+
+/* Maps irq number -> struct irq_desc for CONFIG_SPARSE_IRQ. */
+static RADIX_TREE(irq_desc_tree, GFP_KERNEL);
+
+/*
+ * Store @desc under key @irq in irq_desc_tree.
+ *
+ * NOTE(review): the radix_tree_insert() result is dropped here, so a
+ * failed insertion would go unnoticed -- confirm this is acceptable.
+ */
+static void irq_insert_desc(unsigned int irq, struct irq_desc *desc)
+{
+       radix_tree_insert(&irq_desc_tree, irq, desc);
+}
+
+/* Return whatever irq_desc_tree holds for @irq (radix tree lookup). */
+struct irq_desc *irq_to_desc(unsigned int irq)
+{
+       return radix_tree_lookup(&irq_desc_tree, irq);
+}
+
+/* Remove the entry for @irq from irq_desc_tree; the descriptor is not freed. */
+static void delete_irq_desc(unsigned int irq)
+{
+       radix_tree_delete(&irq_desc_tree, irq);
+}
+
+#ifdef CONFIG_SMP
+/*
+ * Release the cpumasks of @desc: the pending mask (only with
+ * CONFIG_GENERIC_PENDING_IRQ) and the affinity mask.
+ */
+static void free_masks(struct irq_desc *desc)
+{
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+       free_cpumask_var(desc->pending_mask);
+#endif
+       free_cpumask_var(desc->irq_data.affinity);
+}
+#else
+/* !CONFIG_SMP: there are no masks to release. */
+static inline void free_masks(struct irq_desc *desc) { }
+#endif
+
+/*
+ * Allocate and initialize one descriptor for @irq on @node.
+ *
+ * Allocates the descriptor, its per-cpu statistics array (nr_cpu_ids
+ * entries) and its cpumasks, initializes the lock and applies the
+ * defaults. The descriptor is NOT inserted into irq_desc_tree here.
+ *
+ * Returns the new descriptor, or NULL if any allocation failed (partial
+ * allocations are released again).
+ */
+static struct irq_desc *alloc_desc(int irq, int node)
+{
+       struct irq_desc *desc;
+       gfp_t gfp = GFP_KERNEL;
+
+       desc = kzalloc_node(sizeof(*desc), gfp, node);
+       if (!desc)
+               return NULL;
+       /* allocate based on nr_cpu_ids */
+       desc->kstat_irqs = kzalloc_node(nr_cpu_ids * sizeof(*desc->kstat_irqs),
+                                        gfp, node);
+       if (!desc->kstat_irqs)
+               goto err_desc;
+
+       if (alloc_masks(desc, gfp, node))
+               goto err_kstat;
+
+       raw_spin_lock_init(&desc->lock);
+       lockdep_set_class(&desc->lock, &irq_desc_lock_class);
+
+       /* needs kstat_irqs and the masks, so it must come after both */
+       desc_set_defaults(irq, desc, node);
+
+       return desc;
+
+err_kstat:
+       kfree(desc->kstat_irqs);
+err_desc:
+       kfree(desc);
+       return NULL;
+}
+
+/*
+ * Tear down the descriptor of @irq: remove its proc entries, unlink it
+ * from irq_desc_tree under sparse_irq_lock and release its masks,
+ * statistics array and the descriptor itself.
+ *
+ * Numbers that have no descriptor (e.g. ones only marked through
+ * irq_reserve_irqs(), which allocates nothing) are ignored instead of
+ * dereferencing a NULL descriptor.
+ */
+static void free_desc(unsigned int irq)
+{
+       struct irq_desc *desc = irq_to_desc(irq);
+
+       /* Nothing was ever allocated for this number: nothing to free. */
+       if (!desc)
+               return;
+
+       unregister_irq_proc(irq, desc);
+
+       mutex_lock(&sparse_irq_lock);
+       delete_irq_desc(irq);
+       mutex_unlock(&sparse_irq_lock);
+
+       free_masks(desc);
+       kfree(desc->kstat_irqs);
+       kfree(desc);
+}
+
+/*
+ * Allocate @cnt descriptors for the irq numbers @start .. @start+@cnt-1
+ * on @node and insert each into irq_desc_tree under sparse_irq_lock.
+ *
+ * Returns @start on success. On allocation failure every descriptor
+ * created so far is freed again, the range is cleared in allocated_irqs
+ * and -ENOMEM is returned.
+ */
+static int alloc_descs(unsigned int start, unsigned int cnt, int node)
+{
+       struct irq_desc *desc;
+       int i;
+
+       for (i = 0; i < cnt; i++) {
+               desc = alloc_desc(start + i, node);
+               if (!desc)
+                       goto err;
+               mutex_lock(&sparse_irq_lock);
+               irq_insert_desc(start + i, desc);
+               mutex_unlock(&sparse_irq_lock);
+       }
+       return start;
+
+err:
+       /* i is the index that failed; unwind the ones before it */
+       for (i--; i >= 0; i--)
+               free_desc(start + i);
+
+       mutex_lock(&sparse_irq_lock);
+       bitmap_clear(allocated_irqs, start, cnt);
+       mutex_unlock(&sparse_irq_lock);
+       return -ENOMEM;
+}
+
+/*
+ * Return the descriptor for @irq, allocating it on @node first when the
+ * number is not allocated yet. An already allocated number (-EEXIST) is
+ * treated like a fresh allocation; any other failure yields NULL.
+ */
+struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
+{
+       int rc = irq_alloc_descs(irq, irq, 1, node);
+
+       if (rc != -EEXIST && rc != irq)
+               return NULL;
+
+       return irq_to_desc(irq);
+}
+
+/*
+ * Early setup for CONFIG_SPARSE_IRQ: initialize the default affinity,
+ * ask the architecture how many descriptors to preallocate, then
+ * allocate, mark and insert descriptors 0 .. initcnt-1 on the first
+ * online node before handing over to arch_early_irq_init().
+ *
+ * NOTE(review): the alloc_desc() result is not checked; on failure the
+ * number is still marked in allocated_irqs and NULL is passed to
+ * irq_insert_desc() -- confirm this is the intended degraded state.
+ */
+int __init early_irq_init(void)
+{
+       int i, initcnt, node = first_online_node;
+       struct irq_desc *desc;
+
+       init_irq_default_affinity();
+
+       /* Let arch update nr_irqs and return the nr of preallocated irqs */
+       initcnt = arch_probe_nr_irqs();
+       printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d %d\n", NR_IRQS, nr_irqs, initcnt);
+
+       for (i = 0; i < initcnt; i++) {
+               desc = alloc_desc(i, node);
+               set_bit(i, allocated_irqs);
+               irq_insert_desc(i, desc);
+       }
+       return arch_early_irq_init();
+}
+
+#else /* !CONFIG_SPARSE_IRQ */
+
+/*
+ * !CONFIG_SPARSE_IRQ: all NR_IRQS descriptors live in this static array.
+ * Each entry starts with the default status flags, handle_bad_irq as flow
+ * handler, depth 1 and an unlocked lock; early_irq_init() fills in the rest.
+ */
+struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
+       [0 ... NR_IRQS-1] = {
+               .status         = IRQ_DEFAULT_INIT_FLAGS,
+               .handle_irq     = handle_bad_irq,
+               .depth          = 1,
+               .lock           = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock),
+       }
+};
+
+/* Static backing store for the per-cpu statistics of every descriptor. */
+static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS];
+/*
+ * Early setup for !CONFIG_SPARSE_IRQ: initialize the default affinity,
+ * then complete every entry of the static irq_desc[] array (irq number,
+ * no_irq_chip, statistics row, cpumasks, SMP defaults, lockdep class)
+ * before handing over to arch_early_irq_init().
+ *
+ * NOTE(review): the alloc_masks() result is ignored; desc_smp_init()
+ * then uses the masks unconditionally -- confirm allocation cannot fail
+ * here.
+ */
+int __init early_irq_init(void)
+{
+       int count, i, node = first_online_node;
+       struct irq_desc *desc;
+
+       init_irq_default_affinity();
+
+       printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS);
+
+       desc = irq_desc;
+       count = ARRAY_SIZE(irq_desc);
+
+       for (i = 0; i < count; i++) {
+               desc[i].irq_data.irq = i;
+               desc[i].irq_data.chip = &no_irq_chip;
+               desc[i].kstat_irqs = kstat_irqs_all[i];
+               alloc_masks(desc + i, GFP_KERNEL, node);
+               desc_smp_init(desc + i, node);
+               lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
+       }
+       return arch_early_irq_init();
+}
+
+/* Return the static descriptor of @irq, or NULL when @irq >= NR_IRQS. */
+struct irq_desc *irq_to_desc(unsigned int irq)
+{
+       if (irq >= NR_IRQS)
+               return NULL;
+
+       return &irq_desc[irq];
+}
+
+/*
+ * !CONFIG_SPARSE_IRQ: descriptors are static, so nothing is allocated
+ * and @node is unused; this is a plain irq_to_desc() lookup.
+ */
+struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node)
+{
+       return irq_to_desc(irq);
+}
+
+/*
+ * !CONFIG_SPARSE_IRQ: a static descriptor is not released, it is only
+ * reset to its defaults through dynamic_irq_cleanup().
+ */
+static void free_desc(unsigned int irq)
+{
+       dynamic_irq_cleanup(irq);
+}
+
+/*
+ * !CONFIG_SPARSE_IRQ: the descriptors already exist in irq_desc[], so
+ * there is nothing to allocate; @cnt and @node are unused and @start is
+ * returned unchanged.
+ */
+static inline int alloc_descs(unsigned int start, unsigned int cnt, int node)
+{
+       return start;
+}
+#endif /* !CONFIG_SPARSE_IRQ */
+
+/* Dynamic interrupt handling */
+
+/**
+ * irq_free_descs - free irq descriptors
+ * @from:      Start of descriptor range
+ * @cnt:       Number of consecutive irqs to free
+ *
+ * Frees the descriptors of @from .. @from + @cnt - 1 and clears the
+ * range in the allocation bitmap. A range that does not lie completely
+ * below nr_irqs is silently ignored.
+ */
+void irq_free_descs(unsigned int from, unsigned int cnt)
+{
+       unsigned int i;
+
+       /*
+        * Compare @cnt against the room left above @from rather than
+        * testing from + cnt, which can wrap around in unsigned
+        * arithmetic and let an oversized range pass the check.
+        */
+       if (from >= nr_irqs || cnt > nr_irqs - from)
+               return;
+
+       for (i = 0; i < cnt; i++)
+               free_desc(from + i);
+
+       mutex_lock(&sparse_irq_lock);
+       bitmap_clear(allocated_irqs, from, cnt);
+       mutex_unlock(&sparse_irq_lock);
+}
+
+/**
+ * irq_alloc_descs - allocate and initialize a range of irq descriptors
+ * @irq:       Allocate for specific irq number if irq >= 0
+ * @from:      Start the search from this irq number
+ * @cnt:       Number of consecutive irqs to allocate.
+ * @node:      Preferred node on which the irq descriptor should be allocated
+ *
+ * When @irq >= 0 the allocated range has to start exactly at @irq. A
+ * @from below @irq is raised to @irq, so that a free range below @irq is
+ * not found first and misreported as a conflict.
+ *
+ * Returns the first irq number or error code: -EINVAL if @cnt is 0,
+ * -EEXIST if @irq was requested but no free range starts there, -ENOMEM
+ * if no free range exists below nr_irqs or allocating the descriptors
+ * failed.
+ */
+int __ref
+irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node)
+{
+       int start, ret;
+
+       if (!cnt)
+               return -EINVAL;
+
+       /* A specific irq was requested: never search below it. */
+       if (irq >= 0 && from < irq)
+               from = irq;
+
+       mutex_lock(&sparse_irq_lock);
+
+       start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0);
+       ret = -EEXIST;
+       if (irq >= 0 && start != irq)
+               goto err;
+
+       ret = -ENOMEM;
+       if (start >= nr_irqs)
+               goto err;
+
+       /* Claim the range before dropping the lock; alloc_descs() may sleep. */
+       bitmap_set(allocated_irqs, start, cnt);
+       mutex_unlock(&sparse_irq_lock);
+       return alloc_descs(start, cnt, node);
+
+err:
+       mutex_unlock(&sparse_irq_lock);
+       return ret;
+}
+
+/**
+ * irq_reserve_irqs - mark irqs allocated
+ * @from:      mark from irq number
+ * @cnt:       number of irqs to mark
+ *
+ * Only the allocation bitmap is updated; no descriptors are allocated.
+ *
+ * Returns 0 on success or an appropriate error code: -EINVAL if @cnt is
+ * 0 or the range does not lie completely below nr_irqs, -EEXIST if part
+ * of the range is already marked.
+ */
+int irq_reserve_irqs(unsigned int from, unsigned int cnt)
+{
+       unsigned int start;
+       int ret = 0;
+
+       /*
+        * Compare @cnt against the room left above @from rather than
+        * testing from + cnt, which can wrap around in unsigned
+        * arithmetic and let an oversized range pass the check.
+        */
+       if (!cnt || from >= nr_irqs || cnt > nr_irqs - from)
+               return -EINVAL;
+
+       mutex_lock(&sparse_irq_lock);
+       start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0);
+       if (start == from)
+               bitmap_set(allocated_irqs, start, cnt);
+       else
+               ret = -EEXIST;
+       mutex_unlock(&sparse_irq_lock);
+       return ret;
+}
+
+/**
+ * irq_get_next_irq - get next allocated irq number
+ * @offset:    where to start the search
+ *
+ * Returns the first allocated irq number at or after @offset (the search
+ * includes @offset itself), or nr_irqs if none is found. The allocation
+ * bitmap is read without taking sparse_irq_lock.
+ */
+unsigned int irq_get_next_irq(unsigned int offset)
+{
+       return find_next_bit(allocated_irqs, nr_irqs, offset);
+}
+
+/**
+ * dynamic_irq_cleanup - cleanup a dynamically allocated irq
+ * @irq:       irq number to clean up
+ *
+ * Resets the descriptor of @irq to its defaults under the descriptor
+ * lock, keeping the node it was set up for. An @irq without a
+ * descriptor is ignored.
+ */
+void dynamic_irq_cleanup(unsigned int irq)
+{
+       struct irq_desc *desc = irq_to_desc(irq);
+       unsigned long flags;
+
+       /* irq_to_desc() yields NULL for numbers without a descriptor */
+       if (!desc)
+               return;
+
+       raw_spin_lock_irqsave(&desc->lock, flags);
+       desc_set_defaults(irq, desc, desc_node(desc));
+       raw_spin_unlock_irqrestore(&desc->lock, flags);
+}
+
+/*
+ * Return the interrupt count recorded for @irq on @cpu, or 0 when @irq
+ * has no descriptor. @cpu is used as an index without a range check.
+ *
+ * NOTE(review): the copy of this function removed from handle.c was
+ * followed by EXPORT_SYMBOL(kstat_irqs_cpu); confirm no modular user
+ * still needs the export.
+ */
+unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
+{
+       struct irq_desc *desc = irq_to_desc(irq);
+
+       if (!desc)
+               return 0;
+
+       return desc->kstat_irqs[cpu];
+}
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c

index c3003e9..644e8d5 100644 (file)
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -73,8 +73,8 @@ int irq_can_set_affinity(unsigned int irq)
  {
         struct irq_desc *desc = irq_to_desc(irq);
  
-       if (CHECK_IRQ_PER_CPU(desc->status) || !desc->chip ||
-           !desc->chip->set_affinity)
+       if (CHECK_IRQ_PER_CPU(desc->status) || !desc->irq_data.chip ||
+           !desc->irq_data.chip->irq_set_affinity)
                 return 0;
  
         return 1;
@@ -109,17 +109,18 @@ void irq_set_thread_affinity(struct irq_desc *desc)
  int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
  {
         struct irq_desc *desc = irq_to_desc(irq);
+       struct irq_chip *chip = desc->irq_data.chip;
         unsigned long flags;
  
-       if (!desc->chip->set_affinity)
+       if (!chip->irq_set_affinity)
                 return -EINVAL;
  
         raw_spin_lock_irqsave(&desc->lock, flags);
  
  #ifdef CONFIG_GENERIC_PENDING_IRQ
         if (desc->status & IRQ_MOVE_PCNTXT) {
-               if (!desc->chip->set_affinity(irq, cpumask)) {
-                       cpumask_copy(desc->affinity, cpumask);
+               if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) {
+                       cpumask_copy(desc->irq_data.affinity, cpumask);
                         irq_set_thread_affinity(desc);
                 }
         }
@@ -128,8 +129,8 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
                 cpumask_copy(desc->pending_mask, cpumask);
         }
  #else
-       if (!desc->chip->set_affinity(irq, cpumask)) {
-               cpumask_copy(desc->affinity, cpumask);
+       if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) {
+               cpumask_copy(desc->irq_data.affinity, cpumask);
                 irq_set_thread_affinity(desc);
         }
  #endif
@@ -168,16 +169,16 @@ static int setup_affinity(unsigned int irq, struct irq_desc *desc)
          * one of the targets is online.
          */
         if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) {
-               if (cpumask_any_and(desc->affinity, cpu_online_mask)
+               if (cpumask_any_and(desc->irq_data.affinity, cpu_online_mask)
                     < nr_cpu_ids)
                         goto set_affinity;
                 else
                         desc->status &= ~IRQ_AFFINITY_SET;
         }
  
-       cpumask_and(desc->affinity, cpu_online_mask, irq_default_affinity);
+       cpumask_and(desc->irq_data.affinity, cpu_online_mask, irq_default_affinity);
  set_affinity:
-       desc->chip->set_affinity(irq, desc->affinity);
+       desc->irq_data.chip->irq_set_affinity(&desc->irq_data, desc->irq_data.affinity, false);
  
         return 0;
  }
@@ -223,7 +224,7 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
  
         if (!desc->depth++) {
                 desc->status |= IRQ_DISABLED;
-               desc->chip->disable(irq);
+               desc->irq_data.chip->irq_disable(&desc->irq_data);
         }
  }
  
@@ -246,11 +247,11 @@ void disable_irq_nosync(unsigned int irq)
         if (!desc)
                 return;
  
-       chip_bus_lock(irq, desc);
+       chip_bus_lock(desc);
         raw_spin_lock_irqsave(&desc->lock, flags);
         __disable_irq(desc, irq, false);
         raw_spin_unlock_irqrestore(&desc->lock, flags);
-       chip_bus_sync_unlock(irq, desc);
+       chip_bus_sync_unlock(desc);
  }
  EXPORT_SYMBOL(disable_irq_nosync);
  
@@ -313,7 +314,7 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
   *     IRQ line is re-enabled.
   *
   *     This function may be called from IRQ context only when
- *     desc->chip->bus_lock and desc->chip->bus_sync_unlock are NULL !
+ *     desc->irq_data.chip->irq_bus_lock and irq_bus_sync_unlock are NULL !
   */
  void enable_irq(unsigned int irq)
  {
@@ -323,11 +324,11 @@ void enable_irq(unsigned int irq)
         if (!desc)
                 return;
  
-       chip_bus_lock(irq, desc);
+       chip_bus_lock(desc);
         raw_spin_lock_irqsave(&desc->lock, flags);
         __enable_irq(desc, irq, false);
         raw_spin_unlock_irqrestore(&desc->lock, flags);
-       chip_bus_sync_unlock(irq, desc);
+       chip_bus_sync_unlock(desc);
  }
  EXPORT_SYMBOL(enable_irq);
  
@@ -336,8 +337,8 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
         struct irq_desc *desc = irq_to_desc(irq);
         int ret = -ENXIO;
  
-       if (desc->chip->set_wake)
-               ret = desc->chip->set_wake(irq, on);
+       if (desc->irq_data.chip->irq_set_wake)
+               ret = desc->irq_data.chip->irq_set_wake(&desc->irq_data, on);
  
         return ret;
  }
@@ -429,12 +430,12 @@ void compat_irq_chip_set_default_handler(struct irq_desc *desc)
  }
  
  int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
-               unsigned long flags)
+                     unsigned long flags)
  {
         int ret;
-       struct irq_chip *chip = desc->chip;
+       struct irq_chip *chip = desc->irq_data.chip;
  
-       if (!chip || !chip->set_type) {
+       if (!chip || !chip->irq_set_type) {
                 /*
                  * IRQF_TRIGGER_* but the PIC does not support multiple
                  * flow-types?
@@ -445,11 +446,11 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
         }
  
         /* caller masked out all except trigger mode flags */
-       ret = chip->set_type(irq, flags);
+       ret = chip->irq_set_type(&desc->irq_data, flags);
  
         if (ret)
-               pr_err("setting trigger mode %d for irq %u failed (%pF)\n",
-                               (int)flags, irq, chip->set_type);
+               pr_err("setting trigger mode %lu for irq %u failed (%pF)\n",
+                      flags, irq, chip->irq_set_type);
         else {
                 if (flags & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH))
                         flags |= IRQ_LEVEL;
@@ -457,8 +458,8 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
                 desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK);
                 desc->status |= flags;
  
-               if (chip != desc->chip)
-                       irq_chip_set_defaults(desc->chip);
+               if (chip != desc->irq_data.chip)
+                       irq_chip_set_defaults(desc->irq_data.chip);
         }
  
         return ret;
@@ -507,7 +508,7 @@ static int irq_wait_for_interrupt(struct irqaction *action)
  static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc)
  {
  again:
-       chip_bus_lock(irq, desc);
+       chip_bus_lock(desc);
         raw_spin_lock_irq(&desc->lock);
  
         /*
@@ -521,17 +522,17 @@ again:
          */
         if (unlikely(desc->status & IRQ_INPROGRESS)) {
                 raw_spin_unlock_irq(&desc->lock);
-               chip_bus_sync_unlock(irq, desc);
+               chip_bus_sync_unlock(desc);
                 cpu_relax();
                 goto again;
         }
  
         if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) {
                 desc->status &= ~IRQ_MASKED;
-               desc->chip->unmask(irq);
+               desc->irq_data.chip->irq_unmask(&desc->irq_data);
         }
         raw_spin_unlock_irq(&desc->lock);
-       chip_bus_sync_unlock(irq, desc);
+       chip_bus_sync_unlock(desc);
  }
  
  #ifdef CONFIG_SMP
@@ -556,7 +557,7 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
         }
  
         raw_spin_lock_irq(&desc->lock);
-       cpumask_copy(mask, desc->affinity);
+       cpumask_copy(mask, desc->irq_data.affinity);
         raw_spin_unlock_irq(&desc->lock);
  
         set_cpus_allowed_ptr(current, mask);
@@ -657,7 +658,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
         if (!desc)
                 return -EINVAL;
  
-       if (desc->chip == &no_irq_chip)
+       if (desc->irq_data.chip == &no_irq_chip)
                 return -ENOSYS;
         /*
          * Some drivers like serial.c use request_irq() heavily,
@@ -752,7 +753,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
         }
  
         if (!shared) {
-               irq_chip_set_defaults(desc->chip);
+               irq_chip_set_defaults(desc->irq_data.chip);
  
                 init_waitqueue_head(&desc->wait_for_threads);
  
@@ -779,7 +780,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
                 if (!(desc->status & IRQ_NOAUTOEN)) {
                         desc->depth = 0;
                         desc->status &= ~IRQ_DISABLED;
-                       desc->chip->startup(irq);
+                       desc->irq_data.chip->irq_startup(&desc->irq_data);
                 } else
                         /* Undo nested disables: */
                         desc->depth = 1;
@@ -912,17 +913,17 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
  
         /* Currently used only by UML, might disappear one day: */
  #ifdef CONFIG_IRQ_RELEASE_METHOD
-       if (desc->chip->release)
-               desc->chip->release(irq, dev_id);
+       if (desc->irq_data.chip->release)
+               desc->irq_data.chip->release(irq, dev_id);
  #endif
  
         /* If this was the last handler, shut down the IRQ line: */
         if (!desc->action) {
                 desc->status |= IRQ_DISABLED;
-               if (desc->chip->shutdown)
-                       desc->chip->shutdown(irq);
+               if (desc->irq_data.chip->irq_shutdown)
+                       desc->irq_data.chip->irq_shutdown(&desc->irq_data);
                 else
-                       desc->chip->disable(irq);
+                       desc->irq_data.chip->irq_disable(&desc->irq_data);
         }
  
  #ifdef CONFIG_SMP
@@ -997,9 +998,9 @@ void free_irq(unsigned int irq, void *dev_id)
         if (!desc)
                 return;
  
-       chip_bus_lock(irq, desc);
+       chip_bus_lock(desc);
         kfree(__free_irq(irq, dev_id));
-       chip_bus_sync_unlock(irq, desc);
+       chip_bus_sync_unlock(desc);
  }
  EXPORT_SYMBOL(free_irq);
  
@@ -1086,9 +1087,9 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
         action->name = devname;
         action->dev_id = dev_id;
  
-       chip_bus_lock(irq, desc);
+       chip_bus_lock(desc);
         retval = __setup_irq(irq, desc, action);
-       chip_bus_sync_unlock(irq, desc);
+       chip_bus_sync_unlock(desc);
  
         if (retval)
                 kfree(action);
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c

index 2419622..1d25419 100644 (file)
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -7,6 +7,7 @@
  void move_masked_irq(int irq)
  {
         struct irq_desc *desc = irq_to_desc(irq);
+       struct irq_chip *chip = desc->irq_data.chip;
  
         if (likely(!(desc->status & IRQ_MOVE_PENDING)))
                 return;
@@ -24,7 +25,7 @@ void move_masked_irq(int irq)
         if (unlikely(cpumask_empty(desc->pending_mask)))
                 return;
  
-       if (!desc->chip->set_affinity)
+       if (!chip->irq_set_affinity)
                 return;
  
         assert_raw_spin_locked(&desc->lock);
@@ -43,8 +44,9 @@ void move_masked_irq(int irq)
          */
         if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask)
                    < nr_cpu_ids))
-               if (!desc->chip->set_affinity(irq, desc->pending_mask)) {
-                       cpumask_copy(desc->affinity, desc->pending_mask);
+               if (!chip->irq_set_affinity(&desc->irq_data,
+                                           desc->pending_mask, false)) {
+                       cpumask_copy(desc->irq_data.affinity, desc->pending_mask);
                         irq_set_thread_affinity(desc);
                 }
  
@@ -61,8 +63,8 @@ void move_native_irq(int irq)
         if (unlikely(desc->status & IRQ_DISABLED))
                 return;
  
-       desc->chip->mask(irq);
+       desc->irq_data.chip->irq_mask(&desc->irq_data);
         move_masked_irq(irq);
-       desc->chip->unmask(irq);
+       desc->irq_data.chip->irq_unmask(&desc->irq_data);
  }
  
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c

deleted file mode 100644 (file)

index 65d3845..0000000
--- a/kernel/irq/numa_migrate.c
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * NUMA irq-desc migration code
- *
- * Migrate IRQ data structures (irq_desc, chip_data, etc.) over to
- * the new "home node" of the IRQ.
- */
-
-#include <linux/irq.h>
-#include <linux/slab.h>
-#include <linux/module.h>
-#include <linux/random.h>
-#include <linux/interrupt.h>
-#include <linux/kernel_stat.h>
-
-#include "internals.h"
-
-static void init_copy_kstat_irqs(struct irq_desc *old_desc,
-                                struct irq_desc *desc,
-                                int node, int nr)
-{
-       init_kstat_irqs(desc, node, nr);
-
-       if (desc->kstat_irqs != old_desc->kstat_irqs)
-               memcpy(desc->kstat_irqs, old_desc->kstat_irqs,
-                        nr * sizeof(*desc->kstat_irqs));
-}
-
-static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)
-{
-       if (old_desc->kstat_irqs == desc->kstat_irqs)
-               return;
-
-       kfree(old_desc->kstat_irqs);
-       old_desc->kstat_irqs = NULL;
-}
-
-static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
-                struct irq_desc *desc, int node)
-{
-       memcpy(desc, old_desc, sizeof(struct irq_desc));
-       if (!alloc_desc_masks(desc, node, false)) {
-               printk(KERN_ERR "irq %d: can not get new irq_desc cpumask "
-                               "for migration.\n", irq);
-               return false;
-       }
-       raw_spin_lock_init(&desc->lock);
-       desc->node = node;
-       lockdep_set_class(&desc->lock, &irq_desc_lock_class);
-       init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids);
-       init_copy_desc_masks(old_desc, desc);
-       arch_init_copy_chip_data(old_desc, desc, node);
-       return true;
-}
-
-static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
-{
-       free_kstat_irqs(old_desc, desc);
-       free_desc_masks(old_desc, desc);
-       arch_free_chip_data(old_desc, desc);
-}
-
-static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
-                                               int node)
-{
-       struct irq_desc *desc;
-       unsigned int irq;
-       unsigned long flags;
-
-       irq = old_desc->irq;
-
-       raw_spin_lock_irqsave(&sparse_irq_lock, flags);
-
-       /* We have to check it to avoid races with another CPU */
-       desc = irq_to_desc(irq);
-
-       if (desc && old_desc != desc)
-               goto out_unlock;
-
-       desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
-       if (!desc) {
-               printk(KERN_ERR "irq %d: can not get new irq_desc "
-                               "for migration.\n", irq);
-               /* still use old one */
-               desc = old_desc;
-               goto out_unlock;
-       }
-       if (!init_copy_one_irq_desc(irq, old_desc, desc, node)) {
-               /* still use old one */
-               kfree(desc);
-               desc = old_desc;
-               goto out_unlock;
-       }
-
-       replace_irq_desc(irq, desc);
-       raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
-
-       /* free the old one */
-       free_one_irq_desc(old_desc, desc);
-       kfree(old_desc);
-
-       return desc;
-
-out_unlock:
-       raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
-
-       return desc;
-}
-
-struct irq_desc *move_irq_desc(struct irq_desc *desc, int node)
-{
-       /* those static or target node is -1, do not move them */
-       if (desc->irq < NR_IRQS_LEGACY || node == -1)
-               return desc;
-
-       if (desc->node != node)
-               desc = __real_move_irq_desc(desc, node);
-
-       return desc;
-}
-
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c

index 09a2ee5..01b1d3a 100644 (file)
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -21,7 +21,7 @@ static struct proc_dir_entry *root_irq_dir;
  static int irq_affinity_proc_show(struct seq_file *m, void *v)
  {
         struct irq_desc *desc = irq_to_desc((long)m->private);
-       const struct cpumask *mask = desc->affinity;
+       const struct cpumask *mask = desc->irq_data.affinity;
  
  #ifdef CONFIG_GENERIC_PENDING_IRQ
         if (desc->status & IRQ_MOVE_PENDING)
@@ -65,7 +65,7 @@ static ssize_t irq_affinity_proc_write(struct file *file,
         cpumask_var_t new_value;
         int err;
  
-       if (!irq_to_desc(irq)->chip->set_affinity || no_irq_affinity ||
+       if (!irq_to_desc(irq)->irq_data.chip->irq_set_affinity || no_irq_affinity ||
             irq_balancing_disabled(irq))
                 return -EIO;
  
@@ -185,7 +185,7 @@ static int irq_node_proc_show(struct seq_file *m, void *v)
  {
         struct irq_desc *desc = irq_to_desc((long) m->private);
  
-       seq_printf(m, "%d\n", desc->node);
+       seq_printf(m, "%d\n", desc->irq_data.node);
         return 0;
  }
  
@@ -269,7 +269,7 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
  {
         char name [MAX_NAMELEN];
  
-       if (!root_irq_dir || (desc->chip == &no_irq_chip) || desc->dir)
+       if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip) || desc->dir)
                 return;
  
         memset(name, 0, MAX_NAMELEN);
@@ -297,6 +297,24 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
                          &irq_spurious_proc_fops, (void *)(long)irq);
  }
  
+void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)
+{
+       char name [MAX_NAMELEN];
+
+       if (!root_irq_dir || !desc->dir)
+               return;
+#ifdef CONFIG_SMP
+       remove_proc_entry("smp_affinity", desc->dir);
+       remove_proc_entry("affinity_hint", desc->dir);
+       remove_proc_entry("node", desc->dir);
+#endif
+       remove_proc_entry("spurious", desc->dir);
+
+       memset(name, 0, MAX_NAMELEN);
+       sprintf(name, "%u", irq);
+       remove_proc_entry(name, root_irq_dir);
+}
+
  #undef MAX_NAMELEN
  
  void unregister_handler_proc(unsigned int irq, struct irqaction *action)
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c

index 090c376..891115a 100644 (file)
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -60,7 +60,7 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq)
         /*
          * Make sure the interrupt is enabled, before resending it:
          */
-       desc->chip->enable(irq);
+       desc->irq_data.chip->irq_enable(&desc->irq_data);
  
         /*
          * We do not resend level type interrupts. Level type
@@ -70,7 +70,8 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq)
         if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
                 desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY;
  
-               if (!desc->chip->retrigger || !desc->chip->retrigger(irq)) {
+               if (!desc->irq_data.chip->irq_retrigger ||
+                   !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) {
  #ifdef CONFIG_HARDIRQS_SW_RESEND
                         /* Set it pending and activate the softirq: */
                         set_bit(irq, irqs_resend);
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c

index 89fb90a..3089d3b 100644 (file)
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -14,6 +14,8 @@
  #include <linux/moduleparam.h>
  #include <linux/timer.h>
  
+#include "internals.h"
+
  static int irqfixup __read_mostly;
  
  #define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10)
@@ -78,8 +80,8 @@ static int try_one_irq(int irq, struct irq_desc *desc)
          * If we did actual work for the real IRQ line we must let the
          * IRQ controller clean up too
          */
-       if (work && desc->chip && desc->chip->end)
-               desc->chip->end(irq);
+       if (work)
+               irq_end(irq, desc);
         raw_spin_unlock(&desc->lock);
  
         return ok;
@@ -254,7 +256,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
                 printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
                 desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED;
                 desc->depth++;
-               desc->chip->disable(irq);
+               desc->irq_data.chip->irq_disable(&desc->irq_data);
  
                 mod_timer(&poll_spurious_irq_timer,
                           jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
diff --git a/kernel/lockdep.c b/kernel/lockdep.c

index f2852a5..42ba65d 100644 (file)
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -639,6 +639,16 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
         }
  #endif
  
+       if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
+               debug_locks_off();
+               printk(KERN_ERR
+                       "BUG: looking up invalid subclass: %u\n", subclass);
+               printk(KERN_ERR
+                       "turning off the locking correctness validator.\n");
+               dump_stack();
+               return NULL;
+       }
+
         /*
          * Static locks do not have their class-keys yet - for them the key
          * is the lock object itself:
@@ -774,7 +784,9 @@ out_unlock_set:
         raw_local_irq_restore(flags);
  
         if (!subclass || force)
-               lock->class_cache = class;
+               lock->class_cache[0] = class;
+       else if (subclass < NR_LOCKDEP_CACHING_CLASSES)
+               lock->class_cache[subclass] = class;
  
         if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass))
                 return NULL;
@@ -2679,7 +2691,11 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
  void lockdep_init_map(struct lockdep_map *lock, const char *name,
                       struct lock_class_key *key, int subclass)
  {
-       lock->class_cache = NULL;
+       int i;
+
+       for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++)
+               lock->class_cache[i] = NULL;
+
  #ifdef CONFIG_LOCK_STAT
         lock->cpu = raw_smp_processor_id();
  #endif
@@ -2739,21 +2755,13 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
         if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
                 return 0;
  
-       if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
-               debug_locks_off();
-               printk("BUG: MAX_LOCKDEP_SUBCLASSES too low!\n");
-               printk("turning off the locking correctness validator.\n");
-               dump_stack();
-               return 0;
-       }
-
         if (lock->key == &__lockdep_no_validate__)
                 check = 1;
  
-       if (!subclass)
-               class = lock->class_cache;
+       if (subclass < NR_LOCKDEP_CACHING_CLASSES)
+               class = lock->class_cache[subclass];
         /*
-        * Not cached yet or subclass?
+        * Not cached?
          */
         if (unlikely(!class)) {
                 class = register_lock_class(lock, subclass, 0);
@@ -2918,7 +2926,7 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
                 return 1;
  
         if (hlock->references) {
-               struct lock_class *class = lock->class_cache;
+               struct lock_class *class = lock->class_cache[0];
  
                 if (!class)
                         class = look_up_lock_class(lock, 0);
@@ -3559,7 +3567,12 @@ void lockdep_reset_lock(struct lockdep_map *lock)
                 if (list_empty(head))
                         continue;
                 list_for_each_entry_safe(class, next, head, hash_entry) {
-                       if (unlikely(class == lock->class_cache)) {
+                       int match = 0;
+
+                       for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++)
+                               match |= class == lock->class_cache[j];
+
+                       if (unlikely(match)) {
                                 if (debug_locks_off_graph_unlock())
                                         WARN_ON(1);
                                 goto out_restore;
@@ -3775,7 +3788,7 @@ EXPORT_SYMBOL_GPL(debug_show_all_locks);
   * Careful: only use this function if you are sure that
   * the task cannot run in parallel!
   */
-void __debug_show_held_locks(struct task_struct *task)
+void debug_show_held_locks(struct task_struct *task)
  {
         if (unlikely(!debug_locks)) {
                 printk("INFO: lockdep is turned off.\n");
@@ -3783,12 +3796,6 @@ void __debug_show_held_locks(struct task_struct *task)
         }
         lockdep_print_held_locks(task);
  }
-EXPORT_SYMBOL_GPL(__debug_show_held_locks);
-
-void debug_show_held_locks(struct task_struct *task)
-{
-               __debug_show_held_locks(task);
-}
  EXPORT_SYMBOL_GPL(debug_show_held_locks);
  
  void lockdep_sys_exit(void)
diff --git a/kernel/pid.c b/kernel/pid.c

index d55c6fb..39b65b6 100644 (file)
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -401,7 +401,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type)
         struct task_struct *result = NULL;
         if (pid) {
                 struct hlist_node *first;
-               first = rcu_dereference_check(pid->tasks[type].first,
+               first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]),
                                               rcu_read_lock_held() ||
                                               lockdep_tasklist_lock_is_held());
                 if (first)
@@ -416,6 +416,7 @@ EXPORT_SYMBOL(pid_task);
   */
  struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
  {
+       rcu_lockdep_assert(rcu_read_lock_held());
         return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
  }
  
diff --git a/kernel/printk.c b/kernel/printk.c

index 8fe465a..2531017 100644 (file)
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -85,7 +85,7 @@ EXPORT_SYMBOL(oops_in_progress);
   * provides serialisation for access to the entire console
   * driver system.
   */
-static DECLARE_MUTEX(console_sem);
+static DEFINE_SEMAPHORE(console_sem);
  struct console *console_drivers;
  EXPORT_SYMBOL_GPL(console_drivers);
  
@@ -556,7 +556,7 @@ static void zap_locks(void)
         /* If a crash is occurring, make sure we can't deadlock */
         spin_lock_init(&logbuf_lock);
         /* And make sure that we print immediately */
-       init_MUTEX(&console_sem);
+       sema_init(&console_sem, 1);
  }
  
  #if defined(CONFIG_PRINTK_TIME)
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c

index 4d16983..a23a57a 100644 (file)
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -73,12 +73,14 @@ int debug_lockdep_rcu_enabled(void)
  EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
  
  /**
- * rcu_read_lock_bh_held - might we be in RCU-bh read-side critical section?
+ * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section?
   *
   * Check for bottom half being disabled, which covers both the
   * CONFIG_PROVE_RCU and not cases.  Note that if someone uses
   * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled)
- * will show the situation.
+ * will show the situation.  This is useful for debug checks in functions
+ * that require that they be called within an RCU read-side critical
+ * section.
   *
   * Check debug_lockdep_rcu_enabled() to prevent false positives during boot.
   */
@@ -86,7 +88,7 @@ int rcu_read_lock_bh_held(void)
  {
         if (!debug_lockdep_rcu_enabled())
                 return 1;
-       return in_softirq();
+       return in_softirq() || irqs_disabled();
  }
  EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
  
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c

index 196ec02..d806735 100644 (file)
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -59,6 +59,14 @@ int rcu_scheduler_active __read_mostly;
  EXPORT_SYMBOL_GPL(rcu_scheduler_active);
  #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
  
+/* Forward declarations for rcutiny_plugin.h. */
+static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp);
+static void __call_rcu(struct rcu_head *head,
+                      void (*func)(struct rcu_head *rcu),
+                      struct rcu_ctrlblk *rcp);
+
+#include "rcutiny_plugin.h"
+
  #ifdef CONFIG_NO_HZ
  
  static long rcu_dynticks_nesting = 1;
@@ -140,6 +148,7 @@ void rcu_check_callbacks(int cpu, int user)
                 rcu_sched_qs(cpu);
         else if (!in_softirq())
                 rcu_bh_qs(cpu);
+       rcu_preempt_check_callbacks();
  }
  
  /*
@@ -162,6 +171,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
         *rcp->donetail = NULL;
         if (rcp->curtail == rcp->donetail)
                 rcp->curtail = &rcp->rcucblist;
+       rcu_preempt_remove_callbacks(rcp);
         rcp->donetail = &rcp->rcucblist;
         local_irq_restore(flags);
  
@@ -182,6 +192,7 @@ static void rcu_process_callbacks(struct softirq_action *unused)
  {
         __rcu_process_callbacks(&rcu_sched_ctrlblk);
         __rcu_process_callbacks(&rcu_bh_ctrlblk);
+       rcu_preempt_process_callbacks();
  }
  
  /*
@@ -223,15 +234,15 @@ static void __call_rcu(struct rcu_head *head,
  }
  
  /*
- * Post an RCU callback to be invoked after the end of an RCU grace
+ * Post an RCU callback to be invoked after the end of an RCU-sched grace
   * period.  But since we have but one CPU, that would be after any
   * quiescent state.
   */
-void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
  {
         __call_rcu(head, func, &rcu_sched_ctrlblk);
  }
-EXPORT_SYMBOL_GPL(call_rcu);
+EXPORT_SYMBOL_GPL(call_rcu_sched);
  
  /*
   * Post an RCU bottom-half callback to be invoked after any subsequent
@@ -243,20 +254,6 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
  }
  EXPORT_SYMBOL_GPL(call_rcu_bh);
  
-void rcu_barrier(void)
-{
-       struct rcu_synchronize rcu;
-
-       init_rcu_head_on_stack(&rcu.head);
-       init_completion(&rcu.completion);
-       /* Will wake me after RCU finished. */
-       call_rcu(&rcu.head, wakeme_after_rcu);
-       /* Wait for it. */
-       wait_for_completion(&rcu.completion);
-       destroy_rcu_head_on_stack(&rcu.head);
-}
-EXPORT_SYMBOL_GPL(rcu_barrier);
-
  void rcu_barrier_bh(void)
  {
         struct rcu_synchronize rcu;
@@ -289,5 +286,3 @@ void __init rcu_init(void)
  {
         open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
  }
-
-#include "rcutiny_plugin.h"
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h

index d223a92..6ceca4f 100644 (file)
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -1,7 +1,7 @@
  /*
- * Read-Copy Update mechanism for mutual exclusion (tree-based version)
+ * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition
   * Internal non-public definitions that provide either classic
- * or preemptable semantics.
+ * or preemptible semantics.
   *
   * This program is free software; you can redistribute it and/or modify
   * it under the terms of the GNU General Public License as published by
@@ -17,11 +17,587 @@
   * along with this program; if not, write to the Free Software
   * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
   *
- * Copyright IBM Corporation, 2009
+ * Copyright (c) 2010 Linaro
   *
   * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
   */
  
+#ifdef CONFIG_TINY_PREEMPT_RCU
+
+#include <linux/delay.h>
+
+/* Global control variables for preemptible RCU. */
+struct rcu_preempt_ctrlblk {
+       struct rcu_ctrlblk rcb; /* curtail: ->next ptr of last CB for GP. */
+       struct rcu_head **nexttail;
+                               /* Tasks blocked in a preemptible RCU */
+                               /*  read-side critical section while an */
+                               /*  preemptible-RCU grace period is in */
+                               /*  progress must wait for a later grace */
+                               /*  period.  This pointer points to the */
+                               /*  ->next pointer of the last task that */
+                               /*  must wait for a later grace period, or */
+                               /*  to &->rcb.rcucblist if there is no */
+                               /*  such task. */
+       struct list_head blkd_tasks;
+                               /* Tasks blocked in RCU read-side critical */
+                               /*  section.  Tasks are placed at the head */
+                               /*  of this list and age towards the tail. */
+       struct list_head *gp_tasks;
+                               /* Pointer to the first task blocking the */
+                               /*  current grace period, or NULL if there */
+                               /*  is no such task. */
+       struct list_head *exp_tasks;
+                               /* Pointer to first task blocking the */
+                               /*  current expedited grace period, or NULL */
+                               /*  if there is no such task.  If there */
+                               /*  is no current expedited grace period, */
+                               /*  then there cannot be any such task. */
+       u8 gpnum;               /* Current grace period. */
+       u8 gpcpu;               /* Last grace period blocked by the CPU. */
+       u8 completed;           /* Last grace period completed. */
+                               /*  If all three are equal, RCU is idle. */
+};
+
+static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
+       .rcb.donetail = &rcu_preempt_ctrlblk.rcb.rcucblist,
+       .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist,
+       .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist,
+       .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks),
+};
+
+static int rcu_preempted_readers_exp(void);
+static void rcu_report_exp_done(void);
+
+/*
+ * Return true if the CPU has not yet responded to the current grace period.
+ */
+static int rcu_cpu_blocking_cur_gp(void)
+{
+       return rcu_preempt_ctrlblk.gpcpu != rcu_preempt_ctrlblk.gpnum;
+}
+
+/*
+ * Check for a running RCU reader.  Because there is only one CPU,
+ * there can be but one running RCU reader at a time.  ;-)
+ */
+static int rcu_preempt_running_reader(void)
+{
+       return current->rcu_read_lock_nesting;
+}
+
+/*
+ * Check for preempted RCU readers blocking any grace period.
+ * If the caller needs a reliable answer, it must disable hard irqs.
+ */
+static int rcu_preempt_blocked_readers_any(void)
+{
+       return !list_empty(&rcu_preempt_ctrlblk.blkd_tasks);
+}
+
+/*
+ * Check for preempted RCU readers blocking the current grace period.
+ * If the caller needs a reliable answer, it must disable hard irqs.
+ */
+static int rcu_preempt_blocked_readers_cgp(void)
+{
+       return rcu_preempt_ctrlblk.gp_tasks != NULL;
+}
+
+/*
+ * Return true if another preemptible-RCU grace period is needed.
+ */
+static int rcu_preempt_needs_another_gp(void)
+{
+       return *rcu_preempt_ctrlblk.rcb.curtail != NULL;
+}
+
+/*
+ * Return true if a preemptible-RCU grace period is in progress.
+ * The caller must disable hardirqs.
+ */
+static int rcu_preempt_gp_in_progress(void)
+{
+       return rcu_preempt_ctrlblk.completed != rcu_preempt_ctrlblk.gpnum;
+}
+
+/*
+ * Record a preemptible-RCU quiescent state for the specified CPU.  Note
+ * that this just means that the task currently running on the CPU is
+ * in a quiescent state.  There might be any number of tasks blocked
+ * while in an RCU read-side critical section.
+ *
+ * Unlike the other rcu_*_qs() functions, callers to this function
+ * must disable irqs in order to protect the assignment to
+ * ->rcu_read_unlock_special.
+ *
+ * Because this is a single-CPU implementation, the only way a grace
+ * period can end is if the CPU is in a quiescent state.  The reason is
+ * that a blocked preemptible-RCU reader can exit its critical section
+ * only if the CPU is running it at the time.  Therefore, when the
+ * last task blocking the current grace period exits its RCU read-side
+ * critical section, neither the CPU nor blocked tasks will be stopping
+ * the current grace period.  (In contrast, SMP implementations
+ * might have CPUs running in RCU read-side critical sections that
+ * block later grace periods -- but this is not possible given only
+ * one CPU.)
+ */
+static void rcu_preempt_cpu_qs(void)
+{
+       /* Record both CPU and task as having responded to current GP. */
+       rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum;
+       current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
+
+       /*
+        * If there is no GP, or if blocked readers are still blocking GP,
+        * then there is nothing more to do.
+        */
+       if (!rcu_preempt_gp_in_progress() || rcu_preempt_blocked_readers_cgp())
+               return;
+
+       /* Advance callbacks. */
+       rcu_preempt_ctrlblk.completed = rcu_preempt_ctrlblk.gpnum;
+       rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.rcb.curtail;
+       rcu_preempt_ctrlblk.rcb.curtail = rcu_preempt_ctrlblk.nexttail;
+
+       /* If there are no blocked readers, next GP is done instantly. */
+       if (!rcu_preempt_blocked_readers_any())
+               rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail;
+
+       /* If there are done callbacks, make RCU_SOFTIRQ process them. */
+       if (*rcu_preempt_ctrlblk.rcb.donetail != NULL)
+               raise_softirq(RCU_SOFTIRQ);
+}
+
+/*
+ * Start a new RCU grace period if warranted.  Hard irqs must be disabled.
+ */
+static void rcu_preempt_start_gp(void)
+{
+       if (!rcu_preempt_gp_in_progress() && rcu_preempt_needs_another_gp()) {
+
+               /* Official start of GP. */
+               rcu_preempt_ctrlblk.gpnum++;
+
+               /* Any blocked RCU readers block new GP. */
+               if (rcu_preempt_blocked_readers_any())
+                       rcu_preempt_ctrlblk.gp_tasks =
+                               rcu_preempt_ctrlblk.blkd_tasks.next;
+
+               /* If there is no running reader, CPU is done with GP. */
+               if (!rcu_preempt_running_reader())
+                       rcu_preempt_cpu_qs();
+       }
+}
+
+/*
+ * We have entered the scheduler, and the current task might soon be
+ * context-switched away from.  If this task is in an RCU read-side
+ * critical section, we will no longer be able to rely on the CPU to
+ * record that fact, so we enqueue the task on the blkd_tasks list.
+ * If the task started after the current grace period began, as recorded
+ * by ->gpcpu, we enqueue at the beginning of the list.  Otherwise
+ * before the element referenced by ->gp_tasks (or at the tail if
+ * ->gp_tasks is NULL) and point ->gp_tasks at the newly added element.
+ * The task will dequeue itself when it exits the outermost enclosing
+ * RCU read-side critical section.  Therefore, the current grace period
+ * cannot be permitted to complete until the ->gp_tasks pointer becomes
+ * NULL.
+ *
+ * Caller must disable preemption.
+ */
+void rcu_preempt_note_context_switch(void)
+{
+       struct task_struct *t = current;
+       unsigned long flags;
+
+       local_irq_save(flags); /* must exclude scheduler_tick(). */
+       if (rcu_preempt_running_reader() &&
+           (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
+
+               /* Possibly blocking in an RCU read-side critical section. */
+               t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
+
+               /*
+                * If this CPU has already checked in, then this task
+                * will hold up the next grace period rather than the
+                * current grace period.  Queue the task accordingly.
+                * If the task is queued for the current grace period
+                * (i.e., this CPU has not yet passed through a quiescent
+                * state for the current grace period), then as long
+                * as that task remains queued, the current grace period
+                * cannot end.
+                */
+               list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks);
+               if (rcu_cpu_blocking_cur_gp())
+                       rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry;
+       }
+
+       /*
+        * Either we were not in an RCU read-side critical section to
+        * begin with, or we have now recorded that critical section
+        * globally.  Either way, we can now note a quiescent state
+        * for this CPU.  Again, if we were in an RCU read-side critical
+        * section, and if that critical section was blocking the current
+        * grace period, then the fact that the task has been enqueued
+        * means that current grace period continues to be blocked.
+        */
+       rcu_preempt_cpu_qs();
+       local_irq_restore(flags);
+}
+
+/*
+ * Tiny-preemptible RCU implementation for rcu_read_lock().
+ * Just increment ->rcu_read_lock_nesting, shared state will be updated
+ * if we block.
+ */
+void __rcu_read_lock(void)
+{
+       current->rcu_read_lock_nesting++;
+       barrier();  /* needed if we ever invoke rcu_read_lock in rcutiny.c */
+}
+EXPORT_SYMBOL_GPL(__rcu_read_lock);
+
+/*
+ * Handle special cases during rcu_read_unlock(), such as needing to
+ * notify RCU core processing or task having blocked during the RCU
+ * read-side critical section.
+ */
+static void rcu_read_unlock_special(struct task_struct *t)
+{
+       int empty;
+       int empty_exp;
+       unsigned long flags;
+       struct list_head *np;
+       int special;
+
+       /*
+        * NMI handlers cannot block and cannot safely manipulate state.
+        * They therefore cannot possibly be special, so just leave.
+        */
+       if (in_nmi())
+               return;
+
+       local_irq_save(flags);
+
+       /*
+        * If RCU core is waiting for this CPU to exit critical section,
+        * let it know that we have done so.
+        */
+       special = t->rcu_read_unlock_special;
+       if (special & RCU_READ_UNLOCK_NEED_QS)
+               rcu_preempt_cpu_qs();
+
+       /* Hardware IRQ handlers cannot block. */
+       if (in_irq()) {
+               local_irq_restore(flags);
+               return;
+       }
+
+       /* Clean up if blocked during RCU read-side critical section. */
+       if (special & RCU_READ_UNLOCK_BLOCKED) {
+               t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED;
+
+               /*
+                * Remove this task from the ->blkd_tasks list and adjust
+                * any pointers that might have been referencing it.
+                */
+               empty = !rcu_preempt_blocked_readers_cgp();
+               empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL;
+               np = t->rcu_node_entry.next;
+               if (np == &rcu_preempt_ctrlblk.blkd_tasks)
+                       np = NULL;
+               list_del(&t->rcu_node_entry);
+               if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks)
+                       rcu_preempt_ctrlblk.gp_tasks = np;
+               if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks)
+                       rcu_preempt_ctrlblk.exp_tasks = np;
+               INIT_LIST_HEAD(&t->rcu_node_entry);
+
+               /*
+                * If this was the last task on the current list, and if
+                * we aren't waiting on the CPU, report the quiescent state
+                * and start a new grace period if needed.
+                */
+               if (!empty && !rcu_preempt_blocked_readers_cgp()) {
+                       rcu_preempt_cpu_qs();
+                       rcu_preempt_start_gp();
+               }
+
+               /*
+                * If this was the last task on the expedited lists,
+                * then we need to wake up the waiting task.
+                */
+               if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL)
+                       rcu_report_exp_done();
+       }
+       local_irq_restore(flags);
+}
+
+/*
+ * Tiny-preemptible RCU implementation for rcu_read_unlock().
+ * Decrement ->rcu_read_lock_nesting.  If the result is zero (outermost
+ * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
+ * invoke rcu_read_unlock_special() to clean up after a context switch
+ * in an RCU read-side critical section and other special cases.
+ */
+void __rcu_read_unlock(void)
+{
+       struct task_struct *t = current;
+
+       barrier();  /* needed if we ever invoke rcu_read_unlock in rcutiny.c */
+       --t->rcu_read_lock_nesting;
+       barrier();  /* decrement before load of ->rcu_read_unlock_special */
+       if (t->rcu_read_lock_nesting == 0 &&
+           unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
+               rcu_read_unlock_special(t);
+#ifdef CONFIG_PROVE_LOCKING
+       WARN_ON_ONCE(t->rcu_read_lock_nesting < 0);
+#endif /* #ifdef CONFIG_PROVE_LOCKING */
+}
+EXPORT_SYMBOL_GPL(__rcu_read_unlock);
+
+/*
+ * Check for a quiescent state from the current CPU.  When a task blocks,
+ * the task is recorded in the rcu_preempt_ctrlblk structure, which is
+ * checked elsewhere.  This is called from the scheduling-clock interrupt.
+ *
+ * Caller must disable hard irqs.
+ */
+static void rcu_preempt_check_callbacks(void)
+{
+       struct task_struct *t = current;
+
+       if (rcu_preempt_gp_in_progress() &&
+           (!rcu_preempt_running_reader() ||
+            !rcu_cpu_blocking_cur_gp()))
+               rcu_preempt_cpu_qs();
+       if (&rcu_preempt_ctrlblk.rcb.rcucblist !=
+           rcu_preempt_ctrlblk.rcb.donetail)
+               raise_softirq(RCU_SOFTIRQ);
+       if (rcu_preempt_gp_in_progress() &&
+           rcu_cpu_blocking_cur_gp() &&
+           rcu_preempt_running_reader())
+               t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
+}
+
+/*
+ * TINY_PREEMPT_RCU has an extra callback-list tail pointer to
+ * update, so this is invoked from __rcu_process_callbacks() to
+ * handle that case.  Of course, it is invoked for all flavors of
+ * RCU, but RCU callbacks can appear only on one of the lists, and
+ * neither ->nexttail nor ->donetail can possibly be NULL, so there
+ * is no need for an explicit check.
+ */
+static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
+{
+       if (rcu_preempt_ctrlblk.nexttail == rcp->donetail)
+               rcu_preempt_ctrlblk.nexttail = &rcp->rcucblist;
+}
+
+/*
+ * Process callbacks for preemptible RCU.
+ */
+static void rcu_preempt_process_callbacks(void)
+{
+       __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
+}
+
+/*
+ * Queue a preemptible-RCU callback for invocation after a grace period.
+ */
+void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+{
+       unsigned long flags;
+
+       debug_rcu_head_queue(head);
+       head->func = func;
+       head->next = NULL;
+
+       local_irq_save(flags);
+       *rcu_preempt_ctrlblk.nexttail = head;
+       rcu_preempt_ctrlblk.nexttail = &head->next;
+       rcu_preempt_start_gp();  /* checks to see if GP needed. */
+       local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(call_rcu);
+
+void rcu_barrier(void)
+{
+       struct rcu_synchronize rcu;
+
+       init_rcu_head_on_stack(&rcu.head);
+       init_completion(&rcu.completion);
+       /* Will wake me after RCU finished. */
+       call_rcu(&rcu.head, wakeme_after_rcu);
+       /* Wait for it. */
+       wait_for_completion(&rcu.completion);
+       destroy_rcu_head_on_stack(&rcu.head);
+}
+EXPORT_SYMBOL_GPL(rcu_barrier);
+
+/*
+ * synchronize_rcu - wait until a grace period has elapsed.
+ *
+ * Control will return to the caller some time after a full grace
+ * period has elapsed, in other words after all currently executing RCU
+ * read-side critical sections have completed.  RCU read-side critical
+ * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
+ * and may be nested.
+ */
+void synchronize_rcu(void)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+       if (!rcu_scheduler_active)
+               return;
+#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+
+       WARN_ON_ONCE(rcu_preempt_running_reader());
+       if (!rcu_preempt_blocked_readers_any())
+               return;
+
+       /* Once we get past the fastpath checks, same code as rcu_barrier(). */
+       rcu_barrier();
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu);
+
+static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
+static unsigned long sync_rcu_preempt_exp_count;
+static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
+
+/*
+ * Return non-zero if there are any tasks in RCU read-side critical
+ * sections blocking the current preemptible-RCU expedited grace period.
+ * If there is no preemptible-RCU expedited grace period currently in
+ * progress, returns zero unconditionally.
+ */
+static int rcu_preempted_readers_exp(void)
+{
+       return rcu_preempt_ctrlblk.exp_tasks != NULL;
+}
+
+/*
+ * Report the exit from RCU read-side critical section for the last task
+ * that queued itself during or before the current expedited preemptible-RCU
+ * grace period.
+ */
+static void rcu_report_exp_done(void)
+{
+       wake_up(&sync_rcu_preempt_exp_wq);
+}
+
+/*
+ * Wait for an rcu-preempt grace period, but expedite it.  The basic idea
+ * is to rely on the fact that there is but one CPU, and that it is
+ * illegal for a task to invoke synchronize_rcu_expedited() while in a
+ * preemptible-RCU read-side critical section.  Therefore, any such
+ * critical sections must correspond to blocked tasks, which must therefore
+ * be on the ->blkd_tasks list.  So just record the current head of the
+ * list in the ->exp_tasks pointer, and wait for all tasks including and
+ * after the task pointed to by ->exp_tasks to drain.
+ */
+void synchronize_rcu_expedited(void)
+{
+       unsigned long flags;
+       struct rcu_preempt_ctrlblk *rpcp = &rcu_preempt_ctrlblk;
+       unsigned long snap;
+
+       barrier(); /* ensure prior action seen before grace period. */
+
+       WARN_ON_ONCE(rcu_preempt_running_reader());
+
+       /*
+        * Acquire lock so that there is only one preemptible RCU grace
+        * period in flight.  Of course, if someone does the expedited
+        * grace period for us while we are acquiring the lock, just leave.
+        */
+       snap = sync_rcu_preempt_exp_count + 1;
+       mutex_lock(&sync_rcu_preempt_exp_mutex);
+       if (ULONG_CMP_LT(snap, sync_rcu_preempt_exp_count))
+               goto unlock_mb_ret; /* Others did our work for us. */
+
+       local_irq_save(flags);
+
+       /*
+        * All RCU readers have to already be on blkd_tasks because
+        * we cannot legally be executing in an RCU read-side critical
+        * section.
+        */
+
+       /* Snapshot current head of ->blkd_tasks list. */
+       rpcp->exp_tasks = rpcp->blkd_tasks.next;
+       if (rpcp->exp_tasks == &rpcp->blkd_tasks)
+               rpcp->exp_tasks = NULL;
+       local_irq_restore(flags);
+
+       /* Wait for tail of ->blkd_tasks list to drain. */
+       if (rcu_preempted_readers_exp())
+               wait_event(sync_rcu_preempt_exp_wq,
+                          !rcu_preempted_readers_exp());
+
+       /* Clean up and exit. */
+       barrier(); /* ensure expedited GP seen before counter increment. */
+       sync_rcu_preempt_exp_count++;
+unlock_mb_ret:
+       mutex_unlock(&sync_rcu_preempt_exp_mutex);
+       barrier(); /* ensure subsequent action seen after grace period. */
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
+
+/*
+ * Does preemptible RCU need the CPU to stay out of dynticks mode?
+ */
+int rcu_preempt_needs_cpu(void)
+{
+       if (!rcu_preempt_running_reader())
+               rcu_preempt_cpu_qs();
+       return rcu_preempt_ctrlblk.rcb.rcucblist != NULL;
+}
+
+/*
+ * Check for a task exiting while in a preemptible-RCU read-side
+ * critical section, clean up if so.  No need to issue warnings,
+ * as debug_check_no_locks_held() already does this if lockdep
+ * is enabled.
+ */
+void exit_rcu(void)
+{
+       struct task_struct *t = current;
+
+       if (t->rcu_read_lock_nesting == 0)
+               return;
+       t->rcu_read_lock_nesting = 1;
+       rcu_read_unlock();
+}
+
+#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
+
+/*
+ * Because preemptible RCU does not exist, it never has any callbacks
+ * to check.
+ */
+static void rcu_preempt_check_callbacks(void)
+{
+}
+
+/*
+ * Because preemptible RCU does not exist, it never has any callbacks
+ * to remove.
+ */
+static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
+{
+}
+
+/*
+ * Because preemptible RCU does not exist, it never has any callbacks
+ * to process.
+ */
+static void rcu_preempt_process_callbacks(void)
+{
+}
+
+#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */
+
  #ifdef CONFIG_DEBUG_LOCK_ALLOC
  
  #include <linux/kernel_stat.h>
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c

index 2e2726d..9d8e8fb 100644 (file)
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -120,7 +120,7 @@ struct rcu_torture {
  };
  
  static LIST_HEAD(rcu_torture_freelist);
-static struct rcu_torture *rcu_torture_current;
+static struct rcu_torture __rcu *rcu_torture_current;
  static long rcu_torture_current_version;
  static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
  static DEFINE_SPINLOCK(rcu_torture_lock);
@@ -153,8 +153,10 @@ int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
  #define FULLSTOP_SHUTDOWN 1    /* System shutdown with rcutorture running. */
  #define FULLSTOP_RMMOD    2    /* Normal rmmod of rcutorture. */
  static int fullstop = FULLSTOP_RMMOD;
-DEFINE_MUTEX(fullstop_mutex);  /* Protect fullstop transitions and spawning */
-                               /*  of kthreads. */
+/*
+ * Protect fullstop transitions and spawning of kthreads.
+ */
+static DEFINE_MUTEX(fullstop_mutex);
  
  /*
   * Detect and respond to a system shutdown.
@@ -303,6 +305,10 @@ static void rcu_read_delay(struct rcu_random_state *rrsp)
                 mdelay(longdelay_ms);
         if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us)))
                 udelay(shortdelay_us);
+#ifdef CONFIG_PREEMPT
+       if (!preempt_count() && !(rcu_random(rrsp) % (nrealreaders * 20000)))
+               preempt_schedule();  /* No QS if preempt_disable() in effect */
+#endif
  }
  
  static void rcu_torture_read_unlock(int idx) __releases(RCU)
@@ -536,6 +542,8 @@ static void srcu_read_delay(struct rcu_random_state *rrsp)
         delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick);
         if (!delay)
                 schedule_timeout_interruptible(longdelay);
+       else
+               rcu_read_delay(rrsp);
  }
  
  static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl)
@@ -731,7 +739,8 @@ rcu_torture_writer(void *arg)
                         continue;
                 rp->rtort_pipe_count = 0;
                 udelay(rcu_random(&rand) & 0x3ff);
-               old_rp = rcu_torture_current;
+               old_rp = rcu_dereference_check(rcu_torture_current,
+                                              current == writer_task);
                 rp->rtort_mbtest = 1;
                 rcu_assign_pointer(rcu_torture_current, rp);
                 smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */
diff --git a/kernel/rcutree.c b/kernel/rcutree.c

index d5bc439..ccdc04c 100644 (file)
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -143,6 +143,11 @@ module_param(blimit, int, 0);
  module_param(qhimark, int, 0);
  module_param(qlowmark, int, 0);
  
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+int rcu_cpu_stall_suppress __read_mostly = RCU_CPU_STALL_SUPPRESS_INIT;
+module_param(rcu_cpu_stall_suppress, int, 0644);
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+
  static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
  static int rcu_pending(int cpu);
  
@@ -450,7 +455,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
  
  #ifdef CONFIG_RCU_CPU_STALL_DETECTOR
  
-int rcu_cpu_stall_panicking __read_mostly;
+int rcu_cpu_stall_suppress __read_mostly;
  
  static void record_gp_stall_check_time(struct rcu_state *rsp)
  {
@@ -482,8 +487,11 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
         rcu_print_task_stall(rnp);
         raw_spin_unlock_irqrestore(&rnp->lock, flags);
  
-       /* OK, time to rat on our buddy... */
-
+       /*
+        * OK, time to rat on our buddy...
+        * See Documentation/RCU/stallwarn.txt for info on how to debug
+        * RCU CPU stall warnings.
+        */
         printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {",
                rsp->name);
         rcu_for_each_leaf_node(rsp, rnp) {
@@ -512,6 +520,11 @@ static void print_cpu_stall(struct rcu_state *rsp)
         unsigned long flags;
         struct rcu_node *rnp = rcu_get_root(rsp);
  
+       /*
+        * OK, time to rat on ourselves...
+        * See Documentation/RCU/stallwarn.txt for info on how to debug
+        * RCU CPU stall warnings.
+        */
         printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n",
                rsp->name, smp_processor_id(), jiffies - rsp->gp_start);
         trigger_all_cpu_backtrace();
@@ -530,11 +543,11 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
         long delta;
         struct rcu_node *rnp;
  
-       if (rcu_cpu_stall_panicking)
+       if (rcu_cpu_stall_suppress)
                 return;
-       delta = jiffies - rsp->jiffies_stall;
+       delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall);
         rnp = rdp->mynode;
-       if ((rnp->qsmask & rdp->grpmask) && delta >= 0) {
+       if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && delta >= 0) {
  
                 /* We haven't checked in, so go dump stack. */
                 print_cpu_stall(rsp);
@@ -548,10 +561,26 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
  
  static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
  {
-       rcu_cpu_stall_panicking = 1;
+       rcu_cpu_stall_suppress = 1;
         return NOTIFY_DONE;
  }
  
+/**
+ * rcu_cpu_stall_reset - prevent further stall warnings in current grace period
+ *
+ * Set the stall-warning timeout way off into the future, thus preventing
+ * any RCU CPU stall-warning messages from appearing in the current set of
+ * RCU grace periods.
+ *
+ * The caller must disable hard irqs.
+ */
+void rcu_cpu_stall_reset(void)
+{
+       rcu_sched_state.jiffies_stall = jiffies + ULONG_MAX / 2;
+       rcu_bh_state.jiffies_stall = jiffies + ULONG_MAX / 2;
+       rcu_preempt_stall_reset();
+}
+
  static struct notifier_block rcu_panic_block = {
         .notifier_call = rcu_panic,
  };
@@ -571,6 +600,10 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
  {
  }
  
+void rcu_cpu_stall_reset(void)
+{
+}
+
  static void __init check_cpu_stall_init(void)
  {
  }
@@ -712,7 +745,7 @@ static void
  rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
         __releases(rcu_get_root(rsp)->lock)
  {
-       struct rcu_data *rdp = rsp->rda[smp_processor_id()];
+       struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
         struct rcu_node *rnp = rcu_get_root(rsp);
  
         if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) {
@@ -960,7 +993,7 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
  static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
  {
         int i;
-       struct rcu_data *rdp = rsp->rda[smp_processor_id()];
+       struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
  
         if (rdp->nxtlist == NULL)
                 return;  /* irqs disabled, so comparison is stable. */
@@ -971,6 +1004,7 @@ static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
         for (i = 0; i < RCU_NEXT_SIZE; i++)
                 rdp->nxttail[i] = &rdp->nxtlist;
         rsp->orphan_qlen += rdp->qlen;
+       rdp->n_cbs_orphaned += rdp->qlen;
         rdp->qlen = 0;
         raw_spin_unlock(&rsp->onofflock);  /* irqs remain disabled. */
  }
@@ -984,7 +1018,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
         struct rcu_data *rdp;
  
         raw_spin_lock_irqsave(&rsp->onofflock, flags);
-       rdp = rsp->rda[smp_processor_id()];
+       rdp = this_cpu_ptr(rsp->rda);
         if (rsp->orphan_cbs_list == NULL) {
                 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
                 return;
@@ -992,6 +1026,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
         *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list;
         rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail;
         rdp->qlen += rsp->orphan_qlen;
+       rdp->n_cbs_adopted += rsp->orphan_qlen;
         rsp->orphan_cbs_list = NULL;
         rsp->orphan_cbs_tail = &rsp->orphan_cbs_list;
         rsp->orphan_qlen = 0;
@@ -1007,7 +1042,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
         unsigned long flags;
         unsigned long mask;
         int need_report = 0;
-       struct rcu_data *rdp = rsp->rda[cpu];
+       struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
         struct rcu_node *rnp;
  
         /* Exclude any attempts to start a new grace period. */
@@ -1123,6 +1158,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
  
         /* Update count, and requeue any remaining callbacks. */
         rdp->qlen -= count;
+       rdp->n_cbs_invoked += count;
         if (list != NULL) {
                 *tail = rdp->nxtlist;
                 rdp->nxtlist = list;
@@ -1226,7 +1262,8 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
                 cpu = rnp->grplo;
                 bit = 1;
                 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
-                       if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu]))
+                       if ((rnp->qsmask & bit) != 0 &&
+                           f(per_cpu_ptr(rsp->rda, cpu)))
                                 mask |= bit;
                 }
                 if (mask != 0) {
@@ -1402,7 +1439,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
          * a quiescent state betweentimes.
          */
         local_irq_save(flags);
-       rdp = rsp->rda[smp_processor_id()];
+       rdp = this_cpu_ptr(rsp->rda);
         rcu_process_gp_end(rsp, rdp);
         check_for_new_grace_period(rsp, rdp);
  
@@ -1701,7 +1738,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
  {
         unsigned long flags;
         int i;
-       struct rcu_data *rdp = rsp->rda[cpu];
+       struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
         struct rcu_node *rnp = rcu_get_root(rsp);
  
         /* Set up local state, ensuring consistent view of global state. */
@@ -1729,7 +1766,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
  {
         unsigned long flags;
         unsigned long mask;
-       struct rcu_data *rdp = rsp->rda[cpu];
+       struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
         struct rcu_node *rnp = rcu_get_root(rsp);
  
         /* Set up local state, ensuring consistent view of global state. */
@@ -1865,7 +1902,8 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
  /*
   * Helper function for rcu_init() that initializes one rcu_state structure.
   */
-static void __init rcu_init_one(struct rcu_state *rsp)
+static void __init rcu_init_one(struct rcu_state *rsp,
+               struct rcu_data __percpu *rda)
  {
         static char *buf[] = { "rcu_node_level_0",
                                "rcu_node_level_1",
@@ -1918,37 +1956,23 @@ static void __init rcu_init_one(struct rcu_state *rsp)
                 }
         }
  
+       rsp->rda = rda;
         rnp = rsp->level[NUM_RCU_LVLS - 1];
         for_each_possible_cpu(i) {
                 while (i > rnp->grphi)
                         rnp++;
-               rsp->rda[i]->mynode = rnp;
+               per_cpu_ptr(rsp->rda, i)->mynode = rnp;
                 rcu_boot_init_percpu_data(i, rsp);
         }
  }
  
-/*
- * Helper macro for __rcu_init() and __rcu_init_preempt().  To be used
- * nowhere else!  Assigns leaf node pointers into each CPU's rcu_data
- * structure.
- */
-#define RCU_INIT_FLAVOR(rsp, rcu_data) \
-do { \
-       int i; \
-       \
-       for_each_possible_cpu(i) { \
-               (rsp)->rda[i] = &per_cpu(rcu_data, i); \
-       } \
-       rcu_init_one(rsp); \
-} while (0)
-
  void __init rcu_init(void)
  {
         int cpu;
  
         rcu_bootup_announce();
-       RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data);
-       RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data);
+       rcu_init_one(&rcu_sched_state, &rcu_sched_data);
+       rcu_init_one(&rcu_bh_state, &rcu_bh_data);
         __rcu_init_preempt();
         open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
  
diff --git a/kernel/rcutree.h b/kernel/rcutree.h

index 14c040b..91d4170 100644 (file)
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -202,6 +202,9 @@ struct rcu_data {
         long            qlen;           /* # of queued callbacks */
         long            qlen_last_fqs_check;
                                         /* qlen at last check for QS forcing */
+       unsigned long   n_cbs_invoked;  /* count of RCU cbs invoked. */
+       unsigned long   n_cbs_orphaned; /* RCU cbs sent to orphanage. */
+       unsigned long   n_cbs_adopted;  /* RCU cbs adopted from orphanage. */
         unsigned long   n_force_qs_snap;
                                         /* did other CPU force QS recently? */
         long            blimit;         /* Upper limit on a processed batch */
@@ -254,19 +257,23 @@ struct rcu_data {
  #define RCU_STALL_DELAY_DELTA         0
  #endif
  
-#define RCU_SECONDS_TILL_STALL_CHECK   (10 * HZ + RCU_STALL_DELAY_DELTA)
+#define RCU_SECONDS_TILL_STALL_CHECK   (CONFIG_RCU_CPU_STALL_TIMEOUT * HZ + \
+                                       RCU_STALL_DELAY_DELTA)
                                                 /* for rsp->jiffies_stall */
-#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ + RCU_STALL_DELAY_DELTA)
+#define RCU_SECONDS_TILL_STALL_RECHECK (3 * RCU_SECONDS_TILL_STALL_CHECK + 30)
                                                 /* for rsp->jiffies_stall */
  #define RCU_STALL_RAT_DELAY            2       /* Allow other CPUs time */
                                                 /*  to take at least one */
                                                 /*  scheduling clock irq */
                                                 /*  before ratting on them. */
  
-#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR_RUNNABLE
+#define RCU_CPU_STALL_SUPPRESS_INIT 0
+#else
+#define RCU_CPU_STALL_SUPPRESS_INIT 1
+#endif
  
-#define ULONG_CMP_GE(a, b)     (ULONG_MAX / 2 >= (a) - (b))
-#define ULONG_CMP_LT(a, b)     (ULONG_MAX / 2 < (a) - (b))
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
  
  /*
   * RCU global state, including node hierarchy.  This hierarchy is
@@ -283,7 +290,7 @@ struct rcu_state {
         struct rcu_node *level[NUM_RCU_LVLS];   /* Hierarchy levels. */
         u32 levelcnt[MAX_RCU_LVLS + 1];         /* # nodes in each level. */
         u8 levelspread[NUM_RCU_LVLS];           /* kids/node in each level. */
-       struct rcu_data *rda[NR_CPUS];          /* array of rdp pointers. */
+       struct rcu_data __percpu *rda;          /* pointer to percpu rcu_data. */
  
         /* The following fields are guarded by the root rcu_node's lock. */
  
@@ -365,6 +372,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
  #ifdef CONFIG_RCU_CPU_STALL_DETECTOR
  static void rcu_print_detail_task_stall(struct rcu_state *rsp);
  static void rcu_print_task_stall(struct rcu_node *rnp);
+static void rcu_preempt_stall_reset(void);
  #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
  static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
  #ifdef CONFIG_HOTPLUG_CPU
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h

index 0e4f420..71a4147 100644 (file)
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -57,7 +57,7 @@ static void __init rcu_bootup_announce_oddness(void)
         printk(KERN_INFO
                "\tRCU-based detection of stalled CPUs is disabled.\n");
  #endif
-#ifndef CONFIG_RCU_CPU_STALL_VERBOSE
+#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE)
         printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n");
  #endif
  #if NUM_RCU_LVL_4 != 0
@@ -154,7 +154,7 @@ static void rcu_preempt_note_context_switch(int cpu)
             (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
  
                 /* Possibly blocking in an RCU read-side critical section. */
-               rdp = rcu_preempt_state.rda[cpu];
+               rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu);
                 rnp = rdp->mynode;
                 raw_spin_lock_irqsave(&rnp->lock, flags);
                 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
@@ -201,7 +201,7 @@ static void rcu_preempt_note_context_switch(int cpu)
   */
  void __rcu_read_lock(void)
  {
-       ACCESS_ONCE(current->rcu_read_lock_nesting)++;
+       current->rcu_read_lock_nesting++;
         barrier();  /* needed if we ever invoke rcu_read_lock in rcutree.c */
  }
  EXPORT_SYMBOL_GPL(__rcu_read_lock);
@@ -344,7 +344,9 @@ void __rcu_read_unlock(void)
         struct task_struct *t = current;
  
         barrier();  /* needed if we ever invoke rcu_read_unlock in rcutree.c */
-       if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 &&
+       --t->rcu_read_lock_nesting;
+       barrier();  /* decrement before load of ->rcu_read_unlock_special */
+       if (t->rcu_read_lock_nesting == 0 &&
             unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
                 rcu_read_unlock_special(t);
  #ifdef CONFIG_PROVE_LOCKING
@@ -417,6 +419,16 @@ static void rcu_print_task_stall(struct rcu_node *rnp)
         }
  }
  
+/*
+ * Suppress preemptible RCU's CPU stall warnings by pushing the
+ * time of the next stall-warning message comfortably far into the
+ * future.
+ */
+static void rcu_preempt_stall_reset(void)
+{
+       rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2;
+}
+
  #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
  
  /*
@@ -546,9 +558,11 @@ EXPORT_SYMBOL_GPL(call_rcu);
   *
   * Control will return to the caller some time after a full grace
   * period has elapsed, in other words after all currently executing RCU
- * read-side critical sections have completed.  RCU read-side critical
- * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
- * and may be nested.
+ * read-side critical sections have completed.  Note, however, that
+ * upon return from synchronize_rcu(), the caller might well be executing
+ * concurrently with new RCU read-side critical sections that began while
+ * synchronize_rcu() was waiting.  RCU read-side critical sections are
+ * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
   */
  void synchronize_rcu(void)
  {
@@ -771,7 +785,7 @@ static void rcu_preempt_send_cbs_to_orphanage(void)
   */
  static void __init __rcu_init_preempt(void)
  {
-       RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data);
+       rcu_init_one(&rcu_preempt_state, &rcu_preempt_data);
  }
  
  /*
@@ -865,6 +879,14 @@ static void rcu_print_task_stall(struct rcu_node *rnp)
  {
  }
  
+/*
+ * Because preemptible RCU does not exist, there is no need to suppress
+ * its CPU stall warnings.
+ */
+static void rcu_preempt_stall_reset(void)
+{
+}
+
  #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
  
  /*
@@ -918,15 +940,6 @@ static void rcu_preempt_process_callbacks(void)
  {
  }
  
-/*
- * In classic RCU, call_rcu() is just call_rcu_sched().
- */
-void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
-{
-       call_rcu_sched(head, func);
-}
-EXPORT_SYMBOL_GPL(call_rcu);
-
  /*
   * Wait for an rcu-preempt grace period, but make it happen quickly.
   * But because preemptable RCU does not exist, map to rcu-sched.
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c

index 36c95b4..d15430b 100644 (file)
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -64,7 +64,9 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
                    rdp->dynticks_fqs);
  #endif /* #ifdef CONFIG_NO_HZ */
         seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
-       seq_printf(m, " ql=%ld b=%ld\n", rdp->qlen, rdp->blimit);
+       seq_printf(m, " ql=%ld b=%ld", rdp->qlen, rdp->blimit);
+       seq_printf(m, " ci=%lu co=%lu ca=%lu\n",
+                  rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
  }
  
  #define PRINT_RCU_DATA(name, func, m) \
@@ -119,7 +121,9 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
                    rdp->dynticks_fqs);
  #endif /* #ifdef CONFIG_NO_HZ */
         seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi);
-       seq_printf(m, ",%ld,%ld\n", rdp->qlen, rdp->blimit);
+       seq_printf(m, ",%ld,%ld", rdp->qlen, rdp->blimit);
+       seq_printf(m, ",%lu,%lu,%lu\n",
+                  rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
  }
  
  static int show_rcudata_csv(struct seq_file *m, void *unused)
@@ -128,7 +132,7 @@ static int show_rcudata_csv(struct seq_file *m, void *unused)
  #ifdef CONFIG_NO_HZ
         seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\",");
  #endif /* #ifdef CONFIG_NO_HZ */
-       seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n");
+       seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\",\"ci\",\"co\",\"ca\"\n");
  #ifdef CONFIG_TREE_PREEMPT_RCU
         seq_puts(m, "\"rcu_preempt:\"\n");
         PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m);
@@ -262,7 +266,7 @@ static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp)
         struct rcu_data *rdp;
  
         for_each_possible_cpu(cpu) {
-               rdp = rsp->rda[cpu];
+               rdp = per_cpu_ptr(rsp->rda, cpu);
                 if (rdp->beenonline)
                         print_one_rcu_pending(m, rdp);
         }
diff --git a/kernel/sched.c b/kernel/sched.c

index c0d2067..d42992b 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -426,9 +426,7 @@ struct root_domain {
          */
         cpumask_var_t rto_mask;
         atomic_t rto_count;
-#ifdef CONFIG_SMP
         struct cpupri cpupri;
-#endif
  };
  
  /*
@@ -437,7 +435,7 @@ struct root_domain {
   */
  static struct root_domain def_root_domain;
  
-#endif
+#endif /* CONFIG_SMP */
  
  /*
   * This is the main, per-CPU runqueue data structure.
@@ -488,11 +486,12 @@ struct rq {
          */
         unsigned long nr_uninterruptible;
  
-       struct task_struct *curr, *idle;
+       struct task_struct *curr, *idle, *stop;
         unsigned long next_balance;
         struct mm_struct *prev_mm;
  
         u64 clock;
+       u64 clock_task;
  
         atomic_t nr_iowait;
  
@@ -520,6 +519,10 @@ struct rq {
         u64 avg_idle;
  #endif
  
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+       u64 prev_irq_time;
+#endif
+
         /* calc_load related fields */
         unsigned long calc_load_update;
         long calc_load_active;
@@ -643,10 +646,22 @@ static inline struct task_group *task_group(struct task_struct *p)
  
  #endif /* CONFIG_CGROUP_SCHED */
  
+static u64 irq_time_cpu(int cpu);
+static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time);
+
  inline void update_rq_clock(struct rq *rq)
  {
-       if (!rq->skip_clock_update)
-               rq->clock = sched_clock_cpu(cpu_of(rq));
+       if (!rq->skip_clock_update) {
+               int cpu = cpu_of(rq);
+               u64 irq_time;
+
+               rq->clock = sched_clock_cpu(cpu);
+               irq_time = irq_time_cpu(cpu);
+               if (rq->clock - irq_time > rq->clock_task)
+                       rq->clock_task = rq->clock - irq_time;
+
+               sched_irq_time_avg_update(rq, irq_time);
+       }
  }
  
  /*
@@ -723,7 +738,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
                 size_t cnt, loff_t *ppos)
  {
         char buf[64];
-       char *cmp = buf;
+       char *cmp;
         int neg = 0;
         int i;
  
@@ -734,6 +749,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
                 return -EFAULT;
  
         buf[cnt] = 0;
+       cmp = strstrip(buf);
  
         if (strncmp(buf, "NO_", 3) == 0) {
                 neg = 1;
@@ -741,9 +757,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
         }
  
         for (i = 0; sched_feat_names[i]; i++) {
-               int len = strlen(sched_feat_names[i]);
-
-               if (strncmp(cmp, sched_feat_names[i], len) == 0) {
+               if (strcmp(cmp, sched_feat_names[i]) == 0) {
                         if (neg)
                                 sysctl_sched_features &= ~(1UL << i);
                         else
@@ -1840,7 +1854,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
  
  static const struct sched_class rt_sched_class;
  
-#define sched_class_highest (&rt_sched_class)
+#define sched_class_highest (&stop_sched_class)
  #define for_each_class(class) \
     for (class = sched_class_highest; class; class = class->next)
  
@@ -1858,12 +1872,6 @@ static void dec_nr_running(struct rq *rq)
  
  static void set_load_weight(struct task_struct *p)
  {
-       if (task_has_rt_policy(p)) {
-               p->se.load.weight = 0;
-               p->se.load.inv_weight = WMULT_CONST;
-               return;
-       }
-
         /*
          * SCHED_IDLE tasks get minimal weight:
          */
@@ -1917,13 +1925,132 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
         dec_nr_running(rq);
  }
  
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+
+/*
+ * There are no locks covering percpu hardirq/softirq time.
+ * They are only modified in account_system_vtime, on corresponding CPU
+ * with interrupts disabled. So, writes are safe.
+ * They are read and saved off onto struct rq in update_rq_clock().
+ * This may result in other CPU reading this CPU's irq time and can
+ * race with irq/account_system_vtime on this CPU. We would either get old
+ * or new value (or semi updated value on 32 bit) with a side effect of
+ * accounting a slice of irq time to wrong task when irq is in progress
+ * while we read rq->clock. That is a worthy compromise in place of having
+ * locks on each irq in account_system_vtime.
+ */
+static DEFINE_PER_CPU(u64, cpu_hardirq_time);  /* per-CPU accumulated hardirq time */
+static DEFINE_PER_CPU(u64, cpu_softirq_time);  /* per-CPU accumulated softirq time */
+
+static DEFINE_PER_CPU(u64, irq_start_time);    /* sched_clock_cpu() stamp taken by the last account_system_vtime() */
+static int sched_clock_irqtime;                        /* nonzero while irq time accounting is enabled */
+
+void enable_sched_clock_irqtime(void)  /* switch irq time accounting on */
+{
+       sched_clock_irqtime = 1;        /* checked by irq_time_cpu() and account_system_vtime() */
+}
+
+void disable_sched_clock_irqtime(void) /* switch irq time accounting off */
+{
+       sched_clock_irqtime = 0;        /* irq_time_cpu() returns 0 while this is clear */
+}
+
+static u64 irq_time_cpu(int cpu)       /* hardirq + softirq time recorded so far for @cpu */
+{
+       if (!sched_clock_irqtime)       /* accounting disabled: report no irq time */
+               return 0;
+
+       return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
+}
+
+void account_system_vtime(struct task_struct *curr)    /* add time since the last call to this CPU's hardirq or softirq total */
+{
+       unsigned long flags;
+       int cpu;
+       u64 now, delta;
+
+       if (!sched_clock_irqtime)       /* accounting disabled: nothing to record */
+               return;
+
+       local_irq_save(flags);
+
+       cpu = smp_processor_id();
+       now = sched_clock_cpu(cpu);
+       delta = now - per_cpu(irq_start_time, cpu);     /* elapsed since the previous call on this CPU */
+       per_cpu(irq_start_time, cpu) = now;
+       /*
+        * We do not account for softirq time from ksoftirqd here.
+        * We want to continue accounting softirq time to ksoftirqd thread
+        * in that case, so as not to confuse scheduler with a special task
+        * that does not consume any time, but still wants to run.
+        */
+       if (hardirq_count())
+               per_cpu(cpu_hardirq_time, cpu) += delta;
+       else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
+               per_cpu(cpu_softirq_time, cpu) += delta;
+
+       local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(account_system_vtime);
+
+static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time)       /* pass newly accumulated irq time to sched_rt_avg_update() */
+{
+       if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) {  /* needs both the runtime switch and the NONIRQ_POWER feature */
+               u64 delta_irq = curr_irq_time - rq->prev_irq_time;      /* irq time added since the previous call for this rq */
+               rq->prev_irq_time = curr_irq_time;
+               sched_rt_avg_update(rq, delta_irq);
+       }
+}
+
+#else
+
+static u64 irq_time_cpu(int cpu)       /* stub used when CONFIG_IRQ_TIME_ACCOUNTING is not set */
+{
+       return 0;       /* no irq time is tracked in this configuration */
+}
+
+static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { }   /* no-op without CONFIG_IRQ_TIME_ACCOUNTING */
+
+#endif
+
  #include "sched_idletask.c"
  #include "sched_fair.c"
  #include "sched_rt.c"
+#include "sched_stoptask.c"
  #ifdef CONFIG_SCHED_DEBUG
  # include "sched_debug.c"
  #endif
  
+void sched_set_stop_task(int cpu, struct task_struct *stop)    /* make @stop (may be NULL) the stop task of @cpu's runqueue */
+{
+       struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
+       struct task_struct *old_stop = cpu_rq(cpu)->stop;       /* previous stop task, reset to the rt class below */
+
+       if (stop) {
+               /*
+                * Make it appear like a SCHED_FIFO task, it's something
+                * userspace knows about and won't get confused about.
+                *
+                * Also, it will make PI more or less work without too
+                * much confusion -- but then, stop work should not
+                * rely on PI working anyway.
+                */
+               sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
+
+               stop->sched_class = &stop_sched_class;  /* override the class chosen by the call above */
+       }
+
+       cpu_rq(cpu)->stop = stop;
+
+       if (old_stop) {
+               /*
+                * Reset it back to a normal scheduling class so that
+                * it can die in pieces.
+                */
+               old_stop->sched_class = &rt_sched_class;
+       }
+}
+
  /*
   * __normal_prio - return the priority that is based on the static prio
   */
@@ -2003,6 +2130,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
         if (p->sched_class != &fair_sched_class)
                 return 0;
  
+       if (unlikely(p->policy == SCHED_IDLE))
+               return 0;
+
         /*
          * Buddy candidates are cache hot:
          */
@@ -2852,14 +2982,14 @@ context_switch(struct rq *rq, struct task_struct *prev,
          */
         arch_start_context_switch(prev);
  
-       if (likely(!mm)) {
+       if (!mm) {
                 next->active_mm = oldmm;
                 atomic_inc(&oldmm->mm_count);
                 enter_lazy_tlb(oldmm, next);
         } else
                 switch_mm(oldmm, mm, next);
  
-       if (likely(!prev->mm)) {
+       if (!prev->mm) {
                 prev->active_mm = NULL;
                 rq->prev_mm = oldmm;
         }
@@ -3248,7 +3378,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
  
         if (task_current(rq, p)) {
                 update_rq_clock(rq);
-               ns = rq->clock - p->se.exec_start;
+               ns = rq->clock_task - p->se.exec_start;
                 if ((s64)ns < 0)
                         ns = 0;
         }
@@ -3397,7 +3527,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
         tmp = cputime_to_cputime64(cputime);
         if (hardirq_count() - hardirq_offset)
                 cpustat->irq = cputime64_add(cpustat->irq, tmp);
-       else if (softirq_count())
+       else if (in_serving_softirq())
                 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
         else
                 cpustat->system = cputime64_add(cpustat->system, tmp);
@@ -3723,17 +3853,13 @@ pick_next_task(struct rq *rq)
                         return p;
         }
  
-       class = sched_class_highest;
-       for ( ; ; ) {
+       for_each_class(class) {
                 p = class->pick_next_task(rq);
                 if (p)
                         return p;
-               /*
-                * Will never be NULL as the idle class always
-                * returns a non-NULL p:
-                */
-               class = class->next;
         }
+
+       BUG(); /* the idle class will always have a runnable task */
  }
  
  /*
@@ -4358,6 +4484,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
  
         rq = task_rq_lock(p, &flags);
  
+       trace_sched_pi_setprio(p, prio);
         oldprio = p->prio;
         prev_class = p->sched_class;
         on_rq = p->se.on_rq;
@@ -4645,7 +4772,7 @@ recheck:
         }
  
         if (user) {
-               retval = security_task_setscheduler(p, policy, param);
+               retval = security_task_setscheduler(p);
                 if (retval)
                         return retval;
         }
@@ -4661,6 +4788,15 @@ recheck:
          */
         rq = __task_rq_lock(p);
  
+       /*
+        * Changing the policy of the stop threads is a very bad idea
+        */
+       if (p == rq->stop) {
+               __task_rq_unlock(rq);
+               raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+               return -EINVAL;
+       }
+
  #ifdef CONFIG_RT_GROUP_SCHED
         if (user) {
                 /*
@@ -4887,13 +5023,13 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
         if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
                 goto out_unlock;
  
-       retval = security_task_setscheduler(p, 0, NULL);
+       retval = security_task_setscheduler(p);
         if (retval)
                 goto out_unlock;
  
         cpuset_cpus_allowed(p, cpus_allowed);
         cpumask_and(new_mask, in_mask, cpus_allowed);
- again:
+again:
         retval = set_cpus_allowed_ptr(p, new_mask);
  
         if (!retval) {
@@ -5337,7 +5473,19 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
         idle->se.exec_start = sched_clock();
  
         cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
+       /*
+        * We're having a chicken and egg problem, even though we are
+        * holding rq->lock, the cpu isn't yet set to this cpu so the
+        * lockdep check in task_group() will fail.
+        *
+        * Similar case to sched_fork(). / Alternatively we could
+        * use task_rq_lock() here and obtain the other rq->lock.
+        *
+        * Silence PROVE_RCU
+        */
+       rcu_read_lock();
         __set_task_cpu(idle, cpu);
+       rcu_read_unlock();
  
         rq->curr = rq->idle = idle;
  #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
@@ -6514,6 +6662,7 @@ struct s_data {
         cpumask_var_t           nodemask;
         cpumask_var_t           this_sibling_map;
         cpumask_var_t           this_core_map;
+       cpumask_var_t           this_book_map;
         cpumask_var_t           send_covered;
         cpumask_var_t           tmpmask;
         struct sched_group      **sched_group_nodes;
@@ -6525,6 +6674,7 @@ enum s_alloc {
         sa_rootdomain,
         sa_tmpmask,
         sa_send_covered,
+       sa_this_book_map,
         sa_this_core_map,
         sa_this_sibling_map,
         sa_nodemask,
@@ -6560,31 +6710,48 @@ cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
  #ifdef CONFIG_SCHED_MC
  static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
  static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
-#endif /* CONFIG_SCHED_MC */
  
-#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
  static int
  cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
                   struct sched_group **sg, struct cpumask *mask)
  {
         int group;
-
+#ifdef CONFIG_SCHED_SMT
         cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
         group = cpumask_first(mask);
+#else
+       group = cpu;
+#endif
         if (sg)
                 *sg = &per_cpu(sched_group_core, group).sg;
         return group;
  }
-#elif defined(CONFIG_SCHED_MC)
+#endif /* CONFIG_SCHED_MC */
+
+/*
+ * book sched-domains:
+ */
+#ifdef CONFIG_SCHED_BOOK
+static DEFINE_PER_CPU(struct static_sched_domain, book_domains);
+static DEFINE_PER_CPU(struct static_sched_group, sched_group_book);
+
  static int
-cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
-                 struct sched_group **sg, struct cpumask *unused)
+cpu_to_book_group(int cpu, const struct cpumask *cpu_map,
+                 struct sched_group **sg, struct cpumask *mask)
  {
+       int group = cpu;
+#ifdef CONFIG_SCHED_MC
+       cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
+       group = cpumask_first(mask);
+#elif defined(CONFIG_SCHED_SMT)
+       cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
+       group = cpumask_first(mask);
+#endif
         if (sg)
-               *sg = &per_cpu(sched_group_core, cpu).sg;
-       return cpu;
+               *sg = &per_cpu(sched_group_book, group).sg;
+       return group;
  }
-#endif
+#endif /* CONFIG_SCHED_BOOK */
  
  static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
  static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
@@ -6594,7 +6761,10 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
                   struct sched_group **sg, struct cpumask *mask)
  {
         int group;
-#ifdef CONFIG_SCHED_MC
+#ifdef CONFIG_SCHED_BOOK
+       cpumask_and(mask, cpu_book_mask(cpu), cpu_map);
+       group = cpumask_first(mask);
+#elif defined(CONFIG_SCHED_MC)
         cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
         group = cpumask_first(mask);
  #elif defined(CONFIG_SCHED_SMT)
@@ -6855,6 +7025,9 @@ SD_INIT_FUNC(CPU)
  #ifdef CONFIG_SCHED_MC
   SD_INIT_FUNC(MC)
  #endif
+#ifdef CONFIG_SCHED_BOOK
+ SD_INIT_FUNC(BOOK)
+#endif
  
  static int default_relax_domain_level = -1;
  
@@ -6904,6 +7077,8 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
                 free_cpumask_var(d->tmpmask); /* fall through */
         case sa_send_covered:
                 free_cpumask_var(d->send_covered); /* fall through */
+       case sa_this_book_map:
+               free_cpumask_var(d->this_book_map); /* fall through */
         case sa_this_core_map:
                 free_cpumask_var(d->this_core_map); /* fall through */
         case sa_this_sibling_map:
@@ -6950,8 +7125,10 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
                 return sa_nodemask;
         if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
                 return sa_this_sibling_map;
-       if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
+       if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL))
                 return sa_this_core_map;
+       if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
+               return sa_this_book_map;
         if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
                 return sa_send_covered;
         d->rd = alloc_rootdomain();
@@ -7009,6 +7186,23 @@ static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
         return sd;
  }
  
+static struct sched_domain *__build_book_sched_domain(struct s_data *d,
+       const struct cpumask *cpu_map, struct sched_domain_attr *attr,
+       struct sched_domain *parent, int i)     /* set up cpu i's BOOK domain below @parent */
+{
+       struct sched_domain *sd = parent;       /* returned unchanged when CONFIG_SCHED_BOOK is off */
+#ifdef CONFIG_SCHED_BOOK
+       sd = &per_cpu(book_domains, i).sd;
+       SD_INIT(sd, BOOK);
+       set_domain_attribute(sd, attr);
+       cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i));  /* span = cpu_book_mask(i) restricted to cpu_map */
+       sd->parent = parent;
+       parent->child = sd;     /* NOTE(review): parent is dereferenced unchecked; assumes callers never pass NULL */
+       cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask); /* d->tmpmask is scratch space for the group lookup */
+#endif
+       return sd;
+}
+
  static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
         const struct cpumask *cpu_map, struct sched_domain_attr *attr,
         struct sched_domain *parent, int i)
@@ -7065,6 +7259,15 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
                                                 &cpu_to_core_group,
                                                 d->send_covered, d->tmpmask);
                 break;
+#endif
+#ifdef CONFIG_SCHED_BOOK
+       case SD_LV_BOOK: /* set up book groups */
+               cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu));
+               if (cpu == cpumask_first(d->this_book_map))
+                       init_sched_build_groups(d->this_book_map, cpu_map,
+                                               &cpu_to_book_group,
+                                               d->send_covered, d->tmpmask);
+               break;
  #endif
         case SD_LV_CPU: /* set up physical groups */
                 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
@@ -7113,12 +7316,14 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
  
                 sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
                 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
+               sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i);
                 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
                 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
         }
  
         for_each_cpu(i, cpu_map) {
                 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
+               build_sched_groups(&d, SD_LV_BOOK, cpu_map, i);
                 build_sched_groups(&d, SD_LV_MC, cpu_map, i);
         }
  
@@ -7149,6 +7354,12 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
                 init_sched_groups_power(i, sd);
         }
  #endif
+#ifdef CONFIG_SCHED_BOOK
+       for_each_cpu(i, cpu_map) {
+               sd = &per_cpu(book_domains, i).sd;
+               init_sched_groups_power(i, sd);
+       }
+#endif
  
         for_each_cpu(i, cpu_map) {
                 sd = &per_cpu(phys_domains, i).sd;
@@ -7174,6 +7385,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
                 sd = &per_cpu(cpu_domains, i).sd;
  #elif defined(CONFIG_SCHED_MC)
                 sd = &per_cpu(core_domains, i).sd;
+#elif defined(CONFIG_SCHED_BOOK)
+               sd = &per_cpu(book_domains, i).sd;
  #else
                 sd = &per_cpu(phys_domains, i).sd;
  #endif
@@ -8078,9 +8291,9 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
  
         return 1;
  
- err_free_rq:
+err_free_rq:
         kfree(cfs_rq);
- err:
+err:
         return 0;
  }
  
@@ -8168,9 +8381,9 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
  
         return 1;
  
- err_free_rq:
+err_free_rq:
         kfree(rt_rq);
- err:
+err:
         return 0;
  }
  
@@ -8528,7 +8741,7 @@ static int tg_set_bandwidth(struct task_group *tg,
                 raw_spin_unlock(&rt_rq->rt_runtime_lock);
         }
         raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
- unlock:
+unlock:
         read_unlock(&tasklist_lock);
         mutex_unlock(&rt_constraints_mutex);
  
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c

index db3f674..933f3d1 100644 (file)
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -25,7 +25,7 @@
  
  /*
   * Targeted preemption latency for CPU-bound tasks:
- * (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
   *
   * NOTE: this latency value is not the same as the concept of
   * 'timeslice length' - timeslices in CFS are of variable length
@@ -52,7 +52,7 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling
  
  /*
   * Minimal preemption granularity for CPU-bound tasks:
- * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
   */
  unsigned int sysctl_sched_min_granularity = 750000ULL;
  unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
@@ -519,7 +519,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
  static void update_curr(struct cfs_rq *cfs_rq)
  {
         struct sched_entity *curr = cfs_rq->curr;
-       u64 now = rq_of(cfs_rq)->clock;
+       u64 now = rq_of(cfs_rq)->clock_task;
         unsigned long delta_exec;
  
         if (unlikely(!curr))
@@ -602,7 +602,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
         /*
          * We are starting a new run period:
          */
-       se->exec_start = rq_of(cfs_rq)->clock;
+       se->exec_start = rq_of(cfs_rq)->clock_task;
  }
  
  /**************************************************
@@ -1764,6 +1764,10 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
         set_task_cpu(p, this_cpu);
         activate_task(this_rq, p, 0);
         check_preempt_curr(this_rq, p, 0);
+
+       /* re-arm NEWIDLE balancing when moving tasks */
+       src_rq->avg_idle = this_rq->avg_idle = 2*sysctl_sched_migration_cost;
+       this_rq->idle_stamp = 0;
  }
  
  /*
@@ -1798,7 +1802,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
          * 2) too many balance attempts have failed.
          */
  
-       tsk_cache_hot = task_hot(p, rq->clock, sd);
+       tsk_cache_hot = task_hot(p, rq->clock_task, sd);
         if (!tsk_cache_hot ||
                 sd->nr_balance_failed > sd->cache_nice_tries) {
  #ifdef CONFIG_SCHEDSTATS
@@ -2030,12 +2034,14 @@ struct sd_lb_stats {
         unsigned long this_load;
         unsigned long this_load_per_task;
         unsigned long this_nr_running;
+       unsigned long this_has_capacity;
  
         /* Statistics of the busiest group */
         unsigned long max_load;
         unsigned long busiest_load_per_task;
         unsigned long busiest_nr_running;
         unsigned long busiest_group_capacity;
+       unsigned long busiest_has_capacity;
  
         int group_imb; /* Is there imbalance in this sd */
  #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -2058,6 +2064,7 @@ struct sg_lb_stats {
         unsigned long sum_weighted_load; /* Weighted load of group's tasks */
         unsigned long group_capacity;
         int group_imb; /* Is there an imbalance in the group ? */
+       int group_has_capacity; /* Is there extra capacity in the group? */
  };
  
  /**
@@ -2268,7 +2275,13 @@ unsigned long scale_rt_power(int cpu)
         u64 total, available;
  
         total = sched_avg_period() + (rq->clock - rq->age_stamp);
-       available = total - rq->rt_avg;
+
+       if (unlikely(total < rq->rt_avg)) {
+               /* Ensures that power won't end up being negative */
+               available = 0;
+       } else {
+               available = total - rq->rt_avg;
+       }
  
         if (unlikely((s64)total < SCHED_LOAD_SCALE))
                 total = SCHED_LOAD_SCALE;
@@ -2378,7 +2391,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
                         int local_group, const struct cpumask *cpus,
                         int *balance, struct sg_lb_stats *sgs)
  {
-       unsigned long load, max_cpu_load, min_cpu_load;
+       unsigned long load, max_cpu_load, min_cpu_load, max_nr_running;
         int i;
         unsigned int balance_cpu = -1, first_idle_cpu = 0;
         unsigned long avg_load_per_task = 0;
@@ -2389,6 +2402,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
         /* Tally up the load of all CPUs in the group */
         max_cpu_load = 0;
         min_cpu_load = ~0UL;
+       max_nr_running = 0;
  
         for_each_cpu_and(i, sched_group_cpus(group), cpus) {
                 struct rq *rq = cpu_rq(i);
@@ -2406,8 +2420,10 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
                         load = target_load(i, load_idx);
                 } else {
                         load = source_load(i, load_idx);
-                       if (load > max_cpu_load)
+                       if (load > max_cpu_load) {
                                 max_cpu_load = load;
+                               max_nr_running = rq->nr_running;
+                       }
                         if (min_cpu_load > load)
                                 min_cpu_load = load;
                 }
@@ -2447,13 +2463,15 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
         if (sgs->sum_nr_running)
                 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
  
-       if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
+       if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1)
                 sgs->group_imb = 1;
  
-       sgs->group_capacity =
-               DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
+       sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
         if (!sgs->group_capacity)
                 sgs->group_capacity = fix_small_capacity(sd, group);
+
+       if (sgs->group_capacity > sgs->sum_nr_running)
+               sgs->group_has_capacity = 1;
  }
  
  /**
@@ -2542,9 +2560,14 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
                 /*
                  * In case the child domain prefers tasks go to siblings
                  * first, lower the sg capacity to one so that we'll try
-                * and move all the excess tasks away.
+                * and move all the excess tasks away. We lower the capacity
+                * of a group only if the local group has the capacity to fit
+                * these excess tasks, i.e. nr_running < group_capacity. The
+                * extra check prevents the case where you always pull from the
+                * heaviest group when it is already under-utilized (possible
+                * when a large weight task outweighs the tasks on the system).
                  */
-               if (prefer_sibling)
+               if (prefer_sibling && !local_group && sds->this_has_capacity)
                         sgs.group_capacity = min(sgs.group_capacity, 1UL);
  
                 if (local_group) {
@@ -2552,12 +2575,14 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
                         sds->this = sg;
                         sds->this_nr_running = sgs.sum_nr_running;
                         sds->this_load_per_task = sgs.sum_weighted_load;
+                       sds->this_has_capacity = sgs.group_has_capacity;
                 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
                         sds->max_load = sgs.avg_load;
                         sds->busiest = sg;
                         sds->busiest_nr_running = sgs.sum_nr_running;
                         sds->busiest_group_capacity = sgs.group_capacity;
                         sds->busiest_load_per_task = sgs.sum_weighted_load;
+                       sds->busiest_has_capacity = sgs.group_has_capacity;
                         sds->group_imb = sgs.group_imb;
                 }
  
@@ -2754,6 +2779,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
                 return fix_small_imbalance(sds, this_cpu, imbalance);
  
  }
+
  /******* find_busiest_group() helpers end here *********************/
  
  /**
@@ -2805,6 +2831,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
          * 4) This group is more busy than the avg busieness at this
          *    sched_domain.
          * 5) The imbalance is within the specified limit.
+        *
+        * Note: when doing newidle balance, if the local group has excess
+        * capacity (i.e. nr_running < group_capacity) and the busiest group
+        * does not have any capacity, we force a load balance to pull tasks
+        * to the local group. In this case, we skip past checks 3, 4 and 5.
          */
         if (!(*balance))
                 goto ret;
@@ -2816,6 +2847,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
         if (!sds.busiest || sds.busiest_nr_running == 0)
                 goto out_balanced;
  
+       /*  SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
+       if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
+                       !sds.busiest_has_capacity)
+               goto force_balance;
+
         if (sds.this_load >= sds.max_load)
                 goto out_balanced;
  
@@ -2827,6 +2863,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
         if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
                 goto out_balanced;
  
+force_balance:
         /* Looks like there is an imbalance. Compute it */
         calculate_imbalance(&sds, this_cpu, imbalance);
         return sds.busiest;
@@ -3031,7 +3068,14 @@ redo:
  
         if (!ld_moved) {
                 schedstat_inc(sd, lb_failed[idle]);
-               sd->nr_balance_failed++;
+               /*
+                * Increment the failure counter only on periodic balance.
+                * We do not want newidle balance, which can be very
+                * frequent, to pollute the failure counter, causing
+                * excessive cache_hot migrations and active balances.
+                */
+               if (idle != CPU_NEWLY_IDLE)
+                       sd->nr_balance_failed++;
  
                 if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest),
                                         this_cpu)) {
@@ -3153,10 +3197,8 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
                 interval = msecs_to_jiffies(sd->balance_interval);
                 if (time_after(next_balance, sd->last_balance + interval))
                         next_balance = sd->last_balance + interval;
-               if (pulled_task) {
-                       this_rq->idle_stamp = 0;
+               if (pulled_task)
                         break;
-               }
         }
  
         raw_spin_lock(&this_rq->lock);
@@ -3751,8 +3793,11 @@ static void task_fork_fair(struct task_struct *p)
  
         update_rq_clock(rq);
  
-       if (unlikely(task_cpu(p) != this_cpu))
+       if (unlikely(task_cpu(p) != this_cpu)) {
+               rcu_read_lock();
                 __set_task_cpu(p, this_cpu);
+               rcu_read_unlock();
+       }
  
         update_curr(cfs_rq);
  
diff --git a/kernel/sched_features.h b/kernel/sched_features.h

index 83c66e8..185f920 100644 (file)
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -61,3 +61,8 @@ SCHED_FEAT(ASYM_EFF_LOAD, 1)
   * release the lock. Decreases scheduling overhead.
   */
  SCHED_FEAT(OWNER_SPIN, 1)
+
+/*
+ * Decrement CPU power based on irq activity
+ */
+SCHED_FEAT(NONIRQ_POWER, 1)
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c

index d10c80e..bea7d79 100644 (file)
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -609,7 +609,7 @@ static void update_curr_rt(struct rq *rq)
         if (!task_has_rt_policy(curr))
                 return;
  
-       delta_exec = rq->clock - curr->se.exec_start;
+       delta_exec = rq->clock_task - curr->se.exec_start;
         if (unlikely((s64)delta_exec < 0))
                 delta_exec = 0;
  
@@ -618,7 +618,7 @@ static void update_curr_rt(struct rq *rq)
         curr->se.sum_exec_runtime += delta_exec;
         account_group_exec_runtime(curr, delta_exec);
  
-       curr->se.exec_start = rq->clock;
+       curr->se.exec_start = rq->clock_task;
         cpuacct_charge(curr, delta_exec);
  
         sched_rt_avg_update(rq, delta_exec);
@@ -960,18 +960,19 @@ select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
          * runqueue. Otherwise simply start this RT task
          * on its current runqueue.
          *
-        * We want to avoid overloading runqueues. Even if
-        * the RT task is of higher priority than the current RT task.
-        * RT tasks behave differently than other tasks. If
-        * one gets preempted, we try to push it off to another queue.
-        * So trying to keep a preempting RT task on the same
-        * cache hot CPU will force the running RT task to
-        * a cold CPU. So we waste all the cache for the lower
-        * RT task in hopes of saving some of a RT task
-        * that is just being woken and probably will have
-        * cold cache anyway.
+        * We want to avoid overloading runqueues. If the woken
+        * task is a higher priority, then it will stay on this CPU
+        * and the lower prio task should be moved to another CPU.
+        * Even though this will probably make the lower prio task
+        * lose its cache, we do not want to bounce a higher task
+        * around just because it gave up its CPU, perhaps for a
+        * lock?
+        *
+        * For equal prio tasks, we just let the scheduler sort it out.
          */
         if (unlikely(rt_task(rq->curr)) &&
+           (rq->curr->rt.nr_cpus_allowed < 2 ||
+            rq->curr->prio < p->prio) &&
             (p->rt.nr_cpus_allowed > 1)) {
                 int cpu = find_lowest_rq(p);
  
@@ -1074,7 +1075,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
         } while (rt_rq);
  
         p = rt_task_of(rt_se);
-       p->se.exec_start = rq->clock;
+       p->se.exec_start = rq->clock_task;
  
         return p;
  }
@@ -1139,7 +1140,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
         for_each_leaf_rt_rq(rt_rq, rq) {
                 array = &rt_rq->active;
                 idx = sched_find_first_bit(array->bitmap);
- next_idx:
+next_idx:
                 if (idx >= MAX_RT_PRIO)
                         continue;
                 if (next && next->prio < idx)
@@ -1315,7 +1316,7 @@ static int push_rt_task(struct rq *rq)
         if (!next_task)
                 return 0;
  
- retry:
+retry:
         if (unlikely(next_task == rq->curr)) {
                 WARN_ON(1);
                 return 0;
@@ -1463,7 +1464,7 @@ static int pull_rt_task(struct rq *this_rq)
                          * but possible)
                          */
                 }
- skip:
+skip:
                 double_unlock_balance(this_rq, src_rq);
         }
  
@@ -1491,7 +1492,10 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
         if (!task_running(rq, p) &&
             !test_tsk_need_resched(rq->curr) &&
             has_pushable_tasks(rq) &&
-           p->rt.nr_cpus_allowed > 1)
+           p->rt.nr_cpus_allowed > 1 &&
+           rt_task(rq->curr) &&
+           (rq->curr->rt.nr_cpus_allowed < 2 ||
+            rq->curr->prio < p->prio))
                 push_rt_tasks(rq);
  }
  
@@ -1709,7 +1713,7 @@ static void set_curr_task_rt(struct rq *rq)
  {
         struct task_struct *p = rq->curr;
  
-       p->se.exec_start = rq->clock;
+       p->se.exec_start = rq->clock_task;
  
         /* The running task is never eligible for pushing */
         dequeue_pushable_task(rq, p);
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c

new file mode 100644 (file)

index 0000000..45bddc0
--- /dev/null
+++ b/kernel/sched_stoptask.c
@@ -0,0 +1,108 @@
+/*
+ * stop-task scheduling class.
+ *
+ * The stop task is the highest priority task in the system, it preempts
+ * everything and will be preempted by nothing.
+ *
+ * See kernel/stop_machine.c
+ */
+
+#ifdef CONFIG_SMP
+static int
+select_task_rq_stop(struct rq *rq, struct task_struct *p,
+                   int sd_flag, int flags)
+{
+       return task_cpu(p); /* stop tasks never migrate */
+}
+#endif /* CONFIG_SMP */
+
+static void
+check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
+{
+       resched_task(rq->curr); /* we preempt everything */
+}
+
+static struct task_struct *pick_next_task_stop(struct rq *rq)
+{
+       struct task_struct *stop = rq->stop;
+
+       if (stop && stop->state == TASK_RUNNING)
+               return stop;
+
+       return NULL;
+}
+
+static void
+enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
+{
+}
+
+static void
+dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
+{
+}
+
+static void yield_task_stop(struct rq *rq)
+{
+       BUG(); /* the stop task should never yield, it's pointless. */
+}
+
+static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
+{
+}
+
+static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
+{
+}
+
+static void set_curr_task_stop(struct rq *rq)
+{
+}
+
+static void switched_to_stop(struct rq *rq, struct task_struct *p,
+                            int running)
+{
+       BUG(); /* it's impossible to change to this class */
+}
+
+static void prio_changed_stop(struct rq *rq, struct task_struct *p,
+                             int oldprio, int running)
+{
+       BUG(); /* how!?, what priority? */
+}
+
+static unsigned int
+get_rr_interval_stop(struct rq *rq, struct task_struct *task)
+{
+       return 0;
+}
+
+/*
+ * Simple, special scheduling class for the per-CPU stop tasks:
+ */
+static const struct sched_class stop_sched_class = {
+       .next                   = &rt_sched_class,
+
+       .enqueue_task           = enqueue_task_stop,
+       .dequeue_task           = dequeue_task_stop,
+       .yield_task             = yield_task_stop,
+
+       .check_preempt_curr     = check_preempt_curr_stop,
+
+       .pick_next_task         = pick_next_task_stop,
+       .put_prev_task          = put_prev_task_stop,
+
+#ifdef CONFIG_SMP
+       .select_task_rq         = select_task_rq_stop,
+#endif
+
+       .set_curr_task          = set_curr_task_stop,
+       .task_tick              = task_tick_stop,
+
+       .get_rr_interval        = get_rr_interval_stop,
+
+       .prio_changed           = prio_changed_stop,
+       .switched_to            = switched_to_stop,
+
+       /* no .task_new for stop tasks */
+};
diff --git a/kernel/softirq.c b/kernel/softirq.c

index 07b4f1b..fc97888 100644 (file)
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -76,12 +76,22 @@ void wakeup_softirqd(void)
                 wake_up_process(tsk);
  }
  
+/*
+ * preempt_count and SOFTIRQ_OFFSET usage:
+ * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
+ *   softirq processing.
+ * - preempt_count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET)
+ *   on local_bh_disable or local_bh_enable.
+ * This lets us distinguish between whether we are currently processing
+ * softirq and whether we just have bh disabled.
+ */
+
  /*
   * This one is for softirq.c-internal use,
   * where hardirqs are disabled legitimately:
   */
  #ifdef CONFIG_TRACE_IRQFLAGS
-static void __local_bh_disable(unsigned long ip)
+static void __local_bh_disable(unsigned long ip, unsigned int cnt)
  {
         unsigned long flags;
  
@@ -95,32 +105,43 @@ static void __local_bh_disable(unsigned long ip)
          * We must manually increment preempt_count here and manually
          * call the trace_preempt_off later.
          */
-       preempt_count() += SOFTIRQ_OFFSET;
+       preempt_count() += cnt;
         /*
          * Were softirqs turned off above:
          */
-       if (softirq_count() == SOFTIRQ_OFFSET)
+       if (softirq_count() == cnt)
                 trace_softirqs_off(ip);
         raw_local_irq_restore(flags);
  
-       if (preempt_count() == SOFTIRQ_OFFSET)
+       if (preempt_count() == cnt)
                 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
  }
  #else /* !CONFIG_TRACE_IRQFLAGS */
-static inline void __local_bh_disable(unsigned long ip)
+static inline void __local_bh_disable(unsigned long ip, unsigned int cnt)
  {
-       add_preempt_count(SOFTIRQ_OFFSET);
+       add_preempt_count(cnt);
         barrier();
  }
  #endif /* CONFIG_TRACE_IRQFLAGS */
  
  void local_bh_disable(void)
  {
-       __local_bh_disable((unsigned long)__builtin_return_address(0));
+       __local_bh_disable((unsigned long)__builtin_return_address(0),
+                               SOFTIRQ_DISABLE_OFFSET);
  }
  
  EXPORT_SYMBOL(local_bh_disable);
  
+static void __local_bh_enable(unsigned int cnt)
+{
+       WARN_ON_ONCE(in_irq());
+       WARN_ON_ONCE(!irqs_disabled());
+
+       if (softirq_count() == cnt)
+               trace_softirqs_on((unsigned long)__builtin_return_address(0));
+       sub_preempt_count(cnt);
+}
+
  /*
   * Special-case - softirqs can safely be enabled in
   * cond_resched_softirq(), or by __do_softirq(),
@@ -128,12 +149,7 @@ EXPORT_SYMBOL(local_bh_disable);
   */
  void _local_bh_enable(void)
  {
-       WARN_ON_ONCE(in_irq());
-       WARN_ON_ONCE(!irqs_disabled());
-
-       if (softirq_count() == SOFTIRQ_OFFSET)
-               trace_softirqs_on((unsigned long)__builtin_return_address(0));
-       sub_preempt_count(SOFTIRQ_OFFSET);
+       __local_bh_enable(SOFTIRQ_DISABLE_OFFSET);
  }
  
  EXPORT_SYMBOL(_local_bh_enable);
@@ -147,13 +163,13 @@ static inline void _local_bh_enable_ip(unsigned long ip)
         /*
          * Are softirqs going to be turned on now:
          */
-       if (softirq_count() == SOFTIRQ_OFFSET)
+       if (softirq_count() == SOFTIRQ_DISABLE_OFFSET)
                 trace_softirqs_on(ip);
         /*
          * Keep preemption disabled until we are done with
          * softirq processing:
          */
-       sub_preempt_count(SOFTIRQ_OFFSET - 1);
+       sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1);
  
         if (unlikely(!in_interrupt() && local_softirq_pending()))
                 do_softirq();
@@ -198,7 +214,8 @@ asmlinkage void __do_softirq(void)
         pending = local_softirq_pending();
         account_system_vtime(current);
  
-       __local_bh_disable((unsigned long)__builtin_return_address(0));
+       __local_bh_disable((unsigned long)__builtin_return_address(0),
+                               SOFTIRQ_OFFSET);
         lockdep_softirq_enter();
  
         cpu = smp_processor_id();
@@ -245,7 +262,7 @@ restart:
         lockdep_softirq_exit();
  
         account_system_vtime(current);
-       _local_bh_enable();
+       __local_bh_enable(SOFTIRQ_OFFSET);
  }
  
  #ifndef __ARCH_HAS_DO_SOFTIRQ
@@ -279,10 +296,16 @@ void irq_enter(void)
  
         rcu_irq_enter();
         if (idle_cpu(cpu) && !in_interrupt()) {
-               __irq_enter();
+               /*
+                * Prevent raise_softirq from needlessly waking up ksoftirqd
+                * here, as softirq will be serviced on return from interrupt.
+                */
+               local_bh_disable();
                 tick_check_idle(cpu);
-       } else
-               __irq_enter();
+               _local_bh_enable();
+       }
+
+       __irq_enter();
  }
  
  #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
@@ -696,6 +719,7 @@ static int run_ksoftirqd(void * __bind_cpu)
  {
         set_current_state(TASK_INTERRUPTIBLE);
  
+       current->flags |= PF_KSOFTIRQD;
         while (!kthread_should_stop()) {
                 preempt_disable();
                 if (!local_softirq_pending()) {
@@ -886,17 +910,14 @@ int __init __weak early_irq_init(void)
         return 0;
  }
  
+#ifdef CONFIG_GENERIC_HARDIRQS
  int __init __weak arch_probe_nr_irqs(void)
  {
-       return 0;
+       return NR_IRQS_LEGACY;
  }
  
  int __init __weak arch_early_irq_init(void)
  {
         return 0;
  }
-
-int __weak arch_init_chip_data(struct irq_desc *desc, int node)
-{
-       return 0;
-}
+#endif
diff --git a/kernel/srcu.c b/kernel/srcu.c

index 2980da3..c71e075 100644 (file)
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -46,11 +46,9 @@ static int init_srcu_struct_fields(struct srcu_struct *sp)
  int __init_srcu_struct(struct srcu_struct *sp, const char *name,
                        struct lock_class_key *key)
  {
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
         /* Don't re-initialize a lock while it is held. */
         debug_check_no_locks_freed((void *)sp, sizeof(*sp));
         lockdep_init_map(&sp->dep_map, name, key, 0);
-#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
         return init_srcu_struct_fields(sp);
  }
  EXPORT_SYMBOL_GPL(__init_srcu_struct);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c

index 4372ccb..090c288 100644 (file)
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -287,11 +287,12 @@ repeat:
         goto repeat;
  }
  
+extern void sched_set_stop_task(int cpu, struct task_struct *stop);
+
  /* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */
  static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
                                            unsigned long action, void *hcpu)
  {
-       struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
         unsigned int cpu = (unsigned long)hcpu;
         struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
         struct task_struct *p;
@@ -304,13 +305,13 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
                                    cpu);
                 if (IS_ERR(p))
                         return NOTIFY_BAD;
-               sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
                 get_task_struct(p);
+               kthread_bind(p, cpu);
+               sched_set_stop_task(cpu, p);
                 stopper->thread = p;
                 break;
  
         case CPU_ONLINE:
-               kthread_bind(stopper->thread, cpu);
                 /* strictly unnecessary, as first user will wake it */
                 wake_up_process(stopper->thread);
                 /* mark enabled */
@@ -325,6 +326,7 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
         {
                 struct cpu_stop_work *work;
  
+               sched_set_stop_task(cpu, NULL);
                 /* kill the stopper */
                 kthread_stop(stopper->thread);
                 /* drain remaining works */
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c

index c631168..d232189 100644 (file)
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -149,10 +149,18 @@ static void ntp_update_offset(long offset)
         time_reftime = get_seconds();
  
         offset64    = offset;
-       freq_adj    = (offset64 * secs) <<
-                       (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant));
+       freq_adj    = ntp_update_offset_fll(offset64, secs);
  
-       freq_adj    += ntp_update_offset_fll(offset64, secs);
+       /*
+        * Clamp update interval to reduce PLL gain with low
+        * sampling rate (e.g. intermittent network connection)
+        * to avoid instability.
+        */
+       if (unlikely(secs > 1 << (SHIFT_PLL + 1 + time_constant)))
+               secs = 1 << (SHIFT_PLL + 1 + time_constant);
+
+       freq_adj    += (offset64 * secs) <<
+                       (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant));
  
         freq_adj    = min(freq_adj + time_freq, MAXFREQ_SCALED);
  
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c

index 65fb077..ebd80d5 100644 (file)
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1638,8 +1638,8 @@ ftrace_failures_open(struct inode *inode, struct file *file)
  
         ret = ftrace_avail_open(inode, file);
         if (!ret) {
-               m = (struct seq_file *)file->private_data;
-               iter = (struct ftrace_iterator *)m->private;
+               m = file->private_data;
+               iter = m->private;
                 iter->flags = FTRACE_ITER_FAILURES;
         }
  
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c

index 9ec59f5..001bcd2 100644 (file)
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2196,7 +2196,7 @@ int tracing_open_generic(struct inode *inode, struct file *filp)
  
  static int tracing_release(struct inode *inode, struct file *file)
  {
-       struct seq_file *m = (struct seq_file *)file->private_data;
+       struct seq_file *m = file->private_data;
         struct trace_iterator *iter;
         int cpu;
  
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h

index d39b3c5..9021f8c 100644 (file)
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -343,6 +343,10 @@ void trace_function(struct trace_array *tr,
                     unsigned long ip,
                     unsigned long parent_ip,
                     unsigned long flags, int pc);
+void trace_graph_function(struct trace_array *tr,
+                   unsigned long ip,
+                   unsigned long parent_ip,
+                   unsigned long flags, int pc);
  void trace_default_header(struct seq_file *m);
  void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
  int trace_empty(struct trace_iterator *iter);
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c

index ef49e93..76b0598 100644 (file)
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -262,6 +262,34 @@ int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
                 return trace_graph_entry(trace);
  }
  
+static void
+__trace_graph_function(struct trace_array *tr,
+               unsigned long ip, unsigned long flags, int pc)
+{
+       u64 time = trace_clock_local();
+       struct ftrace_graph_ent ent = {
+               .func  = ip,
+               .depth = 0,
+       };
+       struct ftrace_graph_ret ret = {
+               .func     = ip,
+               .depth    = 0,
+               .calltime = time,
+               .rettime  = time,
+       };
+
+       __trace_graph_entry(tr, &ent, flags, pc);
+       __trace_graph_return(tr, &ret, flags, pc);
+}
+
+void
+trace_graph_function(struct trace_array *tr,
+               unsigned long ip, unsigned long parent_ip,
+               unsigned long flags, int pc)
+{
+       __trace_graph_function(tr, ip, flags, pc);
+}
+
  void __trace_graph_return(struct trace_array *tr,
                                 struct ftrace_graph_ret *trace,
                                 unsigned long flags,
@@ -888,12 +916,20 @@ check_irq_entry(struct trace_iterator *iter, u32 flags,
                 unsigned long addr, int depth)
  {
         int cpu = iter->cpu;
+       int *depth_irq;
         struct fgraph_data *data = iter->private;
-       int *depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
  
-       if (flags & TRACE_GRAPH_PRINT_IRQS)
+       /*
+        * If we are either displaying irqs, or we got called as
+        * a graph event and private data does not exist,
+        * then we bypass the irq check.
+        */
+       if ((flags & TRACE_GRAPH_PRINT_IRQS) ||
+           (!data))
                 return 0;
  
+       depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
+
         /*
          * We are inside the irq code
          */
@@ -926,12 +962,20 @@ static int
  check_irq_return(struct trace_iterator *iter, u32 flags, int depth)
  {
         int cpu = iter->cpu;
+       int *depth_irq;
         struct fgraph_data *data = iter->private;
-       int *depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
  
-       if (flags & TRACE_GRAPH_PRINT_IRQS)
+       /*
+        * If we are either displaying irqs, or we got called as
+        * a graph event and private data does not exist,
+        * then we bypass the irq check.
+        */
+       if ((flags & TRACE_GRAPH_PRINT_IRQS) ||
+           (!data))
                 return 0;
  
+       depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
+
         /*
          * We are not inside the irq code.
          */
@@ -1163,7 +1207,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
  
  
  enum print_line_t
-print_graph_function_flags(struct trace_iterator *iter, u32 flags)
+__print_graph_function_flags(struct trace_iterator *iter, u32 flags)
  {
         struct ftrace_graph_ent_entry *field;
         struct fgraph_data *data = iter->private;
@@ -1226,7 +1270,18 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags)
  static enum print_line_t
  print_graph_function(struct trace_iterator *iter)
  {
-       return print_graph_function_flags(iter, tracer_flags.val);
+       return __print_graph_function_flags(iter, tracer_flags.val);
+}
+
+enum print_line_t print_graph_function_flags(struct trace_iterator *iter,
+                                            u32 flags)
+{
+       if (trace_flags & TRACE_ITER_LATENCY_FMT)
+               flags |= TRACE_GRAPH_PRINT_DURATION;
+       else
+               flags |= TRACE_GRAPH_PRINT_ABS_TIME;
+
+       return __print_graph_function_flags(iter, flags);
  }
  
  static enum print_line_t
@@ -1258,7 +1313,7 @@ static void print_lat_header(struct seq_file *s, u32 flags)
         seq_printf(s, "#%.*s|||| /                     \n", size, spaces);
  }
  
-void print_graph_headers_flags(struct seq_file *s, u32 flags)
+static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
  {
         int lat = trace_flags & TRACE_ITER_LATENCY_FMT;
  
@@ -1299,6 +1354,23 @@ void print_graph_headers(struct seq_file *s)
         print_graph_headers_flags(s, tracer_flags.val);
  }
  
+void print_graph_headers_flags(struct seq_file *s, u32 flags)
+{
+       struct trace_iterator *iter = s->private;
+
+       if (trace_flags & TRACE_ITER_LATENCY_FMT) {
+               /* print nothing if the buffers are empty */
+               if (trace_empty(iter))
+                       return;
+
+               print_trace_header(s, iter);
+               flags |= TRACE_GRAPH_PRINT_DURATION;
+       } else
+               flags |= TRACE_GRAPH_PRINT_ABS_TIME;
+
+       __print_graph_headers_flags(s, flags);
+}
+
  void graph_trace_open(struct trace_iterator *iter)
  {
         /* pid and depth on the last trace processed */
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c

index 73a6b06..5cf8c60 100644 (file)
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -87,14 +87,22 @@ static __cacheline_aligned_in_smp   unsigned long max_sequence;
  
  #ifdef CONFIG_FUNCTION_TRACER
  /*
- * irqsoff uses its own tracer function to keep the overhead down:
+ * Prologue for the preempt and irqs off function tracers.
+ *
+ * Returns 1 if it is OK to continue, and data->disabled is
+ *            incremented.
+ *         0 if the trace is to be ignored, and data->disabled
+ *            is kept the same.
+ *
+ * Note, this function is also used outside this ifdef but
+ *  inside the #ifdef of the function graph tracer below.
+ *  This is OK, since the function graph tracer is
+ *  dependent on the function tracer.
   */
-static void
-irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
+static int func_prolog_dec(struct trace_array *tr,
+                          struct trace_array_cpu **data,
+                          unsigned long *flags)
  {
-       struct trace_array *tr = irqsoff_trace;
-       struct trace_array_cpu *data;
-       unsigned long flags;
         long disabled;
         int cpu;
  
@@ -106,18 +114,38 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
          */
         cpu = raw_smp_processor_id();
         if (likely(!per_cpu(tracing_cpu, cpu)))
-               return;
+               return 0;
  
-       local_save_flags(flags);
+       local_save_flags(*flags);
         /* slight chance to get a false positive on tracing_cpu */
-       if (!irqs_disabled_flags(flags))
-               return;
+       if (!irqs_disabled_flags(*flags))
+               return 0;
  
-       data = tr->data[cpu];
-       disabled = atomic_inc_return(&data->disabled);
+       *data = tr->data[cpu];
+       disabled = atomic_inc_return(&(*data)->disabled);
  
         if (likely(disabled == 1))
-               trace_function(tr, ip, parent_ip, flags, preempt_count());
+               return 1;
+
+       atomic_dec(&(*data)->disabled);
+
+       return 0;
+}
+
+/*
+ * irqsoff uses its own tracer function to keep the overhead down:
+ */
+static void
+irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
+{
+       struct trace_array *tr = irqsoff_trace;
+       struct trace_array_cpu *data;
+       unsigned long flags;
+
+       if (!func_prolog_dec(tr, &data, &flags))
+               return;
+
+       trace_function(tr, ip, parent_ip, flags, preempt_count());
  
         atomic_dec(&data->disabled);
  }
@@ -155,30 +183,16 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
         struct trace_array *tr = irqsoff_trace;
         struct trace_array_cpu *data;
         unsigned long flags;
-       long disabled;
         int ret;
-       int cpu;
         int pc;
  
-       cpu = raw_smp_processor_id();
-       if (likely(!per_cpu(tracing_cpu, cpu)))
+       if (!func_prolog_dec(tr, &data, &flags))
                 return 0;
  
-       local_save_flags(flags);
-       /* slight chance to get a false positive on tracing_cpu */
-       if (!irqs_disabled_flags(flags))
-               return 0;
-
-       data = tr->data[cpu];
-       disabled = atomic_inc_return(&data->disabled);
-
-       if (likely(disabled == 1)) {
-               pc = preempt_count();
-               ret = __trace_graph_entry(tr, trace, flags, pc);
-       } else
-               ret = 0;
-
+       pc = preempt_count();
+       ret = __trace_graph_entry(tr, trace, flags, pc);
         atomic_dec(&data->disabled);
+
         return ret;
  }
  
@@ -187,27 +201,13 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace)
         struct trace_array *tr = irqsoff_trace;
         struct trace_array_cpu *data;
         unsigned long flags;
-       long disabled;
-       int cpu;
         int pc;
  
-       cpu = raw_smp_processor_id();
-       if (likely(!per_cpu(tracing_cpu, cpu)))
+       if (!func_prolog_dec(tr, &data, &flags))
                 return;
  
-       local_save_flags(flags);
-       /* slight chance to get a false positive on tracing_cpu */
-       if (!irqs_disabled_flags(flags))
-               return;
-
-       data = tr->data[cpu];
-       disabled = atomic_inc_return(&data->disabled);
-
-       if (likely(disabled == 1)) {
-               pc = preempt_count();
-               __trace_graph_return(tr, trace, flags, pc);
-       }
-
+       pc = preempt_count();
+       __trace_graph_return(tr, trace, flags, pc);
         atomic_dec(&data->disabled);
  }
  
@@ -229,75 +229,33 @@ static void irqsoff_trace_close(struct trace_iterator *iter)
  
  static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
  {
-       u32 flags = GRAPH_TRACER_FLAGS;
-
-       if (trace_flags & TRACE_ITER_LATENCY_FMT)
-               flags |= TRACE_GRAPH_PRINT_DURATION;
-       else
-               flags |= TRACE_GRAPH_PRINT_ABS_TIME;
-
         /*
          * In graph mode call the graph tracer output function,
          * otherwise go with the TRACE_FN event handler
          */
         if (is_graph())
-               return print_graph_function_flags(iter, flags);
+               return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS);
  
         return TRACE_TYPE_UNHANDLED;
  }
  
  static void irqsoff_print_header(struct seq_file *s)
  {
-       if (is_graph()) {
-               struct trace_iterator *iter = s->private;
-               u32 flags = GRAPH_TRACER_FLAGS;
-
-               if (trace_flags & TRACE_ITER_LATENCY_FMT) {
-                       /* print nothing if the buffers are empty */
-                       if (trace_empty(iter))
-                               return;
-
-                       print_trace_header(s, iter);
-                       flags |= TRACE_GRAPH_PRINT_DURATION;
-               } else
-                       flags |= TRACE_GRAPH_PRINT_ABS_TIME;
-
-               print_graph_headers_flags(s, flags);
-       } else
+       if (is_graph())
+               print_graph_headers_flags(s, GRAPH_TRACER_FLAGS);
+       else
                 trace_default_header(s);
  }
  
-static void
-trace_graph_function(struct trace_array *tr,
-                unsigned long ip, unsigned long flags, int pc)
-{
-       u64 time = trace_clock_local();
-       struct ftrace_graph_ent ent = {
-               .func  = ip,
-               .depth = 0,
-       };
-       struct ftrace_graph_ret ret = {
-               .func     = ip,
-               .depth    = 0,
-               .calltime = time,
-               .rettime  = time,
-       };
-
-       __trace_graph_entry(tr, &ent, flags, pc);
-       __trace_graph_return(tr, &ret, flags, pc);
-}
-
  static void
  __trace_function(struct trace_array *tr,
                  unsigned long ip, unsigned long parent_ip,
                  unsigned long flags, int pc)
  {
-       if (!is_graph())
+       if (is_graph())
+               trace_graph_function(tr, ip, parent_ip, flags, pc);
+       else
                 trace_function(tr, ip, parent_ip, flags, pc);
-       else {
-               trace_graph_function(tr, parent_ip, flags, pc);
-               trace_graph_function(tr, ip, flags, pc);
-       }
  }
  
  #else
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c

index 4086eae..7319559 100644 (file)
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -31,48 +31,98 @@ static int                  wakeup_rt;
  static arch_spinlock_t wakeup_lock =
         (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
  
+static void wakeup_reset(struct trace_array *tr);
  static void __wakeup_reset(struct trace_array *tr);
+static int wakeup_graph_entry(struct ftrace_graph_ent *trace);
+static void wakeup_graph_return(struct ftrace_graph_ret *trace);
  
  static int save_lat_flag;
  
+#define TRACE_DISPLAY_GRAPH     1
+
+static struct tracer_opt trace_opts[] = {
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+       /* display latency trace as call graph */
+       { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) },
+#endif
+       { } /* Empty entry */
+};
+
+static struct tracer_flags tracer_flags = {
+       .val  = 0,
+       .opts = trace_opts,
+};
+
+#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH)
+
  #ifdef CONFIG_FUNCTION_TRACER
+
  /*
- * irqsoff uses its own tracer function to keep the overhead down:
+ * Prologue for the wakeup function tracers.
+ *
+ * Returns 1 if it is OK to continue, and preemption
+ *            is disabled and data->disabled is incremented.
+ *         0 if the trace is to be ignored, and preemption
+ *            is not disabled and data->disabled is
+ *            kept the same.
+ *
+ * Note, this function is also used outside this ifdef but
+ *  inside the #ifdef of the function graph tracer below.
+ *  This is OK, since the function graph tracer is
+ *  dependent on the function tracer.
   */
-static void
-wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
+static int
+func_prolog_preempt_disable(struct trace_array *tr,
+                           struct trace_array_cpu **data,
+                           int *pc)
  {
-       struct trace_array *tr = wakeup_trace;
-       struct trace_array_cpu *data;
-       unsigned long flags;
         long disabled;
         int cpu;
-       int pc;
  
         if (likely(!wakeup_task))
-               return;
+               return 0;
  
-       pc = preempt_count();
+       *pc = preempt_count();
         preempt_disable_notrace();
  
         cpu = raw_smp_processor_id();
         if (cpu != wakeup_current_cpu)
                 goto out_enable;
  
-       data = tr->data[cpu];
-       disabled = atomic_inc_return(&data->disabled);
+       *data = tr->data[cpu];
+       disabled = atomic_inc_return(&(*data)->disabled);
         if (unlikely(disabled != 1))
                 goto out;
  
-       local_irq_save(flags);
+       return 1;
  
-       trace_function(tr, ip, parent_ip, flags, pc);
+out:
+       atomic_dec(&(*data)->disabled);
+
+out_enable:
+       preempt_enable_notrace();
+       return 0;
+}
  
+/*
+ * wakeup uses its own tracer function to keep the overhead down:
+ */
+static void
+wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
+{
+       struct trace_array *tr = wakeup_trace;
+       struct trace_array_cpu *data;
+       unsigned long flags;
+       int pc;
+
+       if (!func_prolog_preempt_disable(tr, &data, &pc))
+               return;
+
+       local_irq_save(flags);
+       trace_function(tr, ip, parent_ip, flags, pc);
         local_irq_restore(flags);
  
- out:
         atomic_dec(&data->disabled);
- out_enable:
         preempt_enable_notrace();
  }
  
@@ -82,6 +132,156 @@ static struct ftrace_ops trace_ops __read_mostly =
  };
  #endif /* CONFIG_FUNCTION_TRACER */
  
+/*
+ * Register the wakeup tracer's callbacks with ftrace: trace_ops when
+ * @graph is zero, the graph entry/return pair otherwise.
+ *
+ * tracer_enabled is set to 1 only if registration succeeded and
+ * tracing_is_enabled() is true; in every other case it is cleared.
+ * Returns the result of the registration call.
+ */
+static int start_func_tracer(int graph)
+{
+       int ret;
+
+       if (graph)
+               ret = register_ftrace_graph(&wakeup_graph_return,
+                                           &wakeup_graph_entry);
+       else
+               ret = register_ftrace_function(&trace_ops);
+
+       tracer_enabled = (!ret && tracing_is_enabled()) ? 1 : 0;
+
+       return ret;
+}
+
+/*
+ * Counterpart of start_func_tracer(): clear tracer_enabled, then
+ * unregister the graph callbacks when @graph is non-zero or trace_ops
+ * when it is zero.
+ */
+static void stop_func_tracer(int graph)
+{
+       tracer_enabled = 0;
+
+       if (graph)
+               unregister_ftrace_graph();
+       else
+               unregister_ftrace_function(&trace_ops);
+}
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+/*
+ * set_flag callback of the wakeup tracers.
+ *
+ * Only TRACE_DISPLAY_GRAPH is handled; any other @bit yields -EINVAL.
+ * When the requested state differs from the current one, the callbacks
+ * of the old mode are unregistered, the trace and tracing_max_latency
+ * are reset, and the callbacks of the new mode are registered.
+ *
+ * Returns 0 if nothing had to change, otherwise the result of
+ * start_func_tracer().
+ */
+static int wakeup_set_flag(u32 old_flags, u32 bit, int set)
+{
+
+       if (!(bit & TRACE_DISPLAY_GRAPH))
+               return -EINVAL;
+
+       /*
+        * Already in the requested mode: nothing to do.
+        * NOTE(review): this relies on is_graph() and @set both being
+        * strictly 0 or 1 -- confirm against the is_graph() definition.
+        */
+       if (!(is_graph() ^ set))
+               return 0;
+
+       /* the mode being left is the opposite of the one requested */
+       stop_func_tracer(!set);
+
+       wakeup_reset(wakeup_trace);
+       tracing_max_latency = 0;
+
+       return start_func_tracer(set);
+}
+
+/*
+ * Function-graph entry callback of the wakeup tracer.
+ *
+ * Returns 0 when func_prolog_preempt_disable() declines to trace,
+ * otherwise the value returned by __trace_graph_entry().
+ */
+static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
+{
+       struct trace_array *tr = wakeup_trace;
+       struct trace_array_cpu *data;
+       unsigned long flags;
+       int pc, ret = 0;
+
+       /*
+        * On success the prolog has disabled preemption and incremented
+        * data->disabled; both are undone below.
+        */
+       if (!func_prolog_preempt_disable(tr, &data, &pc))
+               return 0;
+
+       local_save_flags(flags);
+       ret = __trace_graph_entry(tr, trace, flags, pc);
+       atomic_dec(&data->disabled);
+       preempt_enable_notrace();
+
+       return ret;
+}
+
+/*
+ * Function-graph return callback of the wakeup tracer.  Records the
+ * return event unless func_prolog_preempt_disable() declines to trace.
+ */
+static void wakeup_graph_return(struct ftrace_graph_ret *trace)
+{
+       struct trace_array *tr = wakeup_trace;
+       struct trace_array_cpu *data;
+       unsigned long flags;
+       int pc;
+
+       /*
+        * On success the prolog has disabled preemption and incremented
+        * data->disabled; both are undone below.
+        */
+       if (!func_prolog_preempt_disable(tr, &data, &pc))
+               return;
+
+       local_save_flags(flags);
+       __trace_graph_return(tr, trace, flags, pc);
+       atomic_dec(&data->disabled);
+
+       preempt_enable_notrace();
+       return;
+}
+
+/* open callback: only graph mode needs per-iterator setup */
+static void wakeup_trace_open(struct trace_iterator *iter)
+{
+       if (!is_graph())
+               return;
+
+       graph_trace_open(iter);
+}
+
+/* close callback: tear down graph state only if iter->private is set */
+static void wakeup_trace_close(struct trace_iterator *iter)
+{
+       if (!iter->private)
+               return;
+
+       graph_trace_close(iter);
+}
+
+#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC)
+
+/*
+ * print_line callback.  Outside graph mode the entry is reported as
+ * unhandled so that the TRACE_FN event handler takes over; in graph
+ * mode the graph tracer's output function is used.
+ */
+static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
+{
+       if (!is_graph())
+               return TRACE_TYPE_UNHANDLED;
+
+       return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS);
+}
+
+/* print_header callback: graph headers in graph mode, default otherwise */
+static void wakeup_print_header(struct seq_file *s)
+{
+       if (!is_graph()) {
+               trace_default_header(s);
+               return;
+       }
+
+       print_graph_headers_flags(s, GRAPH_TRACER_FLAGS);
+}
+
+/*
+ * Record a function entry, going through trace_graph_function() in
+ * graph mode and through trace_function() otherwise.
+ */
+static void
+__trace_function(struct trace_array *tr,
+                unsigned long ip, unsigned long parent_ip,
+                unsigned long flags, int pc)
+{
+       if (!is_graph()) {
+               trace_function(tr, ip, parent_ip, flags, pc);
+               return;
+       }
+
+       trace_graph_function(tr, ip, parent_ip, flags, pc);
+}
+#else
+#define __trace_function trace_function
+
+/*
+ * Stubs used when CONFIG_FUNCTION_GRAPH_TRACER is not set, so that the
+ * tracer definitions and start_func_tracer() can reference the same
+ * names in both configurations.
+ */
+
+/* no tracer flag can be changed without the graph tracer */
+static int wakeup_set_flag(u32 old_flags, u32 bit, int set)
+{
+       return -EINVAL;
+}
+
+static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
+{
+       return -1;
+}
+
+/* always defer to the default event handlers */
+static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
+{
+       return TRACE_TYPE_UNHANDLED;
+}
+
+static void wakeup_graph_return(struct ftrace_graph_ret *trace) { }
+static void wakeup_print_header(struct seq_file *s) { }
+static void wakeup_trace_open(struct trace_iterator *iter) { }
+static void wakeup_trace_close(struct trace_iterator *iter) { }
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
  /*
   * Should this new latency be reported/recorded?
   */
@@ -152,7 +352,7 @@ probe_wakeup_sched_switch(void *ignore,
         /* The task we are waiting for is waking up */
         data = wakeup_trace->data[wakeup_cpu];
  
-       trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
+       __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
         tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
  
         T0 = data->preempt_timestamp;
@@ -252,7 +452,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
          * is not called by an assembly function  (where as schedule is)
          * it should be safe to use it here.
          */
-       trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
+       __trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
  
  out_locked:
         arch_spin_unlock(&wakeup_lock);
@@ -303,12 +503,8 @@ static void start_wakeup_tracer(struct trace_array *tr)
          */
         smp_wmb();
  
-       register_ftrace_function(&trace_ops);
-
-       if (tracing_is_enabled())
-               tracer_enabled = 1;
-       else
-               tracer_enabled = 0;
+       if (start_func_tracer(is_graph()))
+               printk(KERN_ERR "failed to start wakeup tracer\n");
  
         return;
  fail_deprobe_wake_new:
@@ -320,7 +516,7 @@ fail_deprobe:
  static void stop_wakeup_tracer(struct trace_array *tr)
  {
         tracer_enabled = 0;
-       unregister_ftrace_function(&trace_ops);
+       stop_func_tracer(is_graph());
         unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL);
         unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
         unregister_trace_sched_wakeup(probe_wakeup, NULL);
@@ -379,9 +575,15 @@ static struct tracer wakeup_tracer __read_mostly =
         .start          = wakeup_tracer_start,
         .stop           = wakeup_tracer_stop,
         .print_max      = 1,
+       .print_header   = wakeup_print_header,
+       .print_line     = wakeup_print_line,
+       .flags          = &tracer_flags,
+       .set_flag       = wakeup_set_flag,
  #ifdef CONFIG_FTRACE_SELFTEST
         .selftest    = trace_selftest_startup_wakeup,
  #endif
+       .open           = wakeup_trace_open,
+       .close          = wakeup_trace_close,
         .use_max_tr     = 1,
  };
  
@@ -394,9 +596,15 @@ static struct tracer wakeup_rt_tracer __read_mostly =
         .stop           = wakeup_tracer_stop,
         .wait_pipe      = poll_wait_pipe,
         .print_max      = 1,
+       .print_header   = wakeup_print_header,
+       .print_line     = wakeup_print_line,
+       .flags          = &tracer_flags,
+       .set_flag       = wakeup_set_flag,
  #ifdef CONFIG_FTRACE_SELFTEST
         .selftest    = trace_selftest_startup_wakeup,
  #endif
+       .open           = wakeup_trace_open,
+       .close          = wakeup_trace_close,
         .use_max_tr     = 1,
  };
  
diff --git a/kernel/watchdog.c b/kernel/watchdog.c

index dc8e168..bafba68 100644 (file)
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -196,7 +196,7 @@ static struct perf_event_attr wd_hw_attr = {
  };
  
  /* Callback function for perf event subsystem */
-void watchdog_overflow_callback(struct perf_event *event, int nmi,
+static void watchdog_overflow_callback(struct perf_event *event, int nmi,
                  struct perf_sample_data *data,
                  struct pt_regs *regs)
  {
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug

index e85d549..21ac830 100644 (file)
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -540,6 +540,23 @@ config PROVE_RCU_REPEATEDLY
          disabling, allowing multiple RCU-lockdep warnings to be printed
          on a single reboot.
  
+        Say Y to allow multiple RCU-lockdep warnings per boot.
+
+        Say N if you are unsure.
+
+config SPARSE_RCU_POINTER
+       bool "RCU debugging: sparse-based checks for pointer usage"
+       default n
+       help
+        This feature enables the __rcu sparse annotation for
+        RCU-protected pointers.  This annotation will cause sparse
+        to flag any non-RCU use of annotated pointers.  This can be
+        helpful when debugging RCU usage.  Please note that this feature
+        is not intended to enforce code cleanliness; it is instead merely
+        a debugging aid.
+
+        Say Y to make sparse flag questionable use of RCU-protected pointers.
+
          Say N if you are unsure.
  
  config LOCKDEP
@@ -832,6 +849,30 @@ config RCU_CPU_STALL_DETECTOR
  
           Say Y if you are unsure.
  
+config RCU_CPU_STALL_TIMEOUT
+       int "RCU CPU stall timeout in seconds"
+       depends on RCU_CPU_STALL_DETECTOR
+       range 3 300
+       default 60
+       help
+         If a given RCU grace period extends more than the specified
+         number of seconds, a CPU stall warning is printed.  If the
+         RCU grace period persists, additional CPU stall warnings are
+         printed at more widely spaced intervals.
+
+config RCU_CPU_STALL_DETECTOR_RUNNABLE
+       bool "RCU CPU stall checking starts automatically at boot"
+       depends on RCU_CPU_STALL_DETECTOR
+       default y
+       help
+         If set, start checking for RCU CPU stalls immediately on
+         boot.  Otherwise, RCU CPU stall checking must be manually
+         enabled.
+
+         Say Y if you are unsure.
+
+         Say N if you wish to suppress RCU CPU stall checking during boot.
+
  config RCU_CPU_STALL_VERBOSE
         bool "Print additional per-task information for RCU_CPU_STALL_DETECTOR"
         depends on RCU_CPU_STALL_DETECTOR && TREE_PREEMPT_RCU
diff --git a/lib/radix-tree.c b/lib/radix-tree.c

index efd16fa..6f412ab 100644 (file)
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -49,7 +49,7 @@ struct radix_tree_node {
         unsigned int    height;         /* Height from the bottom */
         unsigned int    count;
         struct rcu_head rcu_head;
-       void            *slots[RADIX_TREE_MAP_SIZE];
+       void __rcu      *slots[RADIX_TREE_MAP_SIZE];
         unsigned long   tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
  };
  
diff --git a/lib/swiotlb.c b/lib/swiotlb.c

index 34e3082..7c06ee5 100644 (file)
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -70,7 +70,7 @@ static unsigned long io_tlb_nslabs;
   */
  static unsigned long io_tlb_overflow = 32*1024;
  
-void *io_tlb_overflow_buffer;
+static void *io_tlb_overflow_buffer;
  
  /*
   * This is a free list describing the number of free entries available from
@@ -147,16 +147,16 @@ void __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
          * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE
          * between io_tlb_start and io_tlb_end.
          */
-       io_tlb_list = alloc_bootmem(io_tlb_nslabs * sizeof(int));
+       io_tlb_list = alloc_bootmem_pages(PAGE_ALIGN(io_tlb_nslabs * sizeof(int)));
         for (i = 0; i < io_tlb_nslabs; i++)
                 io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
         io_tlb_index = 0;
-       io_tlb_orig_addr = alloc_bootmem(io_tlb_nslabs * sizeof(phys_addr_t));
+       io_tlb_orig_addr = alloc_bootmem_pages(PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)));
  
         /*
          * Get the overflow emergency buffer
          */
-       io_tlb_overflow_buffer = alloc_bootmem_low(io_tlb_overflow);
+       io_tlb_overflow_buffer = alloc_bootmem_low_pages(PAGE_ALIGN(io_tlb_overflow));
         if (!io_tlb_overflow_buffer)
                 panic("Cannot allocate SWIOTLB overflow buffer!\n");
         if (verbose)
@@ -182,7 +182,7 @@ swiotlb_init_with_default_size(size_t default_size, int verbose)
         /*
          * Get IO TLB memory from the low pages
          */
-       io_tlb_start = alloc_bootmem_low_pages(bytes);
+       io_tlb_start = alloc_bootmem_low_pages(PAGE_ALIGN(bytes));
         if (!io_tlb_start)
                 panic("Cannot allocate SWIOTLB buffer");
  
@@ -308,13 +308,13 @@ void __init swiotlb_free(void)
                            get_order(io_tlb_nslabs << IO_TLB_SHIFT));
         } else {
                 free_bootmem_late(__pa(io_tlb_overflow_buffer),
-                                 io_tlb_overflow);
+                                 PAGE_ALIGN(io_tlb_overflow));
                 free_bootmem_late(__pa(io_tlb_orig_addr),
-                                 io_tlb_nslabs * sizeof(phys_addr_t));
+                                 PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)));
                 free_bootmem_late(__pa(io_tlb_list),
-                                 io_tlb_nslabs * sizeof(int));
+                                 PAGE_ALIGN(io_tlb_nslabs * sizeof(int)));
                 free_bootmem_late(__pa(io_tlb_start),
-                                 io_tlb_nslabs << IO_TLB_SHIFT);
+                                 PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT));
         }
  }
  
diff --git a/mm/memory.c b/mm/memory.c

index 0e18b4d..98b58fe 100644 (file)
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3185,7 +3185,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
                  * with threads.
                  */
                 if (flags & FAULT_FLAG_WRITE)
-                       flush_tlb_page(vma, address);
+                       flush_tlb_fix_spurious_fault(vma, address);
         }
  unlock:
         pte_unmap_unlock(pte, ptl);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c

index 6b8889d..d8087f0 100644 (file)
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -516,6 +516,15 @@ static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
  /* for per-CPU blocks */
  static void purge_fragmented_blocks_allcpus(void);
  
+/*
+ * Call this before iounmap() when the caller wants the unmapped area
+ * freed immediately rather than lazily.
+ *
+ * It sets vmap_lazy_nr to one more than lazy_max_pages().
+ * NOTE(review): presumably this pushes the lazy-free accounting over
+ * its threshold so that the next unmap triggers a purge -- confirm
+ * against the code that compares vmap_lazy_nr with lazy_max_pages().
+ */
+void set_iounmap_nonlazy(void)
+{
+       atomic_set(&vmap_lazy_nr, lazy_max_pages()+1);
+}
+
  /*
   * Purges all lazily-freed vmap areas.
   *
diff --git a/net/Kconfig b/net/Kconfig

index e926884..55fd82e 100644 (file)
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -293,6 +293,7 @@ source "net/wimax/Kconfig"
  source "net/rfkill/Kconfig"
  source "net/9p/Kconfig"
  source "net/caif/Kconfig"
+source "net/ceph/Kconfig"
  
  
  endif   # if NET
diff --git a/net/Makefile b/net/Makefile

index ea60fbc..6b7bfd7 100644 (file)
--- a/net/Makefile
+++ b/net/Makefile
@@ -68,3 +68,4 @@ obj-$(CONFIG_SYSCTL)          += sysctl_net.o
  endif
  obj-$(CONFIG_WIMAX)            += wimax/
  obj-$(CONFIG_DNS_RESOLVER)     += dns_resolver/
+obj-$(CONFIG_CEPH_LIB)         += ceph/
diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig

new file mode 100644 (file)

index 0000000..ad42404
--- /dev/null
+++ b/net/ceph/Kconfig
@@ -0,0 +1,28 @@
+config CEPH_LIB
+        tristate "Ceph core library (EXPERIMENTAL)"
+       depends on INET && EXPERIMENTAL
+       select LIBCRC32C
+       select CRYPTO_AES
+       select CRYPTO
+       default n
+       help
+         Choose Y or M here to include cephlib, which provides the
+         common functionality to both the Ceph filesystem and
+         to the rados block device (rbd).
+
+         More information at http://ceph.newdream.net/.
+
+         If unsure, say N.
+
+config CEPH_LIB_PRETTYDEBUG
+       bool "Include file:line in ceph debug output"
+       depends on CEPH_LIB
+       default n
+       help
+         If you say Y here, debug output will include a filename and
+         line to aid debugging.  This increases kernel size and slows
+         execution slightly when debug call sites are enabled (e.g.,
+         via CONFIG_DYNAMIC_DEBUG).
+
+         If unsure, say N.
+
diff --git a/net/ceph/Makefile b/net/ceph/Makefile

new file mode 100644 (file)

index 0000000..aab1cab
--- /dev/null
+++ b/net/ceph/Makefile
@@ -0,0 +1,37 @@
+#
+# Makefile for the Ceph core library (libceph).
+#
+
+ifneq ($(KERNELRELEASE),)
+
+obj-$(CONFIG_CEPH_LIB) += libceph.o
+
+libceph-objs := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \
+       mon_client.o \
+       osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
+       debugfs.o \
+       auth.o auth_none.o \
+       crypto.o armor.o \
+       auth_x.o \
+       ceph_fs.o ceph_strings.o ceph_hash.o \
+       pagevec.o
+
+else
+#Otherwise we were called directly from the command
+# line; invoke the kernel build system.
+
+KERNELDIR ?= /lib/modules/$(shell uname -r)/build
+PWD := $(shell pwd)
+
+default: all
+
+all:
+       $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_LIB=m modules
+
+modules_install:
+       $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_LIB=m modules_install
+
+clean:
+       $(MAKE) -C $(KERNELDIR) M=$(PWD) clean
+
+endif
diff --git a/net/ceph/armor.c b/net/ceph/armor.c

new file mode 100644 (file)

index 0000000..eb2a666
--- /dev/null
+++ b/net/ceph/armor.c
@@ -0,0 +1,103 @@
+
+#include <linux/errno.h>
+
+int ceph_armor(char *dst, const char *src, const char *end);
+int ceph_unarmor(char *dst, const char *src, const char *end);
+
+/*
+ * base64 encode/decode.
+ */
+
+/* the 64-character base64 alphabet, indexed by 6-bit value */
+static const char *pem_key =
+       "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+
+/*
+ * Map a 6-bit value to its base64 character.  @c is used as an index
+ * into pem_key without a range check, so callers must pass 0..63.
+ */
+static int encode_bits(int c)
+{
+       return pem_key[c];
+}
+
+/*
+ * Map a base64 character to its 6-bit value.  The padding character
+ * '=' decodes to 0; anything outside the alphabet yields -EINVAL.
+ */
+static int decode_bits(char c)
+{
+       switch (c) {
+       case 'A' ... 'Z':
+               return c - 'A';
+       case 'a' ... 'z':
+               return c - 'a' + 26;
+       case '0' ... '9':
+               return c - '0' + 52;
+       case '+':
+               return 62;
+       case '/':
+               return 63;
+       case '=':
+               return 0;       /* padding: any non-negative value will do */
+       default:
+               return -EINVAL;
+       }
+}
+
+/*
+ * Base64-encode the bytes in [src, end) into dst, emitting a newline
+ * after every 64 output characters.  The output is not NUL-terminated
+ * and dst is not bounds-checked.  Returns the number of characters
+ * written, newlines included.
+ */
+int ceph_armor(char *dst, const char *src, const char *end)
+{
+       char *out = dst;
+       int col = 0;
+
+       while (src < end) {
+               const int have_b = end - src > 1;
+               const int have_c = end - src > 2;
+               const unsigned char a = src[0];
+               const unsigned char b = have_b ? src[1] : 0;
+               const unsigned char c = have_c ? src[2] : 0;
+
+               /* absent input bytes contribute zero bits and '=' padding */
+               *out++ = encode_bits(a >> 2);
+               *out++ = encode_bits(((a & 3) << 4) | (b >> 4));
+               *out++ = have_b ?
+                       encode_bits(((b & 15) << 2) | (c >> 6)) : '=';
+               *out++ = have_c ? encode_bits(c & 63) : '=';
+               src += have_c ? 3 : (have_b ? 2 : 1);
+
+               col += 4;
+               if (col == 64) {
+                       col = 0;
+                       *out++ = '\n';
+               }
+       }
+       return out - dst;
+}
+
+/*
+ * Decode the base64 text in [src, end) into dst.
+ *
+ * Newlines between 4-character groups are skipped, including one at
+ * the very end of the input (ceph_armor() emits a trailing newline
+ * whenever its output length is a multiple of 64 characters).
+ *
+ * dst is not bounds-checked; the caller must provide three bytes of
+ * room per four input characters.
+ *
+ * Returns the number of bytes written, or -EINVAL if the input holds a
+ * character outside the base64 alphabet or ends inside a group.
+ */
+int ceph_unarmor(char *dst, const char *src, const char *end)
+{
+       int olen = 0;
+
+       while (src < end) {
+               int a, b, c, d;
+
+               /*
+                * Re-test the loop condition after skipping a newline:
+                * it may have been the last input character, in which
+                * case decoding is complete rather than truncated.
+                */
+               if (src[0] == '\n') {
+                       src++;
+                       continue;
+               }
+               if (src + 4 > end)
+                       return -EINVAL;
+               a = decode_bits(src[0]);
+               b = decode_bits(src[1]);
+               c = decode_bits(src[2]);
+               d = decode_bits(src[3]);
+               if (a < 0 || b < 0 || c < 0 || d < 0)
+                       return -EINVAL;
+
+               *dst++ = (a << 2) | (b >> 4);
+               if (src[2] == '=')
+                       return olen + 1;
+               *dst++ = ((b & 15) << 4) | (c >> 2);
+               if (src[3] == '=')
+                       return olen + 2;
+               *dst++ = ((c & 3) << 6) | d;
+               olen += 3;
+               src += 4;
+       }
+       return olen;
+}
diff --git a/net/ceph/auth.c b/net/ceph/auth.c

new file mode 100644 (file)

index 0000000..549c1f4
--- /dev/null
+++ b/net/ceph/auth.c
@@ -0,0 +1,259 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/module.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+
+#include <linux/ceph/types.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/messenger.h>
+#include "auth_none.h"
+#include "auth_x.h"
+
+
+/*
+ * authentication protocols offered to the monitor in the hello
+ * message (see ceph_auth_build_hello())
+ */
+static u32 supported_protocols[] = {
+       CEPH_AUTH_NONE,
+       CEPH_AUTH_CEPHX
+};
+
+/*
+ * Install the handler for @protocol on @ac.  Returns the handler's
+ * init result, or -ENOENT for a protocol we do not implement.
+ */
+static int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol)
+{
+       if (protocol == CEPH_AUTH_NONE)
+               return ceph_auth_none_init(ac);
+
+       if (protocol == CEPH_AUTH_CEPHX)
+               return ceph_x_init(ac);
+
+       return -ENOENT;
+}
+
+/*
+ * setup, teardown.
+ */
+/*
+ * Allocate an auth client in the "negotiating" state.
+ *
+ * @name:   entity name; NULL selects CEPH_AUTH_NAME_DEFAULT
+ * @secret: secret string, may be NULL
+ *
+ * Only the pointers are stored, so both strings must outlive the
+ * returned client.  Returns the new client or ERR_PTR(-ENOMEM).
+ */
+struct ceph_auth_client *ceph_auth_init(const char *name, const char *secret)
+{
+       struct ceph_auth_client *ac;
+
+       ac = kzalloc(sizeof(*ac), GFP_NOFS);
+       if (!ac)
+               return ERR_PTR(-ENOMEM);
+
+       ac->negotiating = true;
+       ac->name = name ? name : CEPH_AUTH_NAME_DEFAULT;
+       ac->secret = secret;
+
+       /* report only whether a secret was supplied, never its value */
+       dout("auth_init name '%s' secret %s\n", ac->name,
+            secret ? "(set)" : "(none)");
+       return ac;
+}
+
+/*
+ * Release @ac.  The protocol handler's destroy hook runs first when a
+ * handler is installed.
+ */
+void ceph_auth_destroy(struct ceph_auth_client *ac)
+{
+       dout("auth_destroy %p\n", ac);
+
+       if (ac->ops != NULL)
+               ac->ops->destroy(ac);
+
+       kfree(ac);
+}
+
+/*
+ * Called when reconnecting to the monitor: reset the protocol handler
+ * if one is installed and negotiation had completed, then go back to
+ * the negotiating state.
+ */
+void ceph_auth_reset(struct ceph_auth_client *ac)
+{
+       bool negotiated = ac->ops && !ac->negotiating;
+
+       dout("auth_reset %p\n", ac);
+
+       if (negotiated)
+               ac->ops->reset(ac);
+
+       ac->negotiating = true;
+}
+
+/*
+ * Encode a client entity name at *p: the 32-bit entity type
+ * CEPH_ENTITY_TYPE_CLIENT, the 32-bit length of @name, then the bytes
+ * of @name without a terminator.  *p is advanced past what was
+ * written.  Returns 0, or -ERANGE if the encoding would pass @end.
+ */
+int ceph_entity_name_encode(const char *name, void **p, void *end)
+{
+       int name_len = strlen(name);
+       void *encoded_end = *p + 2*sizeof(u32) + name_len;
+
+       if (encoded_end > end)
+               return -ERANGE;
+
+       ceph_encode_32(p, CEPH_ENTITY_TYPE_CLIENT);
+       ceph_encode_32(p, name_len);
+       ceph_encode_copy(p, name, name_len);
+       return 0;
+}
+
+/*
+ * Initiate protocol negotiation with monitor.  Include entity name
+ * and list supported protocols.
+ *
+ * The message is built in @buf (@len bytes): a monitor request header,
+ * a zero protocol id, a 32-bit length, then a byte of 1, the list of
+ * supported protocols, the entity name and the current global_id.
+ *
+ * Returns the number of bytes used, or -ERANGE if @buf is too small.
+ *
+ * NOTE(review): the header, the protocol id and the length slot are
+ * written before the first ceph_decode_need() check -- presumably
+ * callers always pass a buffer larger than that; confirm.
+ */
+int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)
+{
+       struct ceph_mon_request_header *monhdr = buf;
+       void *p = monhdr + 1, *end = buf + len, *lenp;
+       int i, num;
+       int ret;
+
+       dout("auth_build_hello\n");
+       monhdr->have_version = 0;
+       monhdr->session_mon = cpu_to_le16(-1);
+       monhdr->session_mon_tid = 0;
+
+       ceph_encode_32(&p, 0);  /* no protocol, yet */
+
+       /* reserve the length field; it is filled in once the size is known */
+       lenp = p;
+       p += sizeof(u32);
+
+       ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
+       ceph_encode_8(&p, 1);
+       num = ARRAY_SIZE(supported_protocols);
+       ceph_encode_32(&p, num);
+       ceph_decode_need(&p, end, num * sizeof(u32), bad);
+       for (i = 0; i < num; i++)
+               ceph_encode_32(&p, supported_protocols[i]);
+
+       ret = ceph_entity_name_encode(ac->name, &p, end);
+       if (ret < 0)
+               return ret;
+       ceph_decode_need(&p, end, sizeof(u64), bad);
+       ceph_encode_64(&p, ac->global_id);
+
+       /* length of everything that follows the length field itself */
+       ceph_encode_32(&lenp, p - lenp - sizeof(u32));
+       return p - buf;
+
+bad:
+       return -ERANGE;
+}
+
+/*
+ * Build an auth request for the negotiated protocol in @msg_buf: a
+ * monitor request header, the protocol id, a 32-bit payload length and
+ * the payload produced by the protocol's build_request hook.
+ *
+ * Returns the total number of bytes used, or the negative error from
+ * build_request.
+ *
+ * NOTE(review): the header, protocol id and length field are written
+ * without checking them against @msg_len -- presumably the caller's
+ * buffer is always large enough; confirm.
+ */
+static int ceph_build_auth_request(struct ceph_auth_client *ac,
+                                  void *msg_buf, size_t msg_len)
+{
+       struct ceph_mon_request_header *monhdr = msg_buf;
+       void *p = monhdr + 1;
+       void *end = msg_buf + msg_len;
+       int ret;
+
+       monhdr->have_version = 0;
+       monhdr->session_mon = cpu_to_le16(-1);
+       monhdr->session_mon_tid = 0;
+
+       ceph_encode_32(&p, ac->protocol);
+
+       /* the payload goes after the length field, which is written last */
+       ret = ac->ops->build_request(ac, p + sizeof(u32), end);
+       if (ret < 0) {
+               pr_err("error %d building auth method %s request\n", ret,
+                      ac->ops->name);
+               return ret;
+       }
+       dout(" built request %d bytes\n", ret);
+       ceph_encode_32(&p, ret);
+       return p + ret - msg_buf;
+}
+
+/*
+ * Handle auth message from monitor.
+ *
+ * The reply in @buf (@len bytes) is decoded as: protocol, result,
+ * global_id, payload length, payload, result message length, result
+ * message.  While negotiating, the protocol handler is (re)installed
+ * to match the protocol named by the monitor.  The payload is then
+ * passed to the handler's handle_reply hook.
+ *
+ * Returns 0 when the handler is satisfied, the size of a follow-up
+ * request built in @reply_buf when the handler returns -EAGAIN, or a
+ * negative error.
+ */
+int ceph_handle_auth_reply(struct ceph_auth_client *ac,
+                          void *buf, size_t len,
+                          void *reply_buf, size_t reply_len)
+{
+       void *p = buf;
+       void *end = buf + len;
+       int protocol;
+       s32 result;
+       u64 global_id;
+       void *payload, *payload_end;
+       int payload_len;
+       char *result_msg;
+       int result_msg_len;
+       int ret = -EINVAL;
+
+       dout("handle_auth_reply %p %p\n", p, end);
+       ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad);
+       protocol = ceph_decode_32(&p);
+       result = ceph_decode_32(&p);
+       global_id = ceph_decode_64(&p);
+       payload_len = ceph_decode_32(&p);
+       payload = p;
+       /*
+        * NOTE(review): payload_len is a signed int taken from the wire
+        * and is added to p before any range check; a negative value
+        * would move p backwards.  Confirm this cannot be exploited.
+        */
+       p += payload_len;
+       ceph_decode_need(&p, end, sizeof(u32), bad);
+       result_msg_len = ceph_decode_32(&p);
+       result_msg = p;
+       p += result_msg_len;
+       /* the message must be consumed exactly */
+       if (p != end)
+               goto bad;
+
+       dout(" result %d '%.*s' gid %llu len %d\n", result, result_msg_len,
+            result_msg, global_id, payload_len);
+
+       payload_end = payload + payload_len;
+
+       if (global_id && ac->global_id != global_id) {
+               dout(" set global_id %lld -> %lld\n", ac->global_id, global_id);
+               ac->global_id = global_id;
+       }
+
+       if (ac->negotiating) {
+               /* server does not support our protocols? */
+               if (!protocol && result < 0) {
+                       ret = result;
+                       goto out;
+               }
+               /* set up (new) protocol handler? */
+               if (ac->protocol && ac->protocol != protocol) {
+                       ac->ops->destroy(ac);
+                       ac->protocol = 0;
+                       ac->ops = NULL;
+               }
+               if (ac->protocol != protocol) {
+                       ret = ceph_auth_init_protocol(ac, protocol);
+                       if (ret) {
+                               pr_err("error %d on auth protocol %d init\n",
+                                      ret, protocol);
+                               goto out;
+                       }
+               }
+
+               ac->negotiating = false;
+       }
+
+       /*
+        * NOTE(review): if the monitor sends protocol 0 with a
+        * non-negative result while no handler is installed, ac->ops is
+        * still NULL here -- confirm whether that can happen.
+        */
+       ret = ac->ops->handle_reply(ac, result, payload, payload_end);
+       if (ret == -EAGAIN) {
+               return ceph_build_auth_request(ac, reply_buf, reply_len);
+       } else if (ret) {
+               pr_err("auth method '%s' error %d\n", ac->ops->name, ret);
+               return ret;
+       }
+       return 0;
+
+bad:
+       pr_err("failed to decode auth msg\n");
+out:
+       return ret;
+}
+
+/*
+ * Build the next outgoing auth message in @msg_buf: a hello while no
+ * protocol has been chosen, otherwise a request if the protocol
+ * handler says one is due.  Returns the message size, 0 when there is
+ * nothing to send, or a negative error.
+ */
+int ceph_build_auth(struct ceph_auth_client *ac,
+                   void *msg_buf, size_t msg_len)
+{
+       if (ac->protocol) {
+               BUG_ON(!ac->ops);
+               if (!ac->ops->should_authenticate(ac))
+                       return 0;
+               return ceph_build_auth_request(ac, msg_buf, msg_len);
+       }
+
+       return ceph_auth_build_hello(ac, msg_buf, msg_len);
+}
+
+/*
+ * Ask the protocol handler whether we are authenticated; without a
+ * handler the answer is 0.
+ */
+int ceph_auth_is_authenticated(struct ceph_auth_client *ac)
+{
+       return ac->ops ? ac->ops->is_authenticated(ac) : 0;
+}
diff --git a/net/ceph/auth_none.c b/net/ceph/auth_none.c

new file mode 100644 (file)

index 0000000..214c2bb
--- /dev/null
+++ b/net/ceph/auth_none.c
@@ -0,0 +1,132 @@
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+
+#include <linux/ceph/decode.h>
+#include <linux/ceph/auth.h>
+
+#include "auth_none.h"
+
+/* return to the initial state: starting, with no authorizer built */
+static void reset(struct ceph_auth_client *ac)
+{
+       struct ceph_auth_none_info *ai = ac->private;
+
+       ai->built_authorizer = false;
+       ai->starting = true;
+}
+
+/* free the private state and leave no pointer to it behind in @ac */
+static void destroy(struct ceph_auth_client *ac)
+{
+       void *priv = ac->private;
+
+       ac->private = NULL;
+       kfree(priv);
+}
+
+/* authenticated as soon as the starting phase is over */
+static int is_authenticated(struct ceph_auth_client *ac)
+{
+       struct ceph_auth_none_info *ai = ac->private;
+
+       return ai->starting ? 0 : 1;
+}
+
+/* a request is due only while still in the starting phase */
+static int should_authenticate(struct ceph_auth_client *ac)
+{
+       struct ceph_auth_none_info *ai = ac->private;
+
+       return ai->starting;
+}
+
+/*
+ * the generic auth code decodes the global_id, and we carry no actual
+ * authentication state, so nothing happens here.
+ */
+static int handle_reply(struct ceph_auth_client *ac, int result,
+                       void *buf, void *end)
+{
+       struct ceph_auth_none_info *ai = ac->private;
+
+       /* any reply ends the starting phase; the payload is ignored */
+       ai->starting = false;
+
+       return result;
+}
+
+/*
+ * build an 'authorizer' with our entity_name and global_id.  we can
+ * reuse a single static copy since it is identical for all services
+ * we connect to.
+ *
+ * The authorizer is encoded on first use only and cached in the
+ * per-client info until reset() clears built_authorizer.  On success
+ * the out-parameters point into that cached copy and 0 is returned;
+ * -ERANGE is returned if the encoding does not fit in au->buf.
+ * @peer_type is not used.
+ */
+static int ceph_auth_none_create_authorizer(
+       struct ceph_auth_client *ac, int peer_type,
+       struct ceph_authorizer **a,
+       void **buf, size_t *len,
+       void **reply_buf, size_t *reply_len)
+{
+       struct ceph_auth_none_info *ai = ac->private;
+       struct ceph_none_authorizer *au = &ai->au;
+       void *p, *end;
+       int ret;
+
+       if (!ai->built_authorizer) {
+               p = au->buf;
+               end = p + sizeof(au->buf);
+               ceph_encode_8(&p, 1);
+               /* keep 8 bytes in reserve for the global_id that follows */
+               ret = ceph_entity_name_encode(ac->name, &p, end - 8);
+               if (ret < 0)
+                       goto bad;
+               ceph_decode_need(&p, end, sizeof(u64), bad2);
+               ceph_encode_64(&p, ac->global_id);
+               au->buf_len = p - (void *)au->buf;
+               ai->built_authorizer = true;
+               dout("built authorizer len %d\n", au->buf_len);
+       }
+
+       *a = (struct ceph_authorizer *)au;
+       *buf = au->buf;
+       *len = au->buf_len;
+       *reply_buf = au->reply_buf;
+       /* reply_buf is a zero-length array, so this reports 0 */
+       *reply_len = sizeof(au->reply_buf);
+       return 0;
+
+bad2:
+       ret = -ERANGE;
+bad:
+       return ret;
+}
+
+/*
+ * The authorizer handed out by ceph_auth_none_create_authorizer() is
+ * embedded in the client's private info, so there is nothing to free
+ * per authorizer.
+ */
+static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
+                                     struct ceph_authorizer *a)
+{
+       /* nothing to do */
+}
+
+/* handler table installed on the client by ceph_auth_none_init() */
+static const struct ceph_auth_client_ops ceph_auth_none_ops = {
+       .name = "none",
+       .reset = reset,
+       .destroy = destroy,
+       .is_authenticated = is_authenticated,
+       .should_authenticate = should_authenticate,
+       .handle_reply = handle_reply,
+       .create_authorizer = ceph_auth_none_create_authorizer,
+       .destroy_authorizer = ceph_auth_none_destroy_authorizer,
+};
+
+/*
+ * Install the "none" protocol on @ac: allocate its private state in
+ * the starting phase and hook up the handler table.  Returns 0 or
+ * -ENOMEM.
+ */
+int ceph_auth_none_init(struct ceph_auth_client *ac)
+{
+       struct ceph_auth_none_info *ai;
+
+       dout("ceph_auth_none_init %p\n", ac);
+
+       ai = kzalloc(sizeof(*ai), GFP_NOFS);
+       if (ai == NULL)
+               return -ENOMEM;
+
+       ai->built_authorizer = false;
+       ai->starting = true;
+
+       ac->ops = &ceph_auth_none_ops;
+       ac->private = ai;
+       ac->protocol = CEPH_AUTH_NONE;
+       return 0;
+}
+
diff --git a/net/ceph/auth_none.h b/net/ceph/auth_none.h

new file mode 100644 (file)

index 0000000..ed7d088
--- /dev/null
+++ b/net/ceph/auth_none.h
@@ -0,0 +1,29 @@
+#ifndef _FS_CEPH_AUTH_NONE_H
+#define _FS_CEPH_AUTH_NONE_H
+
+#include <linux/slab.h>
+#include <linux/ceph/auth.h>
+
+/*
+ * null security mode.
+ *
+ * we use a single static authorizer that simply encodes our entity name
+ * and global id.
+ */
+
+struct ceph_none_authorizer {
+       char buf[128];          /* encoded authorizer */
+       int buf_len;            /* number of bytes of buf in use */
+       char reply_buf[0];      /* zero-length: no reply data is kept */
+};
+
+/* private state of the "none" protocol handler */
+struct ceph_auth_none_info {
+       bool starting;          /* true until the first reply is handled */
+       bool built_authorizer;  /* true once au.buf has been encoded */
+       struct ceph_none_authorizer au;   /* we only need one; it's static */
+};
+
+extern int ceph_auth_none_init(struct ceph_auth_client *ac);
+
+#endif
+
diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c

new file mode 100644 (file)

index 0000000..7fd5dfc
--- /dev/null
+++ b/net/ceph/auth_x.c
@@ -0,0 +1,688 @@
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+
+#include <linux/ceph/decode.h>
+#include <linux/ceph/auth.h>
+
+#include "crypto.h"
+#include "auth_x.h"
+#include "auth_x_protocol.h"
+
+#define TEMP_TICKET_BUF_LEN    256
+
+static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed);
+
+/*
+ * Report whether we hold every key in ac->want_keys.
+ *
+ * Tickets are revalidated first, so keys whose ticket has expired are
+ * dropped from xi->have_keys before the comparison.
+ */
+static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
+{
+       struct ceph_x_info *xi = ac->private;
+       int need;
+
+       ceph_x_validate_tickets(ac, &need);
+       dout("ceph_x_is_authenticated want=%d need=%d have=%d\n",
+            ac->want_keys, need, xi->have_keys);
+
+       /* authenticated iff no wanted bit is missing from have_keys */
+       return !(ac->want_keys & ~xi->have_keys);
+}
+
+/*
+ * Report whether any wanted ticket is missing or due for renewal, as
+ * computed by ceph_x_validate_tickets().
+ */
+static int ceph_x_should_authenticate(struct ceph_auth_client *ac)
+{
+       struct ceph_x_info *xi = ac->private;
+       int need;
+
+       ceph_x_validate_tickets(ac, &need);
+       dout("ceph_x_should_authenticate want=%d need=%d have=%d\n",
+            ac->want_keys, need, xi->have_keys);
+
+       return need ? 1 : 0;
+}
+
+/*
+ * Upper bound on the bytes ceph_x_encrypt() needs for an @ilen-byte
+ * payload: a u32 length prefix, the encrypt header, the payload and 16
+ * bytes of slack (presumably cipher block padding -- confirm in crypto.c).
+ */
+static int ceph_x_encrypt_buflen(int ilen)
+{
+       size_t bundle = sizeof(struct ceph_x_encrypt_header) + ilen + 16;
+
+       return bundle + sizeof(u32);
+}
+
+/*
+ * Encrypt @ilen bytes at @ibuf with @secret into @obuf.
+ *
+ * The output is a u32 ciphertext length (written with ceph_encode_32())
+ * followed by the ciphertext of a ceph_x_encrypt_header plus the payload.
+ * @olen is the room available at @obuf.
+ *
+ * Returns the total number of bytes written (length prefix included),
+ * -ERANGE if @obuf cannot even hold the length prefix, or the error
+ * returned by ceph_encrypt2().
+ */
+static int ceph_x_encrypt(struct ceph_crypto_key *secret,
+                         void *ibuf, int ilen, void *obuf, size_t olen)
+{
+       struct ceph_x_encrypt_header head = {
+               .struct_v = 1,
+               .magic = cpu_to_le64(CEPHX_ENC_MAGIC)
+       };
+       size_t len;
+       int ret;
+
+       /*
+        * olen is unsigned: without this check a too-small buffer would
+        * make "olen - sizeof(u32)" wrap to a huge value and tell
+        * ceph_encrypt2() that it has almost unlimited room.
+        */
+       if (olen < sizeof(u32))
+               return -ERANGE;
+       len = olen - sizeof(u32);
+
+       /* ciphertext goes after the slot reserved for the length prefix */
+       ret = ceph_encrypt2(secret, obuf + sizeof(u32), &len,
+                           &head, sizeof(head), ibuf, ilen);
+       if (ret)
+               return ret;
+       ceph_encode_32(&obuf, len);
+       return len + sizeof(u32);
+}
+
+/*
+ * Decrypt one length-prefixed encrypted bundle at *@p into @obuf.
+ *
+ * The bundle is a u32 ciphertext length followed by the ciphertext.  The
+ * decrypted ceph_x_encrypt_header is verified (struct_v 1 and
+ * CEPHX_ENC_MAGIC) and *@p is advanced past the bundle on success.
+ *
+ * Returns the number of payload bytes stored in @obuf, -EINVAL if the
+ * bundle does not fit inside [*@p, @end), -EPERM if the header does not
+ * verify, or the error returned by ceph_decrypt2().
+ */
+static int ceph_x_decrypt(struct ceph_crypto_key *secret,
+                         void **p, void *end, void *obuf, size_t olen)
+{
+       struct ceph_x_encrypt_header head;
+       size_t head_len = sizeof(head);
+       int len, ret;
+
+       /* the length prefix itself must lie inside the input */
+       ceph_decode_32_safe(p, end, len, bad);
+       /*
+        * len comes off the wire into a signed int: a negative value would
+        * slip past a plain "*p + len > end" test, so reject it explicitly.
+        */
+       if (len < 0 || len > end - *p)
+               goto bad;
+
+       dout("ceph_x_decrypt len %d\n", len);
+       ret = ceph_decrypt2(secret, &head, &head_len, obuf, &olen,
+                           *p, len);
+       if (ret)
+               return ret;
+       if (head.struct_v != 1 || le64_to_cpu(head.magic) != CEPHX_ENC_MAGIC)
+               return -EPERM;
+       *p += len;
+       return olen;
+
+bad:
+       return -EINVAL;
+}
+
+/*
+ * Look up the ticket handler for @service in xi->ticket_handlers,
+ * creating and inserting a zeroed one if the tree does not have it yet.
+ *
+ * Returns the handler, or ERR_PTR(-ENOMEM) if a new one cannot be
+ * allocated.
+ */
+static struct ceph_x_ticket_handler *
+get_ticket_handler(struct ceph_auth_client *ac, int service)
+{
+       struct ceph_x_info *xi = ac->private;
+       struct rb_node **link = &xi->ticket_handlers.rb_node;
+       struct rb_node *parent = NULL;
+       struct ceph_x_ticket_handler *th;
+
+       /* walk down to the matching node, or to the empty slot for it */
+       while (*link != NULL) {
+               parent = *link;
+               th = rb_entry(parent, struct ceph_x_ticket_handler, node);
+               if (service == th->service)
+                       return th;
+               if (service < th->service)
+                       link = &parent->rb_left;
+               else
+                       link = &parent->rb_right;
+       }
+
+       /* not found: add it */
+       th = kzalloc(sizeof(*th), GFP_NOFS);
+       if (th == NULL)
+               return ERR_PTR(-ENOMEM);
+       th->service = service;
+       rb_link_node(&th->node, parent, link);
+       rb_insert_color(&th->node, &xi->ticket_handlers);
+       return th;
+}
+
+/*
+ * Unlink @th from the client's ticket handler tree and free it together
+ * with its session key and ticket blob.
+ */
+static void remove_ticket_handler(struct ceph_auth_client *ac,
+                                 struct ceph_x_ticket_handler *th)
+{
+       struct ceph_x_info *xi = ac->private;
+
+       dout("remove_ticket_handler %p %d\n", th, th->service);
+
+       rb_erase(&th->node, &xi->ticket_handlers);
+       if (th->ticket_blob)
+               ceph_buffer_put(th->ticket_blob);
+       ceph_crypto_key_destroy(&th->session_key);
+       kfree(th);
+}
+
+/*
+ * Decode one ticket from a ticket reply and, if every step succeeds,
+ * install it in the ticket handler for its service.
+ *
+ * @p/@end delimit the undecoded part of the reply; *@p is advanced as the
+ * ticket is consumed.  @dbuf and @ticket_buf are caller-provided scratch
+ * buffers of TEMP_TICKET_BUF_LEN bytes each.
+ *
+ * Returns 0 on success, -EINVAL on malformed or oversized input, or the
+ * negative error of the helper that failed.  The handler is only touched
+ * once the whole ticket has been decoded.
+ */
+static int ceph_x_proc_one_ticket(struct ceph_auth_client *ac,
+                                 struct ceph_crypto_key *secret,
+                                 void **p, void *end,
+                                 char *dbuf, char *ticket_buf)
+{
+       struct ceph_x_info *xi = ac->private;
+       int type;
+       u8 tkt_struct_v;
+       struct ceph_x_ticket_handler *th;
+       void *dp, *dend;
+       int dlen;
+       char is_enc;
+       struct timespec validity;
+       struct ceph_crypto_key old_key;
+       void *tp, *tpend;
+       struct ceph_timespec new_validity;
+       struct ceph_crypto_key new_session_key;
+       struct ceph_buffer *new_ticket_blob;
+       unsigned long new_expires, new_renew_after;
+       u64 new_secret_id;
+       int ret;
+
+       ceph_decode_need(p, end, sizeof(u32) + 1, bad);
+
+       type = ceph_decode_32(p);
+       dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));
+
+       tkt_struct_v = ceph_decode_8(p);
+       if (tkt_struct_v != 1)
+               goto bad;
+
+       th = get_ticket_handler(ac, type);
+       if (IS_ERR(th))
+               return PTR_ERR(th);
+
+       /* blob for me */
+       dlen = ceph_x_decrypt(secret, p, end, dbuf, TEMP_TICKET_BUF_LEN);
+       if (dlen <= 0) {
+               /* an empty payload is malformed; never report it as success */
+               return dlen ? dlen : -EINVAL;
+       }
+       dout(" decrypted %d bytes\n", dlen);
+       dp = dbuf;
+       dend = dp + dlen;
+
+       ceph_decode_8_safe(&dp, dend, tkt_struct_v, bad);
+       if (tkt_struct_v != 1)
+               goto bad;
+
+       /* shallow copy: an encrypted service ticket is sealed with the old key */
+       memcpy(&old_key, &th->session_key, sizeof(old_key));
+       ret = ceph_crypto_key_decode(&new_session_key, &dp, dend);
+       if (ret)
+               return ret;
+
+       /* from here on new_session_key must be destroyed on every error */
+       ceph_decode_need(&dp, dend, sizeof(new_validity), bad_key);
+       ceph_decode_copy(&dp, &new_validity, sizeof(new_validity));
+       ceph_decode_timespec(&validity, &new_validity);
+       new_expires = get_seconds() + validity.tv_sec;
+       new_renew_after = new_expires - (validity.tv_sec / 4);
+       dout(" expires=%lu renew_after=%lu\n", new_expires,
+            new_renew_after);
+
+       /* ticket blob for service */
+       ceph_decode_8_safe(p, end, is_enc, bad_key);
+       tp = ticket_buf;
+       if (is_enc) {
+               /* encrypted */
+               dout(" encrypted ticket\n");
+               dlen = ceph_x_decrypt(&old_key, p, end, ticket_buf,
+                                     TEMP_TICKET_BUF_LEN);
+               if (dlen < 0) {
+                       ret = dlen;
+                       goto out_key;
+               }
+               /* the inner length must stay inside the decrypted bytes */
+               tpend = tp + dlen;
+               ceph_decode_32_safe(&tp, tpend, dlen, bad_key);
+               if (dlen < 0 || dlen > tpend - tp)
+                       goto bad_key;
+       } else {
+               /* unencrypted */
+               ceph_decode_32_safe(p, end, dlen, bad_key);
+               /* ticket_buf is a fixed TEMP_TICKET_BUF_LEN scratch buffer */
+               if (dlen < 0 || dlen > TEMP_TICKET_BUF_LEN)
+                       goto bad_key;
+               ceph_decode_need(p, end, dlen, bad_key);
+               ceph_decode_copy(p, ticket_buf, dlen);
+       }
+       tpend = tp + dlen;
+       dout(" ticket blob is %d bytes\n", dlen);
+       ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad_key);
+       ceph_decode_8(&tp);     /* blob struct_v: read but not validated */
+       new_secret_id = ceph_decode_64(&tp);
+       ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend);
+       if (ret)
+               goto out_key;
+
+       /* all is well, update our ticket */
+       ceph_crypto_key_destroy(&th->session_key);
+       if (th->ticket_blob)
+               ceph_buffer_put(th->ticket_blob);
+       th->session_key = new_session_key;
+       th->ticket_blob = new_ticket_blob;
+       th->validity = new_validity;
+       th->secret_id = new_secret_id;
+       th->expires = new_expires;
+       th->renew_after = new_renew_after;
+       dout(" got ticket service %d (%s) secret_id %lld len %d\n",
+            type, ceph_entity_type_name(type), th->secret_id,
+            (int)th->ticket_blob->vec.iov_len);
+       xi->have_keys |= th->service;
+       return 0;
+
+bad_key:
+       ret = -EINVAL;
+out_key:
+       ceph_crypto_key_destroy(&new_session_key);
+       return ret;
+
+bad:
+       return -EINVAL;
+}
+
+/*
+ * Process a ticket reply: a version byte, a ticket count, then that many
+ * tickets, each decoded by ceph_x_proc_one_ticket() using @secret.
+ *
+ * Returns 0 once every ticket has been installed, -ENOMEM if the scratch
+ * buffers cannot be allocated, -EINVAL on a malformed reply, or the first
+ * error reported for a ticket (tickets before it stay installed).
+ */
+static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
+                                   struct ceph_crypto_key *secret,
+                                   void *buf, void *end)
+{
+       int num;
+       void *p = buf;
+       int ret;
+       char *dbuf;
+       char *ticket_buf;
+       u8 reply_struct_v;
+
+       dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
+       if (!dbuf)
+               return -ENOMEM;
+
+       ret = -ENOMEM;
+       ticket_buf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
+       if (!ticket_buf)
+               goto out_dbuf;
+
+       ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
+       reply_struct_v = ceph_decode_8(&p);
+       if (reply_struct_v != 1)
+               goto bad;
+       num = ceph_decode_32(&p);
+       dout("%d tickets\n", num);
+       while (num--) {
+               ret = ceph_x_proc_one_ticket(ac, secret, &p, end,
+                                            dbuf, ticket_buf);
+               if (ret)
+                       goto out;
+       }
+
+       ret = 0;
+out:
+       kfree(ticket_buf);
+out_dbuf:
+       kfree(dbuf);
+       return ret;
+
+bad:
+       ret = -EINVAL;
+       goto out;
+}
+
+/*
+ * (Re)build the authorizer @au for the service handled by @th.
+ *
+ * The authorizer buffer holds a ceph_x_authorize_a (global id, service id
+ * and the ticket blob copied from @th) followed by a ceph_x_authorize_b
+ * carrying a fresh random nonce, encrypted with the ticket's session key.
+ * An existing au->buf is reused when it is large enough, otherwise it is
+ * replaced.
+ *
+ * Returns 0 on success, -ENOMEM if the buffer cannot be allocated, or the
+ * error from ceph_x_encrypt(); in the latter case au->buf is released and
+ * set to NULL.
+ */
+static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
+                                  struct ceph_x_ticket_handler *th,
+                                  struct ceph_x_authorizer *au)
+{
+       int maxlen;
+       struct ceph_x_authorize_a *msg_a;
+       struct ceph_x_authorize_b msg_b;
+       void *p, *end;
+       int ret;
+       int ticket_blob_len =
+               (th->ticket_blob ? th->ticket_blob->vec.iov_len : 0);
+
+       dout("build_authorizer for %s %p\n",
+            ceph_entity_type_name(th->service), au);
+
+       /* worst-case size: part a, the ticket blob and the encrypted part b */
+       maxlen = sizeof(*msg_a) + sizeof(msg_b) +
+               ceph_x_encrypt_buflen(ticket_blob_len);
+       dout("  need len %d\n", maxlen);
+       /* drop a previously allocated buffer that is too small */
+       if (au->buf && au->buf->alloc_len < maxlen) {
+               ceph_buffer_put(au->buf);
+               au->buf = NULL;
+       }
+       if (!au->buf) {
+               au->buf = ceph_buffer_new(maxlen, GFP_NOFS);
+               if (!au->buf)
+                       return -ENOMEM;
+       }
+       au->service = th->service;
+
+       /* part a: unencrypted ids plus the ticket blob */
+       msg_a = au->buf->vec.iov_base;
+       msg_a->struct_v = 1;
+       msg_a->global_id = cpu_to_le64(ac->global_id);
+       msg_a->service_id = cpu_to_le32(th->service);
+       msg_a->ticket_blob.struct_v = 1;
+       msg_a->ticket_blob.secret_id = cpu_to_le64(th->secret_id);
+       msg_a->ticket_blob.blob_len = cpu_to_le32(ticket_blob_len);
+       if (ticket_blob_len) {
+               memcpy(msg_a->ticket_blob.blob, th->ticket_blob->vec.iov_base,
+                      th->ticket_blob->vec.iov_len);
+       }
+       dout(" th %p secret_id %lld %lld\n", th, th->secret_id,
+            le64_to_cpu(msg_a->ticket_blob.secret_id));
+
+       /* part b starts right after the ticket blob */
+       p = msg_a + 1;
+       p += ticket_blob_len;
+       end = au->buf->vec.iov_base + au->buf->vec.iov_len;
+
+       /* the nonce is kept in au so the reply can be checked against it */
+       get_random_bytes(&au->nonce, sizeof(au->nonce));
+       msg_b.struct_v = 1;
+       msg_b.nonce = cpu_to_le64(au->nonce);
+       ret = ceph_x_encrypt(&th->session_key, &msg_b, sizeof(msg_b),
+                            p, end - p);
+       if (ret < 0)
+               goto out_buf;
+       p += ret;
+       /* trim the buffer length to what was actually written */
+       au->buf->vec.iov_len = p - au->buf->vec.iov_base;
+       dout(" built authorizer nonce %llx len %d\n", au->nonce,
+            (int)au->buf->vec.iov_len);
+       BUG_ON(au->buf->vec.iov_len > maxlen);
+       return 0;
+
+out_buf:
+       ceph_buffer_put(au->buf);
+       au->buf = NULL;
+       return ret;
+}
+
+/*
+ * Encode @th's ticket at *@p: a version byte (1), the secret id, then the
+ * ticket blob as a u32 length followed by its bytes (length 0 when there
+ * is no blob).
+ *
+ * Returns 0 on success or -ERANGE if the encoding would run past @end.
+ */
+static int ceph_x_encode_ticket(struct ceph_x_ticket_handler *th,
+                               void **p, void *end)
+{
+       const char *blob = NULL;
+       u32 blob_len = 0;
+
+       if (th->ticket_blob) {
+               blob = th->ticket_blob->vec.iov_base;
+               blob_len = th->ticket_blob->vec.iov_len;
+       }
+
+       ceph_decode_need(p, end, 1 + sizeof(u64), bad);
+       ceph_encode_8(p, 1);
+       ceph_encode_64(p, th->secret_id);
+
+       ceph_encode_32_safe(p, end, blob_len, bad);
+       if (th->ticket_blob) {
+               ceph_encode_copy_safe(p, end, blob, blob_len, bad);
+       }
+
+       return 0;
+bad:
+       return -ERANGE;
+}
+
+/*
+ * Work out which wanted service keys must be (re)requested.
+ *
+ * *@pneed starts as the wanted keys we do not hold.  For every wanted key
+ * we do hold, the key is added to *@pneed once its ticket has passed
+ * renew_after, and it is dropped from xi->have_keys once the ticket has
+ * passed expires.  A key whose handler cannot be obtained is also added
+ * to *@pneed.
+ */
+static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
+{
+       struct ceph_x_info *xi = ac->private;
+       int want = ac->want_keys;
+       int bit;
+
+       *pneed = ac->want_keys & ~(xi->have_keys);
+
+       for (bit = 1; bit <= want; bit <<= 1) {
+               struct ceph_x_ticket_handler *th;
+
+               /* only inspect wanted services that are not already needed */
+               if (!(ac->want_keys & bit) || (*pneed & bit))
+                       continue;
+
+               th = get_ticket_handler(ac, bit);
+               if (IS_ERR(th)) {
+                       *pneed |= bit;
+                       continue;
+               }
+
+               if (get_seconds() >= th->renew_after)
+                       *pneed |= bit;
+               if (get_seconds() >= th->expires)
+                       xi->have_keys &= ~bit;
+       }
+}
+
+
+/*
+ * Build the next auth request into [@buf, @end).
+ *
+ * If the auth service's own ticket is needed, emit a
+ * CEPHX_GET_AUTH_SESSION_KEY request: a random client challenge, a key
+ * derived by encrypting both challenges with our secret and XOR-folding
+ * the ciphertext into 64 bits, and the old ticket if there is one.
+ * Otherwise, if any other ticket is needed, emit a
+ * CEPHX_GET_PRINCIPAL_SESSION_KEY request: an authorizer for the auth
+ * service followed by the mask of needed keys.
+ *
+ * Returns the number of bytes written, 0 if nothing is needed, -ERANGE if
+ * the request does not fit in the buffer, or a negative error from the
+ * helpers.
+ */
+static int ceph_x_build_request(struct ceph_auth_client *ac,
+                               void *buf, void *end)
+{
+       struct ceph_x_info *xi = ac->private;
+       int need;
+       struct ceph_x_request_header *head = buf;
+       int ret;
+       struct ceph_x_ticket_handler *th =
+               get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
+
+       if (IS_ERR(th))
+               return PTR_ERR(th);
+
+       ceph_x_validate_tickets(ac, &need);
+
+       dout("build_request want %x have %x need %x\n",
+            ac->want_keys, xi->have_keys, need);
+
+       if (need & CEPH_ENTITY_TYPE_AUTH) {
+               struct ceph_x_authenticate *auth = (void *)(head + 1);
+               void *p = auth + 1;
+               struct ceph_x_challenge_blob tmp;
+               char tmp_enc[40];
+               u64 *u;
+
+               if (p > end)
+                       return -ERANGE;
+
+               dout(" get_auth_session_key\n");
+               head->op = cpu_to_le16(CEPHX_GET_AUTH_SESSION_KEY);
+
+               /* encrypt and hash */
+               get_random_bytes(&auth->client_challenge, sizeof(u64));
+               tmp.client_challenge = auth->client_challenge;
+               tmp.server_challenge = cpu_to_le64(xi->server_challenge);
+               ret = ceph_x_encrypt(&xi->secret, &tmp, sizeof(tmp),
+                                    tmp_enc, sizeof(tmp_enc));
+               if (ret < 0)
+                       return ret;
+
+               auth->struct_v = 1;
+               auth->key = 0;
+               /* fold every complete 8-byte word of the ciphertext */
+               for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++)
+                       auth->key ^= *(__le64 *)u;
+               dout(" server_challenge %llx client_challenge %llx key %llx\n",
+                    xi->server_challenge, le64_to_cpu(auth->client_challenge),
+                    le64_to_cpu(auth->key));
+
+               /* now encode the old ticket if exists */
+               ret = ceph_x_encode_ticket(th, &p, end);
+               if (ret < 0)
+                       return ret;
+
+               return p - buf;
+       }
+
+       if (need) {
+               void *p = head + 1;
+               struct ceph_x_service_ticket_request *req;
+               size_t au_len;
+
+               if (p > end)
+                       return -ERANGE;
+               head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY);
+
+               ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer);
+               if (ret)
+                       return ret;
+
+               /*
+                * The authorizer and the trailing request struct are
+                * written without any further checks, so make sure both
+                * fit before touching the caller's buffer.
+                */
+               au_len = xi->auth_authorizer.buf->vec.iov_len;
+               if (au_len + sizeof(*req) > (size_t)(end - p))
+                       return -ERANGE;
+               ceph_encode_copy(&p, xi->auth_authorizer.buf->vec.iov_base,
+                                au_len);
+
+               /* NOTE(review): req->struct_v is not written here -- confirm */
+               req = p;
+               req->keys = cpu_to_le32(need);
+               p += sizeof(*req);
+               return p - buf;
+       }
+
+       return 0;
+}
+
+/*
+ * Handle an auth reply held in [@buf, @end).
+ *
+ * While xi->starting is set the reply is the server's hello: its
+ * challenge is recorded and -EAGAIN is returned so that a request gets
+ * built.  Otherwise the reply starts with a ceph_x_reply_header and
+ * carries tickets, decrypted with our secret (auth session key reply) or
+ * with the auth ticket's session key (service ticket reply).
+ *
+ * Returns @result unchanged if it is non-zero, 0 once every wanted key is
+ * held, -EAGAIN if another round is needed, -EINVAL for a malformed or
+ * unknown reply, or a negative error from ticket processing.
+ */
+static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
+                              void *buf, void *end)
+{
+       struct ceph_x_info *xi = ac->private;
+       struct ceph_x_reply_header *head = buf;
+       struct ceph_x_ticket_handler *th;
+       int len = end - buf;
+       int op;
+       int ret;
+
+       if (result)
+               return result;  /* XXX hmm? */
+
+       if (xi->starting) {
+               /* it's a hello */
+               struct ceph_x_server_challenge *sc = buf;
+
+               if (len != sizeof(*sc))
+                       return -EINVAL;
+               xi->server_challenge = le64_to_cpu(sc->server_challenge);
+               dout("handle_reply got server challenge %llx\n",
+                    xi->server_challenge);
+               xi->starting = false;
+               xi->have_keys &= ~CEPH_ENTITY_TYPE_AUTH;
+               return -EAGAIN;
+       }
+
+       /* the reply header must be fully present before it is read */
+       if (len < (int)sizeof(*head))
+               return -EINVAL;
+
+       op = le16_to_cpu(head->op);
+       result = le32_to_cpu(head->result);
+       dout("handle_reply op %d result %d\n", op, result);
+       switch (op) {
+       case CEPHX_GET_AUTH_SESSION_KEY:
+               /* verify auth key */
+               ret = ceph_x_proc_ticket_reply(ac, &xi->secret,
+                                              buf + sizeof(*head), end);
+               break;
+
+       case CEPHX_GET_PRINCIPAL_SESSION_KEY:
+               th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
+               if (IS_ERR(th))
+                       return PTR_ERR(th);
+               ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
+                                              buf + sizeof(*head), end);
+               break;
+
+       default:
+               return -EINVAL;
+       }
+       if (ret)
+               return ret;
+       if (ac->want_keys == xi->have_keys)
+               return 0;
+       return -EAGAIN;
+}
+
+/*
+ * Allocate and build an authorizer for @peer_type.
+ *
+ * On success *@a owns the new authorizer, *@buf/*@len describe its
+ * encoded bytes and *@reply_buf/*@reply_len the space for the peer's
+ * reply.  Returns 0, -ENOMEM, or the error from the ticket handler lookup
+ * or from ceph_x_build_authorizer().
+ */
+static int ceph_x_create_authorizer(
+       struct ceph_auth_client *ac, int peer_type,
+       struct ceph_authorizer **a,
+       void **buf, size_t *len,
+       void **reply_buf, size_t *reply_len)
+{
+       struct ceph_x_ticket_handler *th = get_ticket_handler(ac, peer_type);
+       struct ceph_x_authorizer *au;
+       int ret;
+
+       if (IS_ERR(th))
+               return PTR_ERR(th);
+
+       au = kzalloc(sizeof(*au), GFP_NOFS);
+       if (au == NULL)
+               return -ENOMEM;
+
+       ret = ceph_x_build_authorizer(ac, th, au);
+       if (ret != 0) {
+               kfree(au);
+               return ret;
+       }
+
+       *reply_len = sizeof(au->reply_buf);
+       *reply_buf = au->reply_buf;
+       *len = au->buf->vec.iov_len;
+       *buf = au->buf->vec.iov_base;
+       *a = (struct ceph_authorizer *)au;
+       return 0;
+}
+
+/*
+ * Check the peer's reply to authorizer @a: decrypt reply_buf with the
+ * service's session key and require it to carry our nonce plus one.
+ *
+ * Returns 0 if the reply verifies, -EPERM if it has the wrong size or
+ * nonce, or a negative error from the handler lookup or decryption.
+ * @len is not used.
+ */
+static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
+                                         struct ceph_authorizer *a, size_t len)
+{
+       struct ceph_x_authorizer *au = (void *)a;
+       struct ceph_x_ticket_handler *th;
+       struct ceph_x_authorize_reply reply;
+       void *p = au->reply_buf;
+       void *end = p + sizeof(au->reply_buf);
+       int ret;
+
+       th = get_ticket_handler(ac, au->service);
+       if (IS_ERR(th))
+               return PTR_ERR(th);
+
+       ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply));
+       if (ret < 0)
+               return ret;
+       if (ret != sizeof(reply))
+               return -EPERM;
+
+       ret = (au->nonce + 1 == le64_to_cpu(reply.nonce_plus_one)) ? 0 : -EPERM;
+       dout("verify_authorizer_reply nonce %llx got %llx ret %d\n",
+            au->nonce, le64_to_cpu(reply.nonce_plus_one), ret);
+       return ret;
+}
+
+/* Drop the authorizer's buffer reference and free the authorizer itself. */
+static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac,
+                                     struct ceph_authorizer *a)
+{
+       struct ceph_x_authorizer *au = (struct ceph_x_authorizer *)a;
+       struct ceph_buffer *encoded = au->buf;
+
+       ceph_buffer_put(encoded);
+       kfree(au);
+}
+
+
+/*
+ * Return to the start of the handshake: forget the server challenge and
+ * expect a hello again.  Tickets already held are left alone.
+ */
+static void ceph_x_reset(struct ceph_auth_client *ac)
+{
+       struct ceph_x_info *xi = ac->private;
+
+       dout("reset\n");
+       xi->server_challenge = 0;
+       xi->starting = true;
+}
+
+/*
+ * Tear down the cephx state of @ac: the secret, every ticket handler,
+ * the cached auth authorizer buffer and finally the state itself.
+ */
+static void ceph_x_destroy(struct ceph_auth_client *ac)
+{
+       struct ceph_x_info *xi = ac->private;
+       struct rb_node *n;
+
+       dout("ceph_x_destroy %p\n", ac);
+       ceph_crypto_key_destroy(&xi->secret);
+
+       /* each removal erases the node, so always restart from the first */
+       for (n = rb_first(&xi->ticket_handlers); n != NULL;
+            n = rb_first(&xi->ticket_handlers)) {
+               remove_ticket_handler(ac,
+                       rb_entry(n, struct ceph_x_ticket_handler, node));
+       }
+
+       if (xi->auth_authorizer.buf)
+               ceph_buffer_put(xi->auth_authorizer.buf);
+
+       kfree(xi);
+       ac->private = NULL;
+}
+
+/*
+ * Throw away the ticket handler (and with it the ticket) for @peer_type.
+ * Nothing happens if the handler cannot be obtained.
+ */
+static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
+                                  int peer_type)
+{
+       struct ceph_x_ticket_handler *th = get_ticket_handler(ac, peer_type);
+
+       if (IS_ERR(th))
+               return;
+       remove_ticket_handler(ac, th);
+}
+
+
+/*
+ * Operations table for the cephx ("x") auth protocol; ceph_x_init()
+ * installs it as ac->ops.
+ */
+static const struct ceph_auth_client_ops ceph_x_ops = {
+       .name = "x",
+       .is_authenticated = ceph_x_is_authenticated,
+       .should_authenticate = ceph_x_should_authenticate,
+       .build_request = ceph_x_build_request,
+       .handle_reply = ceph_x_handle_reply,
+       .create_authorizer = ceph_x_create_authorizer,
+       .verify_authorizer_reply = ceph_x_verify_authorizer_reply,
+       .destroy_authorizer = ceph_x_destroy_authorizer,
+       .invalidate_authorizer = ceph_x_invalidate_authorizer,
+       .reset =  ceph_x_reset,
+       .destroy = ceph_x_destroy,
+};
+
+
+/*
+ * Set up @ac to use the cephx auth protocol.
+ *
+ * Allocates the per-client state, unarmors ac->secret into it and points
+ * @ac at the cephx ops table.
+ *
+ * Returns 0 on success, -ENOMEM if the state cannot be allocated, -EINVAL
+ * if no secret is set, or the error from ceph_crypto_key_unarmor().
+ */
+int ceph_x_init(struct ceph_auth_client *ac)
+{
+       struct ceph_x_info *xi;
+       int ret;
+
+       dout("ceph_x_init %p\n", ac);
+       xi = kzalloc(sizeof(*xi), GFP_NOFS);
+       if (xi == NULL)
+               return -ENOMEM;
+
+       if (ac->secret == NULL) {
+               pr_err("no secret set (for auth_x protocol)\n");
+               ret = -EINVAL;
+               goto free_xi;
+       }
+
+       ret = ceph_crypto_key_unarmor(&xi->secret, ac->secret);
+       if (ret != 0)
+               goto free_xi;
+
+       xi->ticket_handlers = RB_ROOT;
+       xi->starting = true;
+
+       ac->ops = &ceph_x_ops;
+       ac->private = xi;
+       ac->protocol = CEPH_AUTH_CEPHX;
+       return 0;
+
+free_xi:
+       kfree(xi);
+       return ret;
+}
+
+
diff --git a/net/ceph/auth_x.h b/net/ceph/auth_x.h

new file mode 100644 (file)

index 0000000..e02da7a
--- /dev/null
+++ b/net/ceph/auth_x.h
@@ -0,0 +1,50 @@
+#ifndef _FS_CEPH_AUTH_X_H
+#define _FS_CEPH_AUTH_X_H
+
+#include <linux/rbtree.h>
+
+#include <linux/ceph/auth.h>
+
+#include "crypto.h"
+#include "auth_x_protocol.h"
+
+/*
+ * Handle ticket for a single service.
+ *
+ * Handlers live in the ceph_x_info.ticket_handlers rbtree, keyed by
+ * service, and are created on demand by get_ticket_handler().
+ */
+struct ceph_x_ticket_handler {
+       struct rb_node node;            /* link in ceph_x_info.ticket_handlers */
+       unsigned service;               /* tree key; also the bit used in have_keys */
+
+       struct ceph_crypto_key session_key;     /* decoded from the ticket reply */
+       struct ceph_timespec validity;  /* validity period as received */
+
+       u64 secret_id;                  /* sent back alongside the ticket blob */
+       struct ceph_buffer *ticket_blob;        /* NULL until a ticket is received */
+
+       /* compared against get_seconds() by ceph_x_validate_tickets() */
+       unsigned long renew_after, expires;
+};
+
+
+/* An authorizer built by ceph_x_build_authorizer() for one service. */
+struct ceph_x_authorizer {
+       struct ceph_buffer *buf;        /* encoded authorizer; NULL if not built */
+       unsigned service;               /* service the authorizer was built for */
+       u64 nonce;                      /* random; the reply must carry nonce + 1 */
+       char reply_buf[128];  /* big enough for encrypted blob */
+};
+
+/* Per-client cephx state; stored in ac->private by ceph_x_init(). */
+struct ceph_x_info {
+       struct ceph_crypto_key secret;  /* unarmored from ac->secret */
+
+       bool starting;                  /* true until the server hello is handled */
+       u64 server_challenge;           /* taken from the server hello */
+
+       unsigned have_keys;             /* bitmask of services we hold a ticket for */
+       struct rb_root ticket_handlers; /* ceph_x_ticket_handler tree, by service */
+
+       /* authorizer for the auth service, reused by ceph_x_build_request() */
+       struct ceph_x_authorizer auth_authorizer;
+};
+
+extern int ceph_x_init(struct ceph_auth_client *ac);
+
+#endif
+
diff --git a/net/ceph/auth_x_protocol.h b/net/ceph/auth_x_protocol.h

new file mode 100644 (file)

index 0000000..671d305
--- /dev/null
+++ b/net/ceph/auth_x_protocol.h
@@ -0,0 +1,90 @@
+#ifndef __FS_CEPH_AUTH_X_PROTOCOL
+#define __FS_CEPH_AUTH_X_PROTOCOL
+
+#define CEPHX_GET_AUTH_SESSION_KEY      0x0100
+#define CEPHX_GET_PRINCIPAL_SESSION_KEY 0x0200
+#define CEPHX_GET_ROTATING_KEY          0x0400
+
+/* common bits */
+/* Ticket as carried inside an authorizer: header plus blob_len bytes of blob. */
+struct ceph_x_ticket_blob {
+       __u8 struct_v;          /* ceph_x_build_authorizer() sets 1 */
+       __le64 secret_id;
+       __le32 blob_len;        /* number of bytes in blob[] */
+       char blob[];
+} __attribute__ ((packed));
+
+
+/* common request/reply headers */
+/* Starts every request built by ceph_x_build_request(). */
+struct ceph_x_request_header {
+       __le16 op;              /* one of the CEPHX_GET_* opcodes */
+} __attribute__ ((packed));
+
+/* Starts every reply except the initial hello; read by ceph_x_handle_reply(). */
+struct ceph_x_reply_header {
+       __le16 op;              /* opcode the reply answers */
+       __le32 result;
+} __attribute__ ((packed));
+
+
+/* authenticate handshake */
+
+/* initial hello (no reply header) */
+/* The whole payload of the hello; its size is checked exactly on receipt. */
+struct ceph_x_server_challenge {
+       __u8 struct_v;
+       __le64 server_challenge;        /* kept in ceph_x_info.server_challenge */
+} __attribute__ ((packed));
+
+/* Body of a CEPHX_GET_AUTH_SESSION_KEY request. */
+struct ceph_x_authenticate {
+       __u8 struct_v;                  /* ceph_x_build_request() sets 1 */
+       __le64 client_challenge;        /* random, chosen per request */
+       __le64 key;                     /* XOR-fold of the encrypted challenge blob */
+       /* ticket blob */
+} __attribute__ ((packed));
+
+/* Follows the authorizer in a CEPHX_GET_PRINCIPAL_SESSION_KEY request. */
+struct ceph_x_service_ticket_request {
+       __u8 struct_v;
+       __le32 keys;            /* bitmask of services whose tickets are needed */
+} __attribute__ ((packed));
+
+/*
+ * Both challenges together; ceph_x_build_request() encrypts this with the
+ * client secret to derive ceph_x_authenticate.key.  Never sent as is.
+ */
+struct ceph_x_challenge_blob {
+       __le64 server_challenge;
+       __le64 client_challenge;
+} __attribute__ ((packed));
+
+
+
+/* authorize handshake */
+
+/*
+ * The authorizer consists of two pieces:
+ *  a - service id, ticket blob
+ *  b - encrypted with session key
+ */
+/* Unencrypted first piece of an authorizer; ends with the ticket blob bytes. */
+struct ceph_x_authorize_a {
+       __u8 struct_v;                  /* ceph_x_build_authorizer() sets 1 */
+       __le64 global_id;               /* from ac->global_id */
+       __le32 service_id;              /* service the ticket is for */
+       struct ceph_x_ticket_blob ticket_blob;
+} __attribute__ ((packed));
+
+/* Second piece of an authorizer; sent encrypted with the session key. */
+struct ceph_x_authorize_b {
+       __u8 struct_v;          /* ceph_x_build_authorizer() sets 1 */
+       __le64 nonce;           /* random value the peer must answer with +1 */
+} __attribute__ ((packed));
+
+/* Decrypted form of the peer's reply to an authorizer. */
+struct ceph_x_authorize_reply {
+       __u8 struct_v;
+       __le64 nonce_plus_one;  /* must equal the authorizer's nonce + 1 */
+} __attribute__ ((packed));
+
+
+/*
+ * encryption bundle
+ */
+#define CEPHX_ENC_MAGIC 0xff009cad8826aa55ull
+
+/*
+ * Encrypted in front of every payload by ceph_x_encrypt() and verified by
+ * ceph_x_decrypt() after decryption.
+ */
+struct ceph_x_encrypt_header {
+       __u8 struct_v;          /* must be 1 */
+       __le64 magic;           /* must be CEPHX_ENC_MAGIC */
+} __attribute__ ((packed));
+
+#endif
diff --git a/net/ceph/buffer.c b/net/ceph/buffer.c

new file mode 100644 (file)

index 0000000..53d8abf
--- /dev/null
+++ b/net/ceph/buffer.c
@@ -0,0 +1,68 @@
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#include <linux/ceph/buffer.h>
+#include <linux/ceph/decode.h>
+
+/*
+ * Allocate a reference-counted buffer with @len bytes of storage.
+ *
+ * The storage comes from kmalloc() when that succeeds and from
+ * __vmalloc() otherwise; is_vmalloc records which one so that the release
+ * path frees it correctly.  Both alloc_len and vec.iov_len start at @len.
+ *
+ * Returns the buffer with its kref initialised, or NULL on allocation
+ * failure.
+ */
+struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
+{
+       struct ceph_buffer *b = kmalloc(sizeof(*b), gfp);
+       void *data;
+
+       if (b == NULL)
+               return NULL;
+
+       /* try the slab quietly first, then fall back to vmalloc */
+       b->is_vmalloc = false;
+       data = kmalloc(len, gfp | __GFP_NOWARN);
+       if (data == NULL) {
+               data = __vmalloc(len, gfp, PAGE_KERNEL);
+               if (data == NULL) {
+                       kfree(b);
+                       return NULL;
+               }
+               b->is_vmalloc = true;
+       }
+
+       b->vec.iov_base = data;
+       b->vec.iov_len = len;
+       b->alloc_len = len;
+       kref_init(&b->kref);
+       dout("buffer_new %p\n", b);
+       return b;
+}
+EXPORT_SYMBOL(ceph_buffer_new);
+
+/*
+ * kref release callback for a ceph_buffer: free the storage with the
+ * allocator that provided it, then the buffer itself.
+ */
+void ceph_buffer_release(struct kref *kref)
+{
+       struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref);
+       void *data = b->vec.iov_base;
+
+       dout("buffer_release %p\n", b);
+       if (data && b->is_vmalloc)
+               vfree(data);
+       else if (data)
+               kfree(data);
+       kfree(b);
+}
+EXPORT_SYMBOL(ceph_buffer_release);
+
+/*
+ * Decode a u32 length followed by that many bytes from *@p into a newly
+ * allocated ceph_buffer stored in *@b, advancing *@p past them.
+ *
+ * Returns 0 on success, -EINVAL if the data would run past @end, or
+ * -ENOMEM if the buffer cannot be allocated (*@b is then NULL).
+ */
+int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end)
+{
+       struct ceph_buffer *buf;
+       size_t len;
+
+       ceph_decode_need(p, end, sizeof(u32), bad);
+       len = ceph_decode_32(p);
+       dout("decode_buffer len %d\n", (int)len);
+       ceph_decode_need(p, end, len, bad);
+
+       buf = ceph_buffer_new(len, GFP_NOFS);
+       *b = buf;
+       if (buf == NULL)
+               return -ENOMEM;
+       ceph_decode_copy(p, buf->vec.iov_base, len);
+       return 0;
+bad:
+       return -EINVAL;
+}
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c

new file mode 100644 (file)

index 0000000..f3e4a13
--- /dev/null
+++ b/net/ceph/ceph_common.c
@@ -0,0 +1,529 @@
+
+#include <linux/ceph/ceph_debug.h>
+#include <linux/backing-dev.h>
+#include <linux/ctype.h>
+#include <linux/fs.h>
+#include <linux/inet.h>
+#include <linux/in6.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/parser.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/statfs.h>
+#include <linux/string.h>
+
+
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/debugfs.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/auth.h>
+
+
+
+/*
+ * Return a pointer to the last path component of the @len byte path
+ * @s, i.e. whatever follows the final '/' (/foo/bar/baz -> baz).
+ * When the path contains no '/', @s itself is returned.
+ */
+const char *ceph_file_part(const char *s, int len)
+{
+       const char *tail;
+
+       /* walk back from the end until the previous byte is a slash */
+       for (tail = s + len; tail != s; tail--) {
+               if (tail[-1] == '/')
+                       break;
+       }
+       return tail;
+}
+EXPORT_SYMBOL(ceph_file_part);
+
+/*
+ * Map a CEPH_MSG_* message type to a short printable name.
+ * Types without an entry yield "unknown".
+ */
+const char *ceph_msg_type_name(int type)
+{
+       switch (type) {
+       case CEPH_MSG_SHUTDOWN: return "shutdown";
+       case CEPH_MSG_PING: return "ping";
+       case CEPH_MSG_AUTH: return "auth";
+       case CEPH_MSG_AUTH_REPLY: return "auth_reply";
+       case CEPH_MSG_MON_MAP: return "mon_map";
+       case CEPH_MSG_MON_GET_MAP: return "mon_get_map";
+       case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe";
+       case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack";
+       case CEPH_MSG_STATFS: return "statfs";
+       case CEPH_MSG_STATFS_REPLY: return "statfs_reply";
+       case CEPH_MSG_MDS_MAP: return "mds_map";
+       case CEPH_MSG_CLIENT_SESSION: return "client_session";
+       case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect";
+       case CEPH_MSG_CLIENT_REQUEST: return "client_request";
+       case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward";
+       case CEPH_MSG_CLIENT_REPLY: return "client_reply";
+       case CEPH_MSG_CLIENT_CAPS: return "client_caps";
+       case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release";
+       case CEPH_MSG_CLIENT_SNAP: return "client_snap";
+       case CEPH_MSG_CLIENT_LEASE: return "client_lease";
+       case CEPH_MSG_OSD_MAP: return "osd_map";
+       case CEPH_MSG_OSD_OP: return "osd_op";
+       case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
+       }
+       /* no entry for this type */
+       return "unknown";
+}
+EXPORT_SYMBOL(ceph_msg_type_name);
+
+/*
+ * Initially learn our fsid, or verify an fsid matches.
+ *
+ * The first call records @fsid in the client, sets up the client's
+ * debugfs entries and marks the fsid as known.  Later calls compare
+ * @fsid against the recorded one.
+ *
+ * Returns 0 if the fsid was learned or matches, -1 on a mismatch.
+ */
+int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
+{
+       if (client->have_fsid) {
+               if (ceph_fsid_compare(&client->fsid, fsid)) {
+                       /*
+                        * End the message with a newline so it is not
+                        * run together with the next log line.
+                        */
+                       pr_err("bad fsid, had %pU got %pU\n",
+                              &client->fsid, fsid);
+                       return -1;
+               }
+       } else {
+               pr_info("client%lld fsid %pU\n", ceph_client_id(client), fsid);
+               memcpy(&client->fsid, fsid, sizeof(*fsid));
+               ceph_debugfs_client_init(client);
+               client->have_fsid = true;
+       }
+       return 0;
+}
+EXPORT_SYMBOL(ceph_check_fsid);
+
+/*
+ * strcmp() that tolerates NULL arguments.
+ *
+ * Two NULLs compare equal.  If only @s2 is NULL the result is -1, if
+ * only @s1 is NULL the result is 1.  Otherwise this is strcmp().
+ */
+static int strcmp_null(const char *s1, const char *s2)
+{
+       if (s1 && s2)
+               return strcmp(s1, s2);
+       if (s1)
+               return -1;
+       return s2 ? 1 : 0;
+}
+
+/*
+ * Compare freshly parsed options against those of an existing client.
+ *
+ * Everything laid out before mon_addr in struct ceph_options is
+ * compared bytewise, then the name and secret strings.  If all of that
+ * is equal, the options match as soon as one of the new monitor
+ * addresses is found in the client's monmap.
+ *
+ * Returns 0 on a match and non-zero otherwise.
+ */
+int ceph_compare_options(struct ceph_options *new_opt,
+                        struct ceph_client *client)
+{
+       struct ceph_options *cur = client->options;
+       int diff;
+       int m;
+
+       diff = memcmp(new_opt, cur, offsetof(struct ceph_options, mon_addr));
+       if (!diff)
+               diff = strcmp_null(new_opt->name, cur->name);
+       if (!diff)
+               diff = strcmp_null(new_opt->secret, cur->secret);
+       if (diff)
+               return diff;
+
+       /* any matching mon ip implies a match */
+       for (m = 0; m < new_opt->num_mon; m++) {
+               if (ceph_monmap_contains(client->monc.monmap,
+                                        &new_opt->mon_addr[m]))
+                       return 0;
+       }
+       return -1;
+}
+EXPORT_SYMBOL(ceph_compare_options);
+
+
+/*
+ * Parse the textual form of an fsid into @fsid.
+ *
+ * Punctuation characters are skipped wherever they appear; the rest of
+ * the string must supply 16 bytes, each written as two hex digits.
+ * Parsing stops at the first position that holds neither.
+ *
+ * Returns 0 once 16 bytes have been stored and -EINVAL otherwise, in
+ * which case @fsid may have been partially overwritten.
+ */
+static int parse_fsid(const char *str, struct ceph_fsid *fsid)
+{
+       int i = 0;
+       char tmp[3];
+       int err = -EINVAL;
+       unsigned int d;         /* "%x" stores through an unsigned int * */
+
+       dout("parse_fsid '%s'\n", str);
+       tmp[2] = 0;
+       while (*str && i < 16) {
+               if (ispunct(*str)) {
+                       str++;
+                       continue;
+               }
+               /* a lone trailing digit fails here: str[1] is the NUL */
+               if (!isxdigit(str[0]) || !isxdigit(str[1]))
+                       break;
+               tmp[0] = str[0];
+               tmp[1] = str[1];
+               if (sscanf(tmp, "%x", &d) < 1)
+                       break;
+               fsid->fsid[i] = d & 0xff;
+               i++;
+               str += 2;
+       }
+
+       if (i == 16)
+               err = 0;
+       /* newline added so the debug line is terminated */
+       dout("parse_fsid ret %d got fsid %pU\n", err, fsid);
+       return err;
+}
+
+/*
+ * ceph options
+ *
+ * The order of the tokens is significant: ceph_parse_options() treats
+ * every token below Opt_last_int as taking an integer argument and
+ * every token between Opt_last_int and Opt_last_string as taking a
+ * string argument.  The remaining tokens take no argument.
+ */
+enum {
+       Opt_osdtimeout,
+       Opt_osdkeepalivetimeout,
+       Opt_mount_timeout,
+       Opt_osd_idle_ttl,
+       Opt_last_int,
+       /* int args above */
+       Opt_fsid,
+       Opt_name,
+       Opt_secret,
+       Opt_ip,
+       Opt_last_string,
+       /* string args above */
+       Opt_noshare,
+       Opt_nocrc,
+};
+
+/*
+ * Patterns handed to match_token() by ceph_parse_options(), one per
+ * option token.  The table ends with a NULL pattern whose token is -1.
+ */
+static match_table_t opt_tokens = {
+       {Opt_osdtimeout, "osdtimeout=%d"},
+       {Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
+       {Opt_mount_timeout, "mount_timeout=%d"},
+       {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
+       /* int args above */
+       {Opt_fsid, "fsid=%s"},
+       {Opt_name, "name=%s"},
+       {Opt_secret, "secret=%s"},
+       {Opt_ip, "ip=%s"},
+       /* string args above */
+       {Opt_noshare, "noshare"},
+       {Opt_nocrc, "nocrc"},
+       {-1, NULL}
+};
+
+/*
+ * Free a ceph_options structure together with the name, secret and
+ * monitor address array that hang off it.
+ */
+void ceph_destroy_options(struct ceph_options *opt)
+{
+       dout("destroy_options %p\n", opt);
+       kfree(opt->name);
+       kfree(opt->secret);
+       /* allocated with kcalloc() in ceph_parse_options() */
+       kfree(opt->mon_addr);
+       kfree(opt);
+}
+EXPORT_SYMBOL(ceph_destroy_options);
+
+/*
+ * Build a ceph_options structure from a device name and an option
+ * string.
+ *
+ * @popt: receives the newly allocated options on success
+ * @options: comma separated options; modified in place by strsep()
+ * @dev_name, @dev_name_end: monitor list, ip1[:port1][,ip2[:port2]...]
+ * @parse_extra_token: optional callback that is offered every option
+ *     not found in opt_tokens; a negative return aborts parsing
+ * @private: opaque argument passed through to @parse_extra_token
+ *
+ * An option that is not recognized and cannot be handed to
+ * @parse_extra_token is rejected with -EINVAL.  An integer option
+ * whose argument does not parse is logged and skipped.
+ *
+ * Returns 0 on success or a negative errno.  On failure the partially
+ * built options are destroyed and *@popt is not written.
+ */
+int ceph_parse_options(struct ceph_options **popt, char *options,
+                      const char *dev_name, const char *dev_name_end,
+                      int (*parse_extra_token)(char *c, void *private),
+                      void *private)
+{
+       struct ceph_options *opt;
+       const char *c;
+       int err = -ENOMEM;
+       substring_t argstr[MAX_OPT_ARGS];
+
+       opt = kzalloc(sizeof(*opt), GFP_KERNEL);
+       if (!opt)
+               return err;
+       opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr),
+                               GFP_KERNEL);
+       if (!opt->mon_addr)
+               goto out;
+
+       dout("parse_options %p options '%s' dev_name '%s'\n", opt, options,
+            dev_name);
+
+       /* start with defaults */
+       opt->flags = CEPH_OPT_DEFAULT;
+       opt->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT;
+       opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
+       opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
+       opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;   /* seconds */
+
+       /* get mon ip(s) */
+       /* ip1[:port1][,ip2[:port2]...] */
+       err = ceph_parse_ips(dev_name, dev_name_end, opt->mon_addr,
+                            CEPH_MAX_MON, &opt->num_mon);
+       if (err < 0)
+               goto out;
+
+       /* parse mount options */
+       while ((c = strsep(&options, ",")) != NULL) {
+               int token, intval, ret;
+               if (!*c)
+                       continue;
+               err = -EINVAL;
+               token = match_token((char *)c, opt_tokens, argstr);
+               if (token < 0 && parse_extra_token) {
+                       /* extra? */
+                       err = parse_extra_token((char *)c, private);
+                       if (err < 0) {
+                               pr_err("bad option at '%s'\n", c);
+                               goto out;
+                       }
+                       continue;
+               }
+               if (token < 0) {
+                       /*
+                        * Unknown option and nobody to pass it to.
+                        * argstr[] holds nothing valid for it, so it
+                        * must not reach match_int() or the switch.
+                        */
+                       pr_err("bad option at '%s'\n", c);
+                       goto out;
+               }
+               if (token < Opt_last_int) {
+                       ret = match_int(&argstr[0], &intval);
+                       if (ret < 0) {
+                               pr_err("bad mount option arg (not int) "
+                                      "at '%s'\n", c);
+                               continue;
+                       }
+                       dout("got int token %d val %d\n", token, intval);
+               } else if (token > Opt_last_int && token < Opt_last_string) {
+                       dout("got string token %d val %s\n", token,
+                            argstr[0].from);
+               } else {
+                       dout("got token %d\n", token);
+               }
+               switch (token) {
+               case Opt_ip:
+                       err = ceph_parse_ips(argstr[0].from,
+                                            argstr[0].to,
+                                            &opt->my_addr,
+                                            1, NULL);
+                       if (err < 0)
+                               goto out;
+                       opt->flags |= CEPH_OPT_MYIP;
+                       break;
+
+               case Opt_fsid:
+                       /* a malformed fsid is ignored: the flag stays clear */
+                       err = parse_fsid(argstr[0].from, &opt->fsid);
+                       if (err == 0)
+                               opt->flags |= CEPH_OPT_FSID;
+                       break;
+               case Opt_name:
+                       /* a repeated option replaces the earlier value */
+                       kfree(opt->name);
+                       opt->name = kstrndup(argstr[0].from,
+                                             argstr[0].to-argstr[0].from,
+                                             GFP_KERNEL);
+                       if (!opt->name) {
+                               err = -ENOMEM;
+                               goto out;
+                       }
+                       break;
+               case Opt_secret:
+                       kfree(opt->secret);
+                       opt->secret = kstrndup(argstr[0].from,
+                                               argstr[0].to-argstr[0].from,
+                                               GFP_KERNEL);
+                       if (!opt->secret) {
+                               err = -ENOMEM;
+                               goto out;
+                       }
+                       break;
+
+                       /* misc */
+               case Opt_osdtimeout:
+                       opt->osd_timeout = intval;
+                       break;
+               case Opt_osdkeepalivetimeout:
+                       opt->osd_keepalive_timeout = intval;
+                       break;
+               case Opt_osd_idle_ttl:
+                       opt->osd_idle_ttl = intval;
+                       break;
+               case Opt_mount_timeout:
+                       opt->mount_timeout = intval;
+                       break;
+
+               case Opt_noshare:
+                       opt->flags |= CEPH_OPT_NOSHARE;
+                       break;
+
+               case Opt_nocrc:
+                       opt->flags |= CEPH_OPT_NOCRC;
+                       break;
+
+               default:
+                       BUG_ON(token);
+               }
+       }
+
+       /* success */
+       *popt = opt;
+       return 0;
+
+out:
+       ceph_destroy_options(opt);
+       return err;
+}
+EXPORT_SYMBOL(ceph_parse_options);
+
+/*
+ * Return the global_id held by the client's monitor auth state.
+ */
+u64 ceph_client_id(struct ceph_client *client)
+{
+       u64 id = client->monc.auth->global_id;
+
+       return id;
+}
+EXPORT_SYMBOL(ceph_client_id);
+
+/*
+ * Create a fresh client instance.
+ *
+ * The client records @opt and @private, starts out without a
+ * messenger, and has its monitor and osd clients initialized.
+ *
+ * Returns the client, or an ERR_PTR() on failure.  @opt is not freed
+ * on the failure path.
+ */
+struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private)
+{
+       struct ceph_client *cl;
+       int ret;
+
+       cl = kzalloc(sizeof(*cl), GFP_KERNEL);
+       if (!cl)
+               return ERR_PTR(-ENOMEM);
+
+       cl->private = private;
+       cl->options = opt;
+
+       mutex_init(&cl->mount_mutex);
+       init_waitqueue_head(&cl->auth_wq);
+       cl->auth_err = 0;
+
+       cl->extra_mon_dispatch = NULL;
+       cl->supported_features = CEPH_FEATURE_SUPPORTED_DEFAULT;
+       cl->required_features = CEPH_FEATURE_REQUIRED_DEFAULT;
+
+       cl->msgr = NULL;
+
+       /* subsystems */
+       ret = ceph_monc_init(&cl->monc, cl);
+       if (ret < 0)
+               goto free_client;
+
+       ret = ceph_osdc_init(&cl->osdc, cl);
+       if (ret < 0) {
+               /* undo the monitor client before bailing out */
+               ceph_monc_stop(&cl->monc);
+               goto free_client;
+       }
+
+       return cl;
+
+free_client:
+       kfree(cl);
+       return ERR_PTR(ret);
+}
+EXPORT_SYMBOL(ceph_create_client);
+
+/*
+ * Tear down a client: stop the osd client, flush the messenger, stop
+ * the monitor client, remove the debugfs entries, destroy the
+ * messenger if one was created, and finally free the options and the
+ * client itself.  The order of these steps matters (see below).
+ */
+void ceph_destroy_client(struct ceph_client *client)
+{
+       dout("destroy_client %p\n", client);
+
+       /* unmount */
+       ceph_osdc_stop(&client->osdc);
+
+       /*
+        * make sure mds and osd connections close out before destroying
+        * the auth module, which is needed to free those connections'
+        * ceph_authorizers.
+        */
+       ceph_msgr_flush();
+
+       ceph_monc_stop(&client->monc);
+
+       ceph_debugfs_client_cleanup(client);
+
+       /* msgr stays NULL until __ceph_open_session() creates it */
+       if (client->msgr)
+               ceph_messenger_destroy(client->msgr);
+
+       ceph_destroy_options(client->options);
+
+       kfree(client);
+       /* only the pointer value is printed; the client is already freed */
+       dout("destroy_client %p done\n", client);
+}
+EXPORT_SYMBOL(ceph_destroy_client);
+
+/*
+ * true if we have both the mon map and the osd map, each with a
+ * non-zero epoch (and have thus joined the cluster)
+ */
+static int have_mon_and_osd_map(struct ceph_client *client)
+{
+       if (!client->monc.monmap || !client->monc.monmap->epoch)
+               return 0;
+       if (!client->osdc.osdmap || !client->osdc.osdmap->epoch)
+               return 0;
+       return 1;
+}
+
+/*
+ * mount: join the ceph cluster.
+ *
+ * Creates the messenger on first use, opens the monitor session and
+ * then waits until both the mon map and the osd map have arrived.
+ *
+ * @started is the jiffies value at which the caller began the attempt.
+ * Together with the mount_timeout option (in seconds; 0 disables the
+ * deadline check) it bounds how long we keep waiting.
+ *
+ * Returns 0 on success, the error from ceph_messenger_create() or
+ * ceph_monc_open_session(), -EIO once the deadline has passed,
+ * -EINTR/-ERESTARTSYS if the wait was interrupted, or
+ * client->auth_err if that has become negative.
+ */
+int __ceph_open_session(struct ceph_client *client, unsigned long started)
+{
+       struct ceph_entity_addr *myaddr = NULL;
+       int err;
+       unsigned long timeout = client->options->mount_timeout * HZ;
+
+       /* initialize the messenger */
+       if (client->msgr == NULL) {
+               if (ceph_test_opt(client, MYIP))
+                       myaddr = &client->options->my_addr;
+               client->msgr = ceph_messenger_create(myaddr,
+                                       client->supported_features,
+                                       client->required_features);
+               if (IS_ERR(client->msgr)) {
+                       /*
+                        * Fetch the errno before clearing the pointer;
+                        * PTR_ERR() of the cleared pointer would be 0
+                        * and the failure would look like success.
+                        */
+                       err = PTR_ERR(client->msgr);
+                       client->msgr = NULL;
+                       return err;
+               }
+               client->msgr->nocrc = ceph_test_opt(client, NOCRC);
+       }
+
+       /* open session, and wait for mon and osd maps */
+       err = ceph_monc_open_session(&client->monc);
+       if (err < 0)
+               return err;
+
+       while (!have_mon_and_osd_map(client)) {
+               err = -EIO;
+               if (timeout && time_after_eq(jiffies, started + timeout))
+                       return err;
+
+               /* wait */
+               dout("mount waiting for mon_map\n");
+               err = wait_event_interruptible_timeout(client->auth_wq,
+                       have_mon_and_osd_map(client) || (client->auth_err < 0),
+                       timeout);
+               if (err == -EINTR || err == -ERESTARTSYS)
+                       return err;
+               if (client->auth_err < 0)
+                       return client->auth_err;
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL(__ceph_open_session);
+
+
+/*
+ * Open a session while holding the client's mount_mutex.  The time at
+ * entry is passed on as the start of the attempt.
+ */
+int ceph_open_session(struct ceph_client *client)
+{
+       unsigned long start = jiffies;  /* note the start time */
+       int err;
+
+       dout("open_session start\n");
+
+       mutex_lock(&client->mount_mutex);
+       err = __ceph_open_session(client, start);
+       mutex_unlock(&client->mount_mutex);
+
+       return err;
+}
+EXPORT_SYMBOL(ceph_open_session);
+
+
+/*
+ * Module init: set up debugfs, then the messenger.  If the messenger
+ * fails to initialize, debugfs is cleaned up again.
+ */
+static int __init init_ceph_lib(void)
+{
+       int err;
+
+       err = ceph_debugfs_init();
+       if (err < 0)
+               return err;
+
+       err = ceph_msgr_init();
+       if (err < 0) {
+               ceph_debugfs_cleanup();
+               return err;
+       }
+
+       pr_info("loaded (mon/osd proto %d/%d, osdmap %d/%d %d/%d)\n",
+               CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL,
+               CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT,
+               CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT);
+
+       return 0;
+}
+
+/*
+ * Module exit: undo init_ceph_lib() in reverse order, messenger first
+ * and debugfs last.
+ */
+static void __exit exit_ceph_lib(void)
+{
+       dout("exit_ceph_lib\n");
+       ceph_msgr_exit();
+       ceph_debugfs_cleanup();
+}
+
+module_init(init_ceph_lib);
+module_exit(exit_ceph_lib);
+
+MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
+MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
+MODULE_AUTHOR("Patience Warnick <patience@newdream.net>");
+MODULE_DESCRIPTION("Ceph filesystem for Linux");
+MODULE_LICENSE("GPL");
diff --git a/net/ceph/ceph_fs.c b/net/ceph/ceph_fs.c

new file mode 100644 (file)

index 0000000..a3a3a31
--- /dev/null
+++ b/net/ceph/ceph_fs.c
@@ -0,0 +1,75 @@
+/*
+ * Some non-inline ceph helpers
+ */
+#include <linux/module.h>
+#include <linux/ceph/types.h>
+
+/*
+ * Return 1 if @layout appears to be valid and 0 otherwise.
+ *
+ * A layout is accepted when the stripe unit and the object size are
+ * both non-zero with none of the bits below CEPH_MIN_STRIPE_UNIT set,
+ * the object size is a multiple of the stripe unit, and the stripe
+ * count is non-zero.
+ */
+int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
+{
+       const __u32 low_bits = CEPH_MIN_STRIPE_UNIT - 1;
+       __u32 stripe_unit = le32_to_cpu(layout->fl_stripe_unit);
+       __u32 stripe_count = le32_to_cpu(layout->fl_stripe_count);
+       __u32 object_size = le32_to_cpu(layout->fl_object_size);
+
+       if (stripe_unit == 0 || (stripe_unit & low_bits) != 0)
+               return 0;
+       if (object_size == 0 || (object_size & low_bits) != 0)
+               return 0;
+       /* stripe_unit is known to be non-zero here, so % is safe */
+       if (object_size < stripe_unit || object_size % stripe_unit != 0)
+               return 0;
+
+       return stripe_count ? 1 : 0;
+}
+
+
+/*
+ * Translate open(2) flags into a CEPH_FILE_MODE_* value.
+ *
+ * O_DIRECTORY (where defined) maps to CEPH_FILE_MODE_PIN.  Otherwise
+ * the access mode selects RD, WR or RDWR, and O_LAZY (where defined)
+ * adds CEPH_FILE_MODE_LAZY.  O_APPEND on a read-only open is treated
+ * as a write-only open.
+ */
+int ceph_flags_to_mode(int flags)
+{
+       int accmode = flags & O_ACCMODE;
+       int mode;
+
+#ifdef O_DIRECTORY  /* fixme */
+       if ((flags & O_DIRECTORY) == O_DIRECTORY)
+               return CEPH_FILE_MODE_PIN;
+#endif
+       /*
+        * O_APPEND implies writing, but only a read-only open needs to
+        * be promoted.  OR-ing O_WRONLY into an O_RDWR open (1 | 2 with
+        * the usual Linux values) yields an access mode that equals
+        * neither O_RDWR nor O_WRONLY, which would make the open fall
+        * through to read-only below.
+        */
+       if ((flags & O_APPEND) == O_APPEND && accmode == O_RDONLY)
+               accmode = O_WRONLY;
+
+       if (accmode == O_RDWR)
+               mode = CEPH_FILE_MODE_RDWR;
+       else if (accmode == O_WRONLY)
+               mode = CEPH_FILE_MODE_WR;
+       else
+               mode = CEPH_FILE_MODE_RD;
+
+#ifdef O_LAZY
+       if (flags & O_LAZY)
+               mode |= CEPH_FILE_MODE_LAZY;
+#endif
+
+       return mode;
+}
+EXPORT_SYMBOL(ceph_flags_to_mode);
+
+/*
+ * Return the capability bits wanted for a CEPH_FILE_MODE_* mode.
+ * CEPH_CAP_PIN is always included; the RD, WR and LAZY mode bits each
+ * add their own group of caps.
+ */
+int ceph_caps_for_mode(int mode)
+{
+       const int rd_caps = CEPH_CAP_FILE_SHARED |
+                           CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE;
+       const int wr_caps = CEPH_CAP_FILE_EXCL |
+                           CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
+                           CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
+                           CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
+       int wanted = CEPH_CAP_PIN;
+
+       if (mode & CEPH_FILE_MODE_RD)
+               wanted |= rd_caps;
+       if (mode & CEPH_FILE_MODE_WR)
+               wanted |= wr_caps;
+       if (mode & CEPH_FILE_MODE_LAZY)
+               wanted |= CEPH_CAP_FILE_LAZYIO;
+
+       return wanted;
+}
+EXPORT_SYMBOL(ceph_caps_for_mode);
diff --git a/net/ceph/ceph_hash.c b/net/ceph/ceph_hash.c

new file mode 100644 (file)

index 0000000..815ef88
--- /dev/null
+++ b/net/ceph/ceph_hash.c
@@ -0,0 +1,118 @@
+
+#include <linux/ceph/types.h>
+
+/*
+ * Robert Jenkins' hash function.
+ * http://burtleburtle.net/bob/hash/evahash.html
+ * This is in the public domain.
+ *
+ * mix() scrambles three values in place.  All three arguments are both
+ * read and assigned, several times each, so they must be plain
+ * variables without side effects.
+ */
+#define mix(a, b, c)                                           \
+       do {                                                    \
+               a = a - b;  a = a - c;  a = a ^ (c >> 13);      \
+               b = b - c;  b = b - a;  b = b ^ (a << 8);       \
+               c = c - a;  c = c - b;  c = c ^ (b >> 13);      \
+               a = a - b;  a = a - c;  a = a ^ (c >> 12);      \
+               b = b - c;  b = b - a;  b = b ^ (a << 16);      \
+               c = c - a;  c = c - b;  c = c ^ (b >> 5);       \
+               a = a - b;  a = a - c;  a = a ^ (c >> 3);       \
+               b = b - c;  b = b - a;  b = b ^ (a << 10);      \
+               c = c - a;  c = c - b;  c = c ^ (b >> 15);      \
+       } while (0)
+
+/*
+ * Hash @length bytes starting at @str with the Jenkins mixing function.
+ *
+ * The key is consumed 12 bytes at a time, each group being folded into
+ * a, b and c as three little-endian 32-bit words followed by a mix().
+ * The remaining 0..11 bytes and the total length are folded in the
+ * same way before a final mix().  The result is the final value of c.
+ */
+unsigned ceph_str_hash_rjenkins(const char *str, unsigned length)
+{
+       const unsigned char *k = (const unsigned char *)str;
+       __u32 a, b, c;  /* the internal state */
+       __u32 len;      /* how many key bytes still need mixing */
+
+       /* Set up the internal state */
+       len = length;
+       a = 0x9e3779b9;      /* the golden ratio; an arbitrary value */
+       b = a;
+       c = 0;               /* variable initialization of internal state */
+
+       /* handle most of the key */
+       while (len >= 12) {
+               a = a + (k[0] + ((__u32)k[1] << 8) + ((__u32)k[2] << 16) +
+                        ((__u32)k[3] << 24));
+               b = b + (k[4] + ((__u32)k[5] << 8) + ((__u32)k[6] << 16) +
+                        ((__u32)k[7] << 24));
+               c = c + (k[8] + ((__u32)k[9] << 8) + ((__u32)k[10] << 16) +
+                        ((__u32)k[11] << 24));
+               mix(a, b, c);
+               k = k + 12;
+               len = len - 12;
+       }
+
+       /* handle the last 11 bytes */
+       c = c + length;
+       /*
+        * Entering at case N adds tail bytes k[N-1] down to k[0]; every
+        * case deliberately falls through to the one below it.
+        */
+       switch (len) {            /* all the case statements fall through */
+       case 11:
+               c = c + ((__u32)k[10] << 24);
+       case 10:
+               c = c + ((__u32)k[9] << 16);
+       case 9:
+               c = c + ((__u32)k[8] << 8);
+               /* the first byte of c is reserved for the length */
+       case 8:
+               b = b + ((__u32)k[7] << 24);
+       case 7:
+               b = b + ((__u32)k[6] << 16);
+       case 6:
+               b = b + ((__u32)k[5] << 8);
+       case 5:
+               b = b + k[4];
+       case 4:
+               a = a + ((__u32)k[3] << 24);
+       case 3:
+               a = a + ((__u32)k[2] << 16);
+       case 2:
+               a = a + ((__u32)k[1] << 8);
+       case 1:
+               a = a + k[0];
+               /* case 0: nothing left to add */
+       }
+       mix(a, b, c);
+
+       return c;
+}
+
+/*
+ * linux dcache hash
+ *
+ * For every byte of the @length byte string @str, add the byte with
+ * its two nibbles spread apart (c << 4 plus c >> 4) to the running
+ * value and multiply by 11.  The result is truncated to unsigned.
+ */
+unsigned ceph_str_hash_linux(const char *str, unsigned length)
+{
+       unsigned long hash = 0;
+       unsigned i;
+
+       for (i = 0; i < length; i++) {
+               unsigned char c = str[i];
+
+               hash += (c << 4) + (c >> 4);
+               hash *= 11;
+       }
+       return hash;
+}
+
+
+/*
+ * Hash the @len byte string @s with the function selected by @type.
+ * An unrecognized @type yields -1 converted to unsigned.
+ */
+unsigned ceph_str_hash(int type, const char *s, unsigned len)
+{
+       if (type == CEPH_STR_HASH_LINUX)
+               return ceph_str_hash_linux(s, len);
+       if (type == CEPH_STR_HASH_RJENKINS)
+               return ceph_str_hash_rjenkins(s, len);
+       return -1;
+}
+
+/*
+ * Return a printable name for a CEPH_STR_HASH_* type, or "unknown".
+ */
+const char *ceph_str_hash_name(int type)
+{
+       if (type == CEPH_STR_HASH_LINUX)
+               return "linux";
+       if (type == CEPH_STR_HASH_RJENKINS)
+               return "rjenkins";
+       return "unknown";
+}
diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c

new file mode 100644 (file)

index 0000000..3fbda04
--- /dev/null
+++ b/net/ceph/ceph_strings.c
@@ -0,0 +1,84 @@
+/*
+ * Ceph string constants
+ */
+#include <linux/module.h>
+#include <linux/ceph/types.h>
+
+/*
+ * Return a printable name for a CEPH_ENTITY_TYPE_* value, or
+ * "unknown" if the type has no entry.
+ */
+const char *ceph_entity_type_name(int type)
+{
+       switch (type) {
+       case CEPH_ENTITY_TYPE_MDS: return "mds";
+       case CEPH_ENTITY_TYPE_OSD: return "osd";
+       case CEPH_ENTITY_TYPE_MON: return "mon";
+       case CEPH_ENTITY_TYPE_CLIENT: return "client";
+       case CEPH_ENTITY_TYPE_AUTH: return "auth";
+       }
+       return "unknown";
+}
+
+/*
+ * Return a printable name for a CEPH_OSD_OP_* opcode, or "???" if the
+ * opcode has no entry.
+ */
+const char *ceph_osd_op_name(int op)
+{
+       switch (op) {
+       case CEPH_OSD_OP_READ: return "read";
+       case CEPH_OSD_OP_STAT: return "stat";
+
+       case CEPH_OSD_OP_MASKTRUNC: return "masktrunc";
+
+       case CEPH_OSD_OP_WRITE: return "write";
+       case CEPH_OSD_OP_DELETE: return "delete";
+       case CEPH_OSD_OP_TRUNCATE: return "truncate";
+       case CEPH_OSD_OP_ZERO: return "zero";
+       case CEPH_OSD_OP_WRITEFULL: return "writefull";
+       case CEPH_OSD_OP_ROLLBACK: return "rollback";
+
+       case CEPH_OSD_OP_APPEND: return "append";
+       case CEPH_OSD_OP_STARTSYNC: return "startsync";
+       case CEPH_OSD_OP_SETTRUNC: return "settrunc";
+       case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc";
+
+       case CEPH_OSD_OP_TMAPUP: return "tmapup";
+       case CEPH_OSD_OP_TMAPGET: return "tmapget";
+       case CEPH_OSD_OP_TMAPPUT: return "tmapput";
+
+       case CEPH_OSD_OP_GETXATTR: return "getxattr";
+       case CEPH_OSD_OP_GETXATTRS: return "getxattrs";
+       case CEPH_OSD_OP_SETXATTR: return "setxattr";
+       case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
+       case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
+       case CEPH_OSD_OP_RMXATTR: return "rmxattr";
+       case CEPH_OSD_OP_CMPXATTR: return "cmpxattr";
+
+       case CEPH_OSD_OP_PULL: return "pull";
+       case CEPH_OSD_OP_PUSH: return "push";
+       case CEPH_OSD_OP_BALANCEREADS: return "balance-reads";
+       case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads";
+       case CEPH_OSD_OP_SCRUB: return "scrub";
+
+       case CEPH_OSD_OP_WRLOCK: return "wrlock";
+       case CEPH_OSD_OP_WRUNLOCK: return "wrunlock";
+       case CEPH_OSD_OP_RDLOCK: return "rdlock";
+       case CEPH_OSD_OP_RDUNLOCK: return "rdunlock";
+       case CEPH_OSD_OP_UPLOCK: return "uplock";
+       case CEPH_OSD_OP_DNLOCK: return "dnlock";
+
+       case CEPH_OSD_OP_CALL: return "call";
+
+       case CEPH_OSD_OP_PGLS: return "pgls";
+
+       default: return "???";
+       }
+}
+
+
+/*
+ * Return a printable name for a POOL_OP_* opcode, or "???" if the
+ * opcode has no entry.
+ */
+const char *ceph_pool_op_name(int op)
+{
+       switch (op) {
+       case POOL_OP_CREATE: return "create";
+       case POOL_OP_DELETE: return "delete";
+       case POOL_OP_AUID_CHANGE: return "auid change";
+       case POOL_OP_CREATE_SNAP: return "create snap";
+       case POOL_OP_DELETE_SNAP: return "delete snap";
+       case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap";
+       case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap";
+       default: return "???";
+       }
+}
diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c

new file mode 100644 (file)

index 0000000..d6ebb13
--- /dev/null
+++ b/net/ceph/crush/crush.c
@@ -0,0 +1,151 @@
+
+#ifdef __KERNEL__
+# include <linux/slab.h>
+#else
+# include <stdlib.h>
+# include <assert.h>
+# define kfree(x) do { if (x) free(x); } while (0)
+# define BUG_ON(x) assert(!(x))
+#endif
+
+#include <linux/crush/crush.h>
+
+/*
+ * Return a printable name for a CRUSH_BUCKET_* algorithm, or
+ * "unknown" if the algorithm has no entry.
+ */
+const char *crush_bucket_alg_name(int alg)
+{
+       switch (alg) {
+       case CRUSH_BUCKET_UNIFORM: return "uniform";
+       case CRUSH_BUCKET_LIST: return "list";
+       case CRUSH_BUCKET_TREE: return "tree";
+       case CRUSH_BUCKET_STRAW: return "straw";
+       }
+       return "unknown";
+}
+
+/**
+ * crush_get_bucket_item_weight - Get weight of an item in given bucket
+ * @b: bucket pointer
+ * @p: item index in bucket
+ *
+ * Returns 0 when @p is not below the bucket size, when the bucket
+ * algorithm is not recognized, and for an even @p in a tree bucket.
+ */
+int crush_get_bucket_item_weight(struct crush_bucket *b, int p)
+{
+       int weight = 0;
+
+       if (p >= b->size)
+               return 0;
+
+       switch (b->alg) {
+       case CRUSH_BUCKET_UNIFORM:
+               weight = ((struct crush_bucket_uniform *)b)->item_weight;
+               break;
+       case CRUSH_BUCKET_LIST:
+               weight = ((struct crush_bucket_list *)b)->item_weights[p];
+               break;
+       case CRUSH_BUCKET_TREE:
+               /* only odd positions are looked up in node_weights */
+               if (p & 1)
+                       weight = ((struct crush_bucket_tree *)b)->node_weights[p];
+               break;
+       case CRUSH_BUCKET_STRAW:
+               weight = ((struct crush_bucket_straw *)b)->item_weights[p];
+               break;
+       }
+       return weight;
+}
+
+/**
+ * crush_calc_parents - Calculate parent vectors for the given crush map.
+ * @map: crush_map pointer
+ *
+ * For every item of every bucket, record the id of the containing
+ * bucket: in device_parents[] for items >= 0, and in bucket_parents[]
+ * at index -1-item for negative items.
+ */
+void crush_calc_parents(struct crush_map *map)
+{
+       int bi, slot, item;
+
+       for (bi = 0; bi < map->max_buckets; bi++) {
+               struct crush_bucket *bucket = map->buckets[bi];
+
+               if (!bucket)
+                       continue;
+               for (slot = 0; slot < bucket->size; slot++) {
+                       item = bucket->items[slot];
+                       /* an item must name a device or a bucket in the map */
+                       BUG_ON(item >= map->max_devices ||
+                              item < -map->max_buckets);
+                       if (item < 0)
+                               map->bucket_parents[-1-item] = bucket->id;
+                       else
+                               map->device_parents[item] = bucket->id;
+               }
+       }
+}
+
+/*
+ * Free a uniform bucket: the arrays in the common header, then the
+ * bucket itself.
+ */
+void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
+{
+       kfree(b->h.items);
+       kfree(b->h.perm);
+       kfree(b);
+}
+
+/*
+ * Free a list bucket: the arrays in the common header, the two weight
+ * arrays, then the bucket itself.
+ */
+void crush_destroy_bucket_list(struct crush_bucket_list *b)
+{
+       kfree(b->h.items);
+       kfree(b->h.perm);
+       kfree(b->sum_weights);
+       kfree(b->item_weights);
+       kfree(b);
+}
+
+/*
+ * Free a tree bucket: the node weights, the arrays in the common
+ * header, then the bucket itself.
+ *
+ * The header arrays are released here just as in the uniform, list and
+ * straw destructors; without that they would be leaked for tree
+ * buckets.
+ */
+void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
+{
+       kfree(b->node_weights);
+       kfree(b->h.perm);
+       kfree(b->h.items);
+       kfree(b);
+}
+
+/*
+ * Free a straw bucket: the arrays in the common header, the item
+ * weights and straws, then the bucket itself.
+ */
+void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
+{
+       kfree(b->h.items);
+       kfree(b->h.perm);
+       kfree(b->item_weights);
+       kfree(b->straws);
+       kfree(b);
+}
+
+/*
+ * Free a bucket with the destructor that matches its algorithm.  A
+ * bucket with an unrecognized algorithm is left untouched.
+ */
+void crush_destroy_bucket(struct crush_bucket *b)
+{
+       if (b->alg == CRUSH_BUCKET_UNIFORM)
+               crush_destroy_bucket_uniform((struct crush_bucket_uniform *)b);
+       else if (b->alg == CRUSH_BUCKET_LIST)
+               crush_destroy_bucket_list((struct crush_bucket_list *)b);
+       else if (b->alg == CRUSH_BUCKET_TREE)
+               crush_destroy_bucket_tree((struct crush_bucket_tree *)b);
+       else if (b->alg == CRUSH_BUCKET_STRAW)
+               crush_destroy_bucket_straw((struct crush_bucket_straw *)b);
+}
+
+/**
+ * crush_destroy - Destroy a crush_map
+ * @map: crush_map pointer
+ *
+ * Frees every bucket and rule that is present, the bucket and rule
+ * arrays, both parent vectors and finally the map itself.
+ */
+void crush_destroy(struct crush_map *map)
+{
+       int i;
+
+       /* buckets */
+       if (map->buckets) {
+               for (i = 0; i < map->max_buckets; i++) {
+                       if (map->buckets[i] != NULL)
+                               crush_destroy_bucket(map->buckets[i]);
+               }
+               kfree(map->buckets);
+       }
+
+       /* rules */
+       if (map->rules) {
+               for (i = 0; i < map->max_rules; i++)
+                       kfree(map->rules[i]);
+               kfree(map->rules);
+       }
+
+       kfree(map->bucket_parents);
+       kfree(map->device_parents);
+       kfree(map);
+}
+
+
diff --git a/net/ceph/crush/hash.c b/net/ceph/crush/hash.c

new file mode 100644 (file)

index 0000000..5bb63e3
--- /dev/null
+++ b/net/ceph/crush/hash.c
@@ -0,0 +1,149 @@
+
+#include <linux/types.h>
+#include <linux/crush/hash.h>
+
+/*
+ * Robert Jenkins' function for mixing 32-bit values
+ * http://burtleburtle.net/bob/hash/evahash.html
+ * a, b = random bits, c = input and output
+ *
+ * All three arguments are assigned to and are expanded many times, so
+ * each one must be a plain variable: it is modified in place and any
+ * side effect in the argument expression would be repeated.
+ */
+#define crush_hashmix(a, b, c) do {                    \
+               a = a-b;  a = a-c;  a = a^(c>>13);      \
+               b = b-c;  b = b-a;  b = b^(a<<8);       \
+               c = c-a;  c = c-b;  c = c^(b>>13);      \
+               a = a-b;  a = a-c;  a = a^(c>>12);      \
+               b = b-c;  b = b-a;  b = b^(a<<16);      \
+               c = c-a;  c = c-b;  c = c^(b>>5);       \
+               a = a-b;  a = a-c;  a = a^(c>>3);       \
+               b = b-c;  b = b-a;  b = b^(a<<10);      \
+               c = c-a;  c = c-b;  c = c^(b>>15);      \
+       } while (0)
+
+/* starting value that every rjenkins1 variant XORs with its inputs */
+#define crush_hash_seed 1315423911
+
+/*
+ * rjenkins1 hash of one 32-bit value.
+ *
+ * The result starts as seed ^ a and is then passed through two mixing
+ * rounds together with the fixed constants x and y.  Each round
+ * rewrites all three of its arguments, so the order of the rounds is
+ * part of the hash definition.
+ */
+static __u32 crush_hash32_rjenkins1(__u32 a)
+{
+       __u32 hash = crush_hash_seed ^ a;
+       /* mix a copy first so the second round still sees the original a */
+       __u32 b = a;
+       __u32 x = 231232;
+       __u32 y = 1232;
+       crush_hashmix(b, x, hash);
+       crush_hashmix(y, a, hash);
+       return hash;
+}
+
+/*
+ * rjenkins1 hash of two 32-bit values.
+ *
+ * Starts from seed ^ a ^ b and applies three mixing rounds.  The
+ * rounds modify a, b, x and y in place, so later rounds operate on
+ * the already-mixed values; the round order must be kept as is.
+ */
+static __u32 crush_hash32_rjenkins1_2(__u32 a, __u32 b)
+{
+       __u32 hash = crush_hash_seed ^ a ^ b;
+       __u32 x = 231232;
+       __u32 y = 1232;
+       crush_hashmix(a, b, hash);
+       crush_hashmix(x, a, hash);
+       crush_hashmix(b, y, hash);
+       return hash;
+}
+
+/*
+ * rjenkins1 hash of three 32-bit values.
+ *
+ * Starts from seed ^ a ^ b ^ c and applies five mixing rounds that
+ * pair the inputs with each other and with the constants x and y.
+ * Every round updates its arguments in place, so the sequence below
+ * defines the hash value.
+ */
+static __u32 crush_hash32_rjenkins1_3(__u32 a, __u32 b, __u32 c)
+{
+       __u32 hash = crush_hash_seed ^ a ^ b ^ c;
+       __u32 x = 231232;
+       __u32 y = 1232;
+       crush_hashmix(a, b, hash);
+       crush_hashmix(c, x, hash);
+       crush_hashmix(y, a, hash);
+       crush_hashmix(b, x, hash);
+       crush_hashmix(y, c, hash);
+       return hash;
+}
+
+/*
+ * rjenkins1 hash of four 32-bit values.
+ *
+ * Starts from seed ^ a ^ b ^ c ^ d and applies six mixing rounds.
+ * The inputs and the constants x and y are rewritten by each round,
+ * so the sequence below defines the hash value.
+ */
+static __u32 crush_hash32_rjenkins1_4(__u32 a, __u32 b, __u32 c, __u32 d)
+{
+       __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d;
+       __u32 x = 231232;
+       __u32 y = 1232;
+       crush_hashmix(a, b, hash);
+       crush_hashmix(c, d, hash);
+       crush_hashmix(a, x, hash);
+       crush_hashmix(y, b, hash);
+       crush_hashmix(c, x, hash);
+       crush_hashmix(y, d, hash);
+       return hash;
+}
+
+/*
+ * rjenkins1 hash of five 32-bit values.
+ *
+ * Starts from seed ^ a ^ b ^ c ^ d ^ e and applies eight mixing
+ * rounds.  The inputs and the constants x and y are rewritten by each
+ * round, so the sequence below defines the hash value.
+ */
+static __u32 crush_hash32_rjenkins1_5(__u32 a, __u32 b, __u32 c, __u32 d,
+                                     __u32 e)
+{
+       __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d ^ e;
+       __u32 x = 231232;
+       __u32 y = 1232;
+       crush_hashmix(a, b, hash);
+       crush_hashmix(c, d, hash);
+       crush_hashmix(e, x, hash);
+       crush_hashmix(y, a, hash);
+       crush_hashmix(b, x, hash);
+       crush_hashmix(y, c, hash);
+       crush_hashmix(d, x, hash);
+       crush_hashmix(y, e, hash);
+       return hash;
+}
+
+
+/*
+ * Hash one 32-bit value with the algorithm selected by @type.
+ * An unrecognized @type yields 0.
+ */
+__u32 crush_hash32(int type, __u32 a)
+{
+       if (type == CRUSH_HASH_RJENKINS1)
+               return crush_hash32_rjenkins1(a);
+
+       return 0;
+}
+
+/*
+ * Hash two 32-bit values with the algorithm selected by @type.
+ * An unrecognized @type yields 0.
+ */
+__u32 crush_hash32_2(int type, __u32 a, __u32 b)
+{
+       if (type == CRUSH_HASH_RJENKINS1)
+               return crush_hash32_rjenkins1_2(a, b);
+
+       return 0;
+}
+
+/*
+ * Hash three 32-bit values with the algorithm selected by @type.
+ * An unrecognized @type yields 0.
+ */
+__u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c)
+{
+       if (type == CRUSH_HASH_RJENKINS1)
+               return crush_hash32_rjenkins1_3(a, b, c);
+
+       return 0;
+}
+
+/*
+ * Hash four 32-bit values with the algorithm selected by @type.
+ * An unrecognized @type yields 0.
+ */
+__u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d)
+{
+       if (type == CRUSH_HASH_RJENKINS1)
+               return crush_hash32_rjenkins1_4(a, b, c, d);
+
+       return 0;
+}
+
+/*
+ * Hash five 32-bit values with the algorithm selected by @type.
+ * An unrecognized @type yields 0.
+ */
+__u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d, __u32 e)
+{
+       if (type == CRUSH_HASH_RJENKINS1)
+               return crush_hash32_rjenkins1_5(a, b, c, d, e);
+
+       return 0;
+}
+
+/*
+ * Human-readable name of hash algorithm @type; "unknown" when the
+ * type is not recognized.
+ */
+const char *crush_hash_name(int type)
+{
+       if (type == CRUSH_HASH_RJENKINS1)
+               return "rjenkins1";
+
+       return "unknown";
+}
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c

new file mode 100644 (file)

index 0000000..42599e3
--- /dev/null
+++ b/net/ceph/crush/mapper.c
@@ -0,0 +1,609 @@
+
+#ifdef __KERNEL__
+# include <linux/string.h>
+# include <linux/slab.h>
+# include <linux/bug.h>
+# include <linux/kernel.h>
+# ifndef dprintk
+#  define dprintk(args...)
+# endif
+#else
+# include <string.h>
+# include <stdio.h>
+# include <stdlib.h>
+# include <assert.h>
+# define BUG_ON(x) assert(!(x))
+# define dprintk(args...) /* printf(args) */
+# define kmalloc(x, f) malloc(x)
+# define kfree(x) free(x)
+#endif
+
+#include <linux/crush/crush.h>
+#include <linux/crush/hash.h>
+
+/*
+ * Implement the core CRUSH mapping algorithm.
+ */
+
+/**
+ * crush_find_rule - find a crush_rule id for a given ruleset, type, and size.
+ * @map: the crush_map
+ * @ruleset: the storage ruleset id (user defined)
+ * @type: storage ruleset type (user defined)
+ * @size: output set size
+ *
+ * Returns the index of the first rule whose mask matches @ruleset and
+ * @type and whose [min_size, max_size] range contains @size, or -1 if
+ * no rule matches.
+ */
+int crush_find_rule(struct crush_map *map, int ruleset, int type, int size)
+{
+       int i;
+
+       for (i = 0; i < map->max_rules; i++) {
+               struct crush_rule *rule = map->rules[i];
+
+               /* unused slot */
+               if (!rule)
+                       continue;
+               if (rule->mask.ruleset != ruleset || rule->mask.type != type)
+                       continue;
+               if (size < rule->mask.min_size || size > rule->mask.max_size)
+                       continue;
+               return i;
+       }
+       return -1;
+}
+
+
+/*
+ * bucket choose methods
+ *
+ * For each bucket algorithm, we have a "choose" method that, given a
+ * crush input @x and replica position (usually, position in output set) @r,
+ * will produce an item in the bucket.
+ */
+
+/*
+ * Choose based on a random permutation of the bucket.
+ *
+ * We used to use some prime number arithmetic to do this, but it
+ * wasn't very random, and had some other bad behaviors.  Instead, we
+ * calculate an actual random permutation of the bucket members.
+ * Since this is expensive, we optimize for the r=0 case, which
+ * captures the vast majority of calls.
+ *
+ * The permutation is cached in the bucket itself: perm_x records the
+ * input it was built for, perm[] holds the permuted indexes and
+ * perm_n says how many leading entries of perm[] are final.  Returns
+ * the item stored at the chosen position of bucket->items.
+ */
+static int bucket_perm_choose(struct crush_bucket *bucket,
+                             int x, int r)
+{
+       /*
+        * position within the permutation; bucket->size must be non-zero
+        * (crush_choose() rejects empty buckets before choosing)
+        */
+       unsigned pr = r % bucket->size;
+       unsigned i, s;
+
+       /* start a new permutation if @x has changed */
+       if (bucket->perm_x != x || bucket->perm_n == 0) {
+               dprintk("bucket %d new x=%d\n", bucket->id, x);
+               bucket->perm_x = x;
+
+               /* optimize common r=0 case */
+               if (pr == 0) {
+                       s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
+                               bucket->size;
+                       bucket->perm[0] = s;
+                       bucket->perm_n = 0xffff;   /* magic value, see below */
+                       goto out;
+               }
+
+               /* begin from the identity permutation, nothing finalized yet */
+               for (i = 0; i < bucket->size; i++)
+                       bucket->perm[i] = i;
+               bucket->perm_n = 0;
+       } else if (bucket->perm_n == 0xffff) {
+               /* clean up after the r=0 case above */
+               for (i = 1; i < bucket->size; i++)
+                       bucket->perm[i] = i;
+               /*
+                * perm[0] already holds the r=0 pick; store 0 in the slot
+                * that pick came from so perm[] is a permutation again
+                */
+               bucket->perm[bucket->perm[0]] = 0;
+               bucket->perm_n = 1;
+       }
+
+       /* calculate permutation up to pr */
+       for (i = 0; i < bucket->perm_n; i++)
+               dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);
+       while (bucket->perm_n <= pr) {
+               unsigned p = bucket->perm_n;
+               /* no point in swapping the final entry */
+               if (p < bucket->size - 1) {
+                       /* pick one of the not-yet-finalized slots p..size-1 */
+                       i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
+                               (bucket->size - p);
+                       if (i) {
+                               unsigned t = bucket->perm[p + i];
+                               bucket->perm[p + i] = bucket->perm[p];
+                               bucket->perm[p] = t;
+                       }
+                       dprintk(" perm_choose swap %d with %d\n", p, p+i);
+               }
+               bucket->perm_n++;
+       }
+       for (i = 0; i < bucket->size; i++)
+               dprintk(" perm_choose  %d: %d\n", i, bucket->perm[i]);
+
+       s = bucket->perm[pr];
+out:
+       dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
+               bucket->size, x, r, pr, s);
+       return bucket->items[s];
+}
+
+/*
+ * uniform
+ *
+ * A uniform bucket has no selection logic of its own: the choice is
+ * delegated to the permutation chooser on the embedded bucket header.
+ */
+static int bucket_uniform_choose(struct crush_bucket_uniform *bucket,
+                                int x, int r)
+{
+       return bucket_perm_choose(&bucket->h, x, r);
+}
+
+/*
+ * list
+ *
+ * Walk the items from the last one down to the first.  For each item a
+ * 16-bit hash value is scaled by sum_weights[i] (multiply, then shift
+ * right by 16) and the item is returned if the scaled value is below
+ * item_weights[i].  Falling out of the loop without a pick is treated
+ * as a bug.
+ */
+static int bucket_list_choose(struct crush_bucket_list *bucket,
+                             int x, int r)
+{
+       int i;
+
+       for (i = bucket->h.size-1; i >= 0; i--) {
+               __u64 w = crush_hash32_4(bucket->h.hash,x, bucket->h.items[i],
+                                        r, bucket->h.id);
+               /* keep only the low 16 bits of the hash */
+               w &= 0xffff;
+               dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
+                       "sw %x rand %llx",
+                       i, x, r, bucket->h.items[i], bucket->item_weights[i],
+                       bucket->sum_weights[i], w);
+               /* scale into [0, sum_weights[i]) */
+               w *= bucket->sum_weights[i];
+               w = w >> 16;
+               /*dprintk(" scaled %llx\n", w);*/
+               if (w < bucket->item_weights[i])
+                       return bucket->h.items[i];
+       }
+
+       /* no item was selected */
+       BUG_ON(1);
+       return 0;
+}
+
+
+/* (binary) tree */
+
+/*
+ * Number of trailing zero bits in @n.  @n must be non-zero, otherwise
+ * the loop never reaches a set bit.
+ */
+static int height(int n)
+{
+       int h;
+
+       for (h = 0; (n & 1) == 0; h++)
+               n >>= 1;
+       return h;
+}
+
+/* node index one level below @x on the left: x - 2^(height(x) - 1) */
+static int left(int x)
+{
+       return x - (1 << (height(x) - 1));
+}
+
+/* node index one level below @x on the right: x + 2^(height(x) - 1) */
+static int right(int x)
+{
+       return x + (1 << (height(x) - 1));
+}
+
+/* non-zero for odd node indexes, where bucket_tree_choose() stops descending */
+static int terminal(int x)
+{
+       return (x & 1) != 0;
+}
+
+/*
+ * Descend the weighted binary tree from the root until an odd
+ * (terminal) node is reached, then return the item for that node.
+ * At each inner node a hash-derived point in [0, node weight) decides
+ * the direction: left if it falls below the left child's weight,
+ * right otherwise.
+ */
+static int bucket_tree_choose(struct crush_bucket_tree *bucket,
+                             int x, int r)
+{
+       /* start at root */
+       int node = bucket->num_nodes >> 1;
+
+       while (!terminal(node)) {
+               __u32 weight = bucket->node_weights[node];
+               int lchild = left(node);
+               __u64 point;
+
+               /* pick point in [0, weight) */
+               point = (__u64)crush_hash32_4(bucket->h.hash, x, node, r,
+                                             bucket->h.id) * (__u64)weight;
+               point >>= 32;
+
+               /* descend to the left or right? */
+               node = (point < bucket->node_weights[lchild]) ?
+                       lchild : right(node);
+       }
+
+       return bucket->h.items[node >> 1];
+}
+
+
+/* straw */
+
+/*
+ * Every item draws a straw: a 16-bit hash value multiplied by the
+ * item's straw length.  The item with the longest draw wins; on a tie
+ * the earlier item is kept.
+ */
+static int bucket_straw_choose(struct crush_bucket_straw *bucket,
+                              int x, int r)
+{
+       int best = 0;
+       __u64 best_draw = 0;
+       int i;
+
+       for (i = 0; i < bucket->h.size; i++) {
+               __u64 draw = crush_hash32_3(bucket->h.hash, x,
+                                           bucket->h.items[i], r) & 0xffff;
+
+               draw *= bucket->straws[i];
+               if (i != 0 && draw <= best_draw)
+                       continue;
+               best = i;
+               best_draw = draw;
+       }
+       return bucket->h.items[best];
+}
+
+/*
+ * Pick an item from bucket @in for input @x and replica rank @r using
+ * the chooser that matches the bucket's algorithm.  An unknown
+ * algorithm trips BUG_ON() and falls back to the first item.
+ */
+static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
+{
+       int item;
+
+       dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
+
+       switch (in->alg) {
+       case CRUSH_BUCKET_UNIFORM:
+               item = bucket_uniform_choose((struct crush_bucket_uniform *)in,
+                                            x, r);
+               break;
+       case CRUSH_BUCKET_LIST:
+               item = bucket_list_choose((struct crush_bucket_list *)in,
+                                         x, r);
+               break;
+       case CRUSH_BUCKET_TREE:
+               item = bucket_tree_choose((struct crush_bucket_tree *)in,
+                                         x, r);
+               break;
+       case CRUSH_BUCKET_STRAW:
+               item = bucket_straw_choose((struct crush_bucket_straw *)in,
+                                          x, r);
+               break;
+       default:
+               BUG_ON(1);
+               item = in->items[0];
+               break;
+       }
+       return item;
+}
+
+/*
+ * true if device is marked "out" (failed, fully offloaded)
+ * of the cluster
+ *
+ * A weight of 0x10000 or more is always in, a weight of 0 is always
+ * out.  Anything in between is decided by comparing a 16-bit hash of
+ * (@x, @item) with the weight: the device is in when the hash is
+ * below the weight.
+ */
+static int is_out(struct crush_map *map, __u32 *weight, int item, int x)
+{
+       __u32 w = weight[item];
+
+       if (w >= 0x10000)
+               return 0;
+       if (w == 0)
+               return 1;
+
+       return (crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff) >= w;
+}
+
+/**
+ * crush_choose - choose numrep distinct items of given type
+ * @map: the crush_map
+ * @bucket: the bucket we are choosing an item from
+ * @weight: per-device weight vector, consulted through is_out()
+ * @x: crush input value
+ * @numrep: the number of items to choose
+ * @type: the type of item to choose
+ * @out: pointer to output vector
+ * @outpos: our position in that vector
+ * @firstn: true if choosing "first n" items, false if choosing "indep"
+ * @recurse_to_leaf: true if we want one device under each item of given type
+ * @out2: second output vector for leaf items (if @recurse_to_leaf)
+ *
+ * Returns the output position after the last item stored in @out.
+ * Replicas for which no acceptable item could be found are skipped, so
+ * the return value may be smaller than @numrep.
+ */
+static int crush_choose(struct crush_map *map,
+                       struct crush_bucket *bucket,
+                       __u32 *weight,
+                       int x, int numrep, int type,
+                       int *out, int outpos,
+                       int firstn, int recurse_to_leaf,
+                       int *out2)
+{
+       int rep;
+       int ftotal, flocal;     /* failures for this rep / for this descent */
+       int retry_descent, retry_bucket, skip_rep;
+       struct crush_bucket *in = bucket;
+       int r;
+       int i;
+       int item = 0;
+       int itemtype;
+       int collide, reject;
+       const int orig_tries = 5; /* attempts before we fall back to search */
+
+       dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
+               bucket->id, x, outpos, numrep);
+
+       for (rep = outpos; rep < numrep; rep++) {
+               /* keep trying until we get a non-out, non-colliding item */
+               ftotal = 0;
+               skip_rep = 0;
+               do {
+                       retry_descent = 0;
+                       in = bucket;               /* initial bucket */
+
+                       /* choose through intervening buckets */
+                       flocal = 0;
+                       do {
+                               collide = 0;
+                               retry_bucket = 0;
+                               /* derive the rank r' from rep and the failure counts */
+                               r = rep;
+                               if (in->alg == CRUSH_BUCKET_UNIFORM) {
+                                       /* be careful */
+                                       if (firstn || numrep >= in->size)
+                                               /* r' = r + f_total */
+                                               r += ftotal;
+                                       else if (in->size % numrep == 0)
+                                               /* r'=r+(n+1)*f_local */
+                                               r += (numrep+1) *
+                                                       (flocal+ftotal);
+                                       else
+                                               /* r' = r + n*f_local */
+                                               r += numrep * (flocal+ftotal);
+                               } else {
+                                       if (firstn)
+                                               /* r' = r + f_total */
+                                               r += ftotal;
+                                       else
+                                               /* r' = r + n*f_local */
+                                               r += numrep * (flocal+ftotal);
+                               }
+
+                               /* bucket choose */
+                               if (in->size == 0) {
+                                       /* nothing to pick from; count it as a failure */
+                                       reject = 1;
+                                       goto reject;
+                               }
+                               /* after enough local failures use the permutation chooser */
+                               if (flocal >= (in->size>>1) &&
+                                   flocal > orig_tries)
+                                       item = bucket_perm_choose(in, x, r);
+                               else
+                                       item = crush_bucket_choose(in, x, r);
+                               BUG_ON(item >= map->max_devices);
+
+                               /* desired type? */
+                               /* negative ids are buckets (slot -1-id); others get type 0 */
+                               if (item < 0)
+                                       itemtype = map->buckets[-1-item]->type;
+                               else
+                                       itemtype = 0;
+                               dprintk("  item %d type %d\n", item, itemtype);
+
+                               /* keep going? */
+                               if (itemtype != type) {
+                                       BUG_ON(item >= 0 ||
+                                              (-1-item) >= map->max_buckets);
+                                       /* wrong type: descend into the chosen bucket */
+                                       in = map->buckets[-1-item];
+                                       retry_bucket = 1;
+                                       continue;
+                               }
+
+                               /* collision? */
+                               for (i = 0; i < outpos; i++) {
+                                       if (out[i] == item) {
+                                               collide = 1;
+                                               break;
+                                       }
+                               }
+
+                               reject = 0;
+                               if (recurse_to_leaf) {
+                                       if (item < 0) {
+                                               /*
+                                                * ask for exactly one more item of
+                                                * type 0 under this bucket, written
+                                                * to out2[outpos]
+                                                */
+                                               if (crush_choose(map,
+                                                        map->buckets[-1-item],
+                                                        weight,
+                                                        x, outpos+1, 0,
+                                                        out2, outpos,
+                                                        firstn, 0,
+                                                        NULL) <= outpos)
+                                                       /* didn't get leaf */
+                                                       reject = 1;
+                                       } else {
+                                               /* we already have a leaf! */
+                                               out2[outpos] = item;
+                                       }
+                               }
+
+                               if (!reject) {
+                                       /* out? */
+                                       if (itemtype == 0)
+                                               reject = is_out(map, weight,
+                                                               item, x);
+                                       else
+                                               reject = 0;
+                               }
+
+reject:
+                               if (reject || collide) {
+                                       ftotal++;
+                                       flocal++;
+
+                                       if (collide && flocal < 3)
+                                               /* retry locally a few times */
+                                               retry_bucket = 1;
+                                       else if (flocal < in->size + orig_tries)
+                                               /* exhaustive bucket search */
+                                               retry_bucket = 1;
+                                       else if (ftotal < 20)
+                                               /* then retry descent */
+                                               retry_descent = 1;
+                                       else
+                                               /* else give up */
+                                               skip_rep = 1;
+                                       dprintk("  reject %d  collide %d  "
+                                               "ftotal %d  flocal %d\n",
+                                               reject, collide, ftotal,
+                                               flocal);
+                               }
+                       } while (retry_bucket);
+               } while (retry_descent);
+
+               if (skip_rep) {
+                       dprintk("skip rep\n");
+                       continue;
+               }
+
+               dprintk("CHOOSE got %d\n", item);
+               out[outpos] = item;
+               outpos++;
+       }
+
+       dprintk("CHOOSE returns %d\n", outpos);
+       return outpos;
+}
+
+
+/**
+ * crush_do_rule - calculate a mapping with the given input and rule
+ * @map: the crush_map
+ * @ruleno: the rule id
+ * @x: hash input
+ * @result: pointer to result vector
+ * @result_max: maximum result size
+ * @force: force initial replica choice; -1 for none
+ * @weight: per-device weight vector, passed on to is_out()/crush_choose()
+ *
+ * Returns the number of entries written to @result, or -1 if @force
+ * names a device that does not exist in the map.
+ */
+int crush_do_rule(struct crush_map *map,
+                 int ruleno, int x, int *result, int result_max,
+                 int force, __u32 *weight)
+{
+       int result_len;
+       int force_context[CRUSH_MAX_DEPTH];
+       int force_pos = -1;
+       /* a and b alternate as working set and output set; c holds leaves */
+       int a[CRUSH_MAX_SET];
+       int b[CRUSH_MAX_SET];
+       int c[CRUSH_MAX_SET];
+       int recurse_to_leaf;
+       int *w;
+       int wsize = 0;
+       int *o;
+       int osize;
+       int *tmp;
+       struct crush_rule *rule;
+       int step;
+       int i, j;
+       int numrep;
+       int firstn;
+       int rc = -1;
+
+       BUG_ON(ruleno >= map->max_rules);
+
+       rule = map->rules[ruleno];
+       result_len = 0;
+       w = a;
+       o = b;
+
+       /*
+        * determine hierarchical context of force, if any.  note
+        * that this may or may not correspond to the specific types
+        * referenced by the crush rule.
+        */
+       if (force >= 0) {
+               if (force >= map->max_devices ||
+                   map->device_parents[force] == 0) {
+                       /*dprintk("CRUSH: forcefed device dne\n");*/
+                       rc = -1;  /* force fed device dne */
+                       goto out;
+               }
+               if (!is_out(map, weight, force, x)) {
+                       /*
+                        * record the device and each ancestor in turn; a
+                        * parent value of 0 ends the walk
+                        */
+                       while (1) {
+                               force_context[++force_pos] = force;
+                               if (force >= 0)
+                                       force = map->device_parents[force];
+                               else
+                                       force = map->bucket_parents[-1-force];
+                               if (force == 0)
+                                       break;
+                       }
+               }
+       }
+
+       for (step = 0; step < rule->len; step++) {
+               firstn = 0;
+               switch (rule->steps[step].op) {
+               case CRUSH_RULE_TAKE:
+                       /* the working set becomes the single item named by the step */
+                       w[0] = rule->steps[step].arg1;
+                       if (force_pos >= 0) {
+                               BUG_ON(force_context[force_pos] != w[0]);
+                               force_pos--;
+                       }
+                       wsize = 1;
+                       break;
+
+               case CRUSH_RULE_CHOOSE_LEAF_FIRSTN:
+               case CRUSH_RULE_CHOOSE_FIRSTN:
+                       firstn = 1;
+                       /* fall through */
+               case CRUSH_RULE_CHOOSE_LEAF_INDEP:
+               case CRUSH_RULE_CHOOSE_INDEP:
+                       BUG_ON(wsize == 0);
+
+                       recurse_to_leaf =
+                               rule->steps[step].op ==
+                                CRUSH_RULE_CHOOSE_LEAF_FIRSTN ||
+                               rule->steps[step].op ==
+                               CRUSH_RULE_CHOOSE_LEAF_INDEP;
+
+                       /* reset output */
+                       osize = 0;
+
+                       for (i = 0; i < wsize; i++) {
+                               /*
+                                * see CRUSH_N, CRUSH_N_MINUS macros.
+                                * basically, numrep <= 0 means relative to
+                                * the provided result_max
+                                */
+                               numrep = rule->steps[step].arg1;
+                               if (numrep <= 0) {
+                                       numrep += result_max;
+                                       if (numrep <= 0)
+                                               continue;
+                               }
+                               /* j: entries pre-filled from the forced context */
+                               j = 0;
+                               if (osize == 0 && force_pos >= 0) {
+                                       /* skip any intermediate types */
+                                       while (force_pos &&
+                                              force_context[force_pos] < 0 &&
+                                              rule->steps[step].arg2 !=
+                                              map->buckets[-1 -
+                                              force_context[force_pos]]->type)
+                                               force_pos--;
+                                       o[osize] = force_context[force_pos];
+                                       if (recurse_to_leaf)
+                                               c[osize] = force_context[0];
+                                       j++;
+                                       force_pos--;
+                               }
+                               osize += crush_choose(map,
+                                                     map->buckets[-1-w[i]],
+                                                     weight,
+                                                     x, numrep,
+                                                     rule->steps[step].arg2,
+                                                     o+osize, j,
+                                                     firstn,
+                                                     recurse_to_leaf, c+osize);
+                       }
+
+                       if (recurse_to_leaf)
+                               /* copy final _leaf_ values to output set */
+                               memcpy(o, c, osize*sizeof(*o));
+
+                       /* swap o and w arrays */
+                       tmp = o;
+                       o = w;
+                       w = tmp;
+                       wsize = osize;
+                       break;
+
+
+               case CRUSH_RULE_EMIT:
+                       /* append the working set to the result, up to result_max */
+                       for (i = 0; i < wsize && result_len < result_max; i++) {
+                               result[result_len] = w[i];
+                               result_len++;
+                       }
+                       wsize = 0;
+                       break;
+
+               default:
+                       BUG_ON(1);
+               }
+       }
+       rc = result_len;
+
+out:
+       return rc;
+}
+
+
diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c

new file mode 100644 (file)

index 0000000..7b505b0
--- /dev/null
+++ b/net/ceph/crypto.c
@@ -0,0 +1,412 @@
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/err.h>
+#include <linux/scatterlist.h>
+#include <linux/slab.h>
+#include <crypto/hash.h>
+
+#include <linux/ceph/decode.h>
+#include "crypto.h"
+
+int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end)
+{
+       if (*p + sizeof(u16) + sizeof(key->created) +
+           sizeof(u16) + key->len > end)
+               return -ERANGE;
+       ceph_encode_16(p, key->type);
+       ceph_encode_copy(p, &key->created, sizeof(key->created));
+       ceph_encode_16(p, key->len);
+       ceph_encode_copy(p, key->key, key->len);
+       return 0;
+}
+
+int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end)
+{
+       ceph_decode_need(p, end, 2*sizeof(u16) + sizeof(key->created), bad);
+       key->type = ceph_decode_16(p);
+       ceph_decode_copy(p, &key->created, sizeof(key->created));
+       key->len = ceph_decode_16(p);
+       ceph_decode_need(p, end, key->len, bad);
+       key->key = kmalloc(key->len, GFP_NOFS);
+       if (!key->key)
+               return -ENOMEM;
+       ceph_decode_copy(p, key->key, key->len);
+       return 0;
+
+bad:
+       dout("failed to decode crypto key\n");
+       return -EINVAL;
+}
+
+int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey)
+{
+       int inlen = strlen(inkey);
+       int blen = inlen * 3 / 4;
+       void *buf, *p;
+       int ret;
+
+       dout("crypto_key_unarmor %s\n", inkey);
+       buf = kmalloc(blen, GFP_NOFS);
+       if (!buf)
+               return -ENOMEM;
+       blen = ceph_unarmor(buf, inkey, inkey+inlen);
+       if (blen < 0) {
+               kfree(buf);
+               return blen;
+       }
+
+       p = buf;
+       ret = ceph_crypto_key_decode(key, &p, p + blen);
+       kfree(buf);
+       if (ret)
+               return ret;
+       dout("crypto_key_unarmor key %p type %d len %d\n", key,
+            key->type, key->len);
+       return 0;
+}
+
+
+
+#define AES_KEY_SIZE 16
+
+static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void)
+{
+       return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
+}
+
+static const u8 *aes_iv = (u8 *)CEPH_AES_IV;
+
+/*
+ * Encrypt @src_len bytes at @src into @dst with the "cbc(aes)" cipher.
+ *
+ * 1..16 pad bytes, each holding the pad length, are appended to the
+ * plaintext, so input that is already a multiple of 16 bytes still gains
+ * a full pad block.  *@dst_len is set to the padded length.
+ *
+ * Returns 0 on success, or a negative errno if the cipher cannot be
+ * allocated or the encryption itself fails.
+ */
+static int ceph_aes_encrypt(const void *key, int key_len,
+                           void *dst, size_t *dst_len,
+                           const void *src, size_t src_len)
+{
+       struct scatterlist sg_in[2], sg_out[1];
+       struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
+       struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
+       int ret;
+       void *iv;
+       int ivsize;
+       size_t zero_padding = (0x10 - (src_len & 0x0f));
+       char pad[16];
+
+       if (IS_ERR(tfm))
+               return PTR_ERR(tfm);
+
+       /* every pad byte carries the pad length */
+       memset(pad, zero_padding, zero_padding);
+
+       *dst_len = src_len + zero_padding;
+
+       crypto_blkcipher_setkey((void *)tfm, key, key_len);
+       sg_init_table(sg_in, 2);
+       sg_set_buf(&sg_in[0], src, src_len);
+       sg_set_buf(&sg_in[1], pad, zero_padding);
+       sg_init_table(sg_out, 1);
+       sg_set_buf(sg_out, dst, *dst_len);
+       iv = crypto_blkcipher_crt(tfm)->iv;
+       ivsize = crypto_blkcipher_ivsize(tfm);
+
+       memcpy(iv, aes_iv, ivsize);
+       /*
+       print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
+                      key, key_len, 1);
+       print_hex_dump(KERN_ERR, "enc src: ", DUMP_PREFIX_NONE, 16, 1,
+                       src, src_len, 1);
+       print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
+                       pad, zero_padding, 1);
+       */
+       ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
+                                    src_len + zero_padding);
+       crypto_free_blkcipher(tfm);
+       if (ret < 0) {
+               pr_err("ceph_aes_crypt failed %d\n", ret);
+               /* propagate: dst does not hold valid ciphertext */
+               return ret;
+       }
+       /*
+       print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
+                      dst, *dst_len, 1);
+       */
+       return 0;
+}
+
+/*
+ * Encrypt the concatenation of @src1 and @src2 into @dst with the
+ * "cbc(aes)" cipher.
+ *
+ * 1..16 pad bytes, each holding the pad length, are appended after
+ * @src2.  *@dst_len is set to src1_len + src2_len + pad length.
+ *
+ * Returns 0 on success, or a negative errno if the cipher cannot be
+ * allocated or the encryption itself fails.
+ */
+static int ceph_aes_encrypt2(const void *key, int key_len, void *dst,
+                            size_t *dst_len,
+                            const void *src1, size_t src1_len,
+                            const void *src2, size_t src2_len)
+{
+       struct scatterlist sg_in[3], sg_out[1];
+       struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
+       struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
+       int ret;
+       void *iv;
+       int ivsize;
+       size_t zero_padding = (0x10 - ((src1_len + src2_len) & 0x0f));
+       char pad[16];
+
+       if (IS_ERR(tfm))
+               return PTR_ERR(tfm);
+
+       /* every pad byte carries the pad length */
+       memset(pad, zero_padding, zero_padding);
+
+       *dst_len = src1_len + src2_len + zero_padding;
+
+       crypto_blkcipher_setkey((void *)tfm, key, key_len);
+       sg_init_table(sg_in, 3);
+       sg_set_buf(&sg_in[0], src1, src1_len);
+       sg_set_buf(&sg_in[1], src2, src2_len);
+       sg_set_buf(&sg_in[2], pad, zero_padding);
+       sg_init_table(sg_out, 1);
+       sg_set_buf(sg_out, dst, *dst_len);
+       iv = crypto_blkcipher_crt(tfm)->iv;
+       ivsize = crypto_blkcipher_ivsize(tfm);
+
+       memcpy(iv, aes_iv, ivsize);
+       /*
+       print_hex_dump(KERN_ERR, "enc  key: ", DUMP_PREFIX_NONE, 16, 1,
+                      key, key_len, 1);
+       print_hex_dump(KERN_ERR, "enc src1: ", DUMP_PREFIX_NONE, 16, 1,
+                       src1, src1_len, 1);
+       print_hex_dump(KERN_ERR, "enc src2: ", DUMP_PREFIX_NONE, 16, 1,
+                       src2, src2_len, 1);
+       print_hex_dump(KERN_ERR, "enc  pad: ", DUMP_PREFIX_NONE, 16, 1,
+                       pad, zero_padding, 1);
+       */
+       ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
+                                    src1_len + src2_len + zero_padding);
+       crypto_free_blkcipher(tfm);
+       if (ret < 0) {
+               pr_err("ceph_aes_crypt2 failed %d\n", ret);
+               /* propagate: dst does not hold valid ciphertext */
+               return ret;
+       }
+       /*
+       print_hex_dump(KERN_ERR, "enc  out: ", DUMP_PREFIX_NONE, 16, 1,
+                      dst, *dst_len, 1);
+       */
+       return 0;
+}
+
+static int ceph_aes_decrypt(const void *key, int key_len,
+                           void *dst, size_t *dst_len,
+                           const void *src, size_t src_len)
+{
+       struct scatterlist sg_in[1], sg_out[2];
+       struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
+       struct blkcipher_desc desc = { .tfm = tfm };
+       char pad[16];
+       void *iv;
+       int ivsize;
+       int ret;
+       int last_byte;
+
+       if (IS_ERR(tfm))
+               return PTR_ERR(tfm);
+
+       crypto_blkcipher_setkey((void *)tfm, key, key_len);
+       sg_init_table(sg_in, 1);
+       sg_init_table(sg_out, 2);
+       sg_set_buf(sg_in, src, src_len);
+       sg_set_buf(&sg_out[0], dst, *dst_len);
+       sg_set_buf(&sg_out[1], pad, sizeof(pad));
+
+       iv = crypto_blkcipher_crt(tfm)->iv;
+       ivsize = crypto_blkcipher_ivsize(tfm);
+
+       memcpy(iv, aes_iv, ivsize);
+
+       /*
+       print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
+                      key, key_len, 1);
+       print_hex_dump(KERN_ERR, "dec  in: ", DUMP_PREFIX_NONE, 16, 1,
+                      src, src_len, 1);
+       */
+
+       ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
+       crypto_free_blkcipher(tfm);
+       if (ret < 0) {
+               pr_err("ceph_aes_decrypt failed %d\n", ret);
+               return ret;
+       }
+
+       if (src_len <= *dst_len)
+               last_byte = ((char *)dst)[src_len - 1];
+       else
+               last_byte = pad[src_len - *dst_len - 1];
+       if (last_byte <= 16 && src_len >= last_byte) {
+               *dst_len = src_len - last_byte;
+       } else {
+               pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
+                      last_byte, (int)src_len);
+               return -EPERM;  /* bad padding */
+       }
+       /*
+       print_hex_dump(KERN_ERR, "dec out: ", DUMP_PREFIX_NONE, 16, 1,
+                      dst, *dst_len, 1);
+       */
+       return 0;
+}
+
+static int ceph_aes_decrypt2(const void *key, int key_len,
+                            void *dst1, size_t *dst1_len,
+                            void *dst2, size_t *dst2_len,
+                            const void *src, size_t src_len)
+{
+       struct scatterlist sg_in[1], sg_out[3];
+       struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
+       struct blkcipher_desc desc = { .tfm = tfm };
+       char pad[16];
+       void *iv;
+       int ivsize;
+       int ret;
+       int last_byte;
+
+       if (IS_ERR(tfm))
+               return PTR_ERR(tfm);
+
+       sg_init_table(sg_in, 1);
+       sg_set_buf(sg_in, src, src_len);
+       sg_init_table(sg_out, 3);
+       sg_set_buf(&sg_out[0], dst1, *dst1_len);
+       sg_set_buf(&sg_out[1], dst2, *dst2_len);
+       sg_set_buf(&sg_out[2], pad, sizeof(pad));
+
+       crypto_blkcipher_setkey((void *)tfm, key, key_len);
+       iv = crypto_blkcipher_crt(tfm)->iv;
+       ivsize = crypto_blkcipher_ivsize(tfm);
+
+       memcpy(iv, aes_iv, ivsize);
+
+       /*
+       print_hex_dump(KERN_ERR, "dec  key: ", DUMP_PREFIX_NONE, 16, 1,
+                      key, key_len, 1);
+       print_hex_dump(KERN_ERR, "dec   in: ", DUMP_PREFIX_NONE, 16, 1,
+                      src, src_len, 1);
+       */
+
+       ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
+       crypto_free_blkcipher(tfm);
+       if (ret < 0) {
+               pr_err("ceph_aes_decrypt failed %d\n", ret);
+               return ret;
+       }
+
+       if (src_len <= *dst1_len)
+               last_byte = ((char *)dst1)[src_len - 1];
+       else if (src_len <= *dst1_len + *dst2_len)
+               last_byte = ((char *)dst2)[src_len - *dst1_len - 1];
+       else
+               last_byte = pad[src_len - *dst1_len - *dst2_len - 1];
+       if (last_byte <= 16 && src_len >= last_byte) {
+               src_len -= last_byte;
+       } else {
+               pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
+                      last_byte, (int)src_len);
+               return -EPERM;  /* bad padding */
+       }
+
+       if (src_len < *dst1_len) {
+               *dst1_len = src_len;
+               *dst2_len = 0;
+       } else {
+               *dst2_len = src_len - *dst1_len;
+       }
+       /*
+       print_hex_dump(KERN_ERR, "dec  out1: ", DUMP_PREFIX_NONE, 16, 1,
+                      dst1, *dst1_len, 1);
+       print_hex_dump(KERN_ERR, "dec  out2: ", DUMP_PREFIX_NONE, 16, 1,
+                      dst2, *dst2_len, 1);
+       */
+
+       return 0;
+}
+
+
+int ceph_decrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
+                const void *src, size_t src_len)
+{
+       switch (secret->type) {
+       case CEPH_CRYPTO_NONE:
+               if (*dst_len < src_len)
+                       return -ERANGE;
+               memcpy(dst, src, src_len);
+               *dst_len = src_len;
+               return 0;
+
+       case CEPH_CRYPTO_AES:
+               return ceph_aes_decrypt(secret->key, secret->len, dst,
+                                       dst_len, src, src_len);
+
+       default:
+               return -EINVAL;
+       }
+}
+
+/*
+ * Decrypt @src into two output buffers: @dst1 is filled first and any
+ * remainder goes to @dst2.
+ *
+ * On entry *@dst1_len and *@dst2_len give the capacity of each buffer;
+ * on success they are set to the number of bytes stored in each.
+ *
+ * Returns 0 on success, -ERANGE if an unencrypted payload does not fit
+ * in the two buffers, -EINVAL for an unknown secret type, or the error
+ * from the AES helper.
+ */
+int ceph_decrypt2(struct ceph_crypto_key *secret,
+                       void *dst1, size_t *dst1_len,
+                       void *dst2, size_t *dst2_len,
+                       const void *src, size_t src_len)
+{
+       size_t t;
+
+       switch (secret->type) {
+       case CEPH_CRYPTO_NONE:
+               if (*dst1_len + *dst2_len < src_len)
+                       return -ERANGE;
+               t = min(*dst1_len, src_len);
+               memcpy(dst1, src, t);
+               *dst1_len = t;
+               src += t;
+               src_len -= t;
+               /*
+                * Always report the dst2 byte count, including zero when
+                * everything fit in dst1; otherwise the caller would still
+                * see the capacity it passed in (the AES path zeroes it).
+                */
+               t = min(*dst2_len, src_len);
+               if (t)
+                       memcpy(dst2, src, t);
+               *dst2_len = t;
+               return 0;
+
+       case CEPH_CRYPTO_AES:
+               return ceph_aes_decrypt2(secret->key, secret->len,
+                                        dst1, dst1_len, dst2, dst2_len,
+                                        src, src_len);
+
+       default:
+               return -EINVAL;
+       }
+}
+
+int ceph_encrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
+                const void *src, size_t src_len)
+{
+       switch (secret->type) {
+       case CEPH_CRYPTO_NONE:
+               if (*dst_len < src_len)
+                       return -ERANGE;
+               memcpy(dst, src, src_len);
+               *dst_len = src_len;
+               return 0;
+
+       case CEPH_CRYPTO_AES:
+               return ceph_aes_encrypt(secret->key, secret->len, dst,
+                                       dst_len, src, src_len);
+
+       default:
+               return -EINVAL;
+       }
+}
+
+int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
+                 const void *src1, size_t src1_len,
+                 const void *src2, size_t src2_len)
+{
+       switch (secret->type) {
+       case CEPH_CRYPTO_NONE:
+               if (*dst_len < src1_len + src2_len)
+                       return -ERANGE;
+               memcpy(dst, src1, src1_len);
+               memcpy(dst + src1_len, src2, src2_len);
+               *dst_len = src1_len + src2_len;
+               return 0;
+
+       case CEPH_CRYPTO_AES:
+               return ceph_aes_encrypt2(secret->key, secret->len, dst, dst_len,
+                                        src1, src1_len, src2, src2_len);
+
+       default:
+               return -EINVAL;
+       }
+}
diff --git a/net/ceph/crypto.h b/net/ceph/crypto.h

new file mode 100644 (file)

index 0000000..f9eccac
--- /dev/null
+++ b/net/ceph/crypto.h
@@ -0,0 +1,48 @@
+#ifndef _FS_CEPH_CRYPTO_H
+#define _FS_CEPH_CRYPTO_H
+
+#include <linux/ceph/types.h>
+#include <linux/ceph/buffer.h>
+
+/*
+ * cryptographic secret
+ */
+struct ceph_crypto_key {
+       int type;
+       struct ceph_timespec created;
+       int len;
+       void *key;
+};
+
+static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key)
+{
+       kfree(key->key);
+}
+
+extern int ceph_crypto_key_encode(struct ceph_crypto_key *key,
+                                 void **p, void *end);
+extern int ceph_crypto_key_decode(struct ceph_crypto_key *key,
+                                 void **p, void *end);
+extern int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *in);
+
+/* crypto.c */
+extern int ceph_decrypt(struct ceph_crypto_key *secret,
+                       void *dst, size_t *dst_len,
+                       const void *src, size_t src_len);
+extern int ceph_encrypt(struct ceph_crypto_key *secret,
+                       void *dst, size_t *dst_len,
+                       const void *src, size_t src_len);
+extern int ceph_decrypt2(struct ceph_crypto_key *secret,
+                       void *dst1, size_t *dst1_len,
+                       void *dst2, size_t *dst2_len,
+                       const void *src, size_t src_len);
+extern int ceph_encrypt2(struct ceph_crypto_key *secret,
+                        void *dst, size_t *dst_len,
+                        const void *src1, size_t src1_len,
+                        const void *src2, size_t src2_len);
+
+/* armor.c */
+extern int ceph_armor(char *dst, const char *src, const char *end);
+extern int ceph_unarmor(char *dst, const char *src, const char *end);
+
+#endif
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c

new file mode 100644 (file)

index 0000000..27d4ea3
--- /dev/null
+++ b/net/ceph/debugfs.c
@@ -0,0 +1,267 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/debugfs.h>
+
+#ifdef CONFIG_DEBUG_FS
+
+/*
+ * Implement /sys/kernel/debug/ceph fun
+ *
+ * /sys/kernel/debug/ceph/client*  - an instance of the ceph client
+ *      .../osdmap      - current osdmap
+ *      .../monmap      - current monmap
+ *      .../osdc        - active osd requests
+ *      .../monc        - mon client state
+ *      .../dentry_lru  - dump contents of dentry lru
+ *      .../caps        - expose cap (reservation) stats
+ *      .../bdi         - symlink to ../../bdi/something
+ */
+
+static struct dentry *ceph_debugfs_dir;
+
+static int monmap_show(struct seq_file *s, void *p)
+{
+       int i;
+       struct ceph_client *client = s->private;
+
+       if (client->monc.monmap == NULL)
+               return 0;
+
+       seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
+       for (i = 0; i < client->monc.monmap->num_mon; i++) {
+               struct ceph_entity_inst *inst =
+                       &client->monc.monmap->mon_inst[i];
+
+               seq_printf(s, "\t%s%lld\t%s\n",
+                          ENTITY_NAME(inst->name),
+                          ceph_pr_addr(&inst->addr.in_addr));
+       }
+       return 0;
+}
+
+static int osdmap_show(struct seq_file *s, void *p)
+{
+       int i;
+       struct ceph_client *client = s->private;
+       struct rb_node *n;
+
+       if (client->osdc.osdmap == NULL)
+               return 0;
+       seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch);
+       seq_printf(s, "flags%s%s\n",
+                  (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ?
+                  " NEARFULL" : "",
+                  (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ?
+                  " FULL" : "");
+       for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
+               struct ceph_pg_pool_info *pool =
+                       rb_entry(n, struct ceph_pg_pool_info, node);
+               seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n",
+                          pool->id, pool->v.pg_num, pool->pg_num_mask,
+                          pool->v.lpg_num, pool->lpg_num_mask);
+       }
+       for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
+               struct ceph_entity_addr *addr =
+                       &client->osdc.osdmap->osd_addr[i];
+               int state = client->osdc.osdmap->osd_state[i];
+               char sb[64];
+
+               seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n",
+                          i, ceph_pr_addr(&addr->in_addr),
+                          ((client->osdc.osdmap->osd_weight[i]*100) >> 16),
+                          ceph_osdmap_state_str(sb, sizeof(sb), state));
+       }
+       return 0;
+}
+
+static int monc_show(struct seq_file *s, void *p)
+{
+       struct ceph_client *client = s->private;
+       struct ceph_mon_generic_request *req;
+       struct ceph_mon_client *monc = &client->monc;
+       struct rb_node *rp;
+
+       mutex_lock(&monc->mutex);
+
+       if (monc->have_mdsmap)
+               seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap);
+       if (monc->have_osdmap)
+               seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap);
+       if (monc->want_next_osdmap)
+               seq_printf(s, "want next osdmap\n");
+
+       for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
+               __u16 op;
+               req = rb_entry(rp, struct ceph_mon_generic_request, node);
+               op = le16_to_cpu(req->request->hdr.type);
+               if (op == CEPH_MSG_STATFS)
+                       seq_printf(s, "%lld statfs\n", req->tid);
+               else
+                       seq_printf(s, "%lld unknown\n", req->tid);
+       }
+
+       mutex_unlock(&monc->mutex);
+       return 0;
+}
+
+static int osdc_show(struct seq_file *s, void *pp)
+{
+       struct ceph_client *client = s->private;
+       struct ceph_osd_client *osdc = &client->osdc;
+       struct rb_node *p;
+
+       mutex_lock(&osdc->request_mutex);
+       for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
+               struct ceph_osd_request *req;
+               struct ceph_osd_request_head *head;
+               struct ceph_osd_op *op;
+               int num_ops;
+               int opcode, olen;
+               int i;
+
+               req = rb_entry(p, struct ceph_osd_request, r_node);
+
+               seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid,
+                          req->r_osd ? req->r_osd->o_osd : -1,
+                          le32_to_cpu(req->r_pgid.pool),
+                          le16_to_cpu(req->r_pgid.ps));
+
+               head = req->r_request->front.iov_base;
+               op = (void *)(head + 1);
+
+               num_ops = le16_to_cpu(head->num_ops);
+               olen = le32_to_cpu(head->object_len);
+               seq_printf(s, "%.*s", olen,
+                          (const char *)(head->ops + num_ops));
+
+               if (req->r_reassert_version.epoch)
+                       seq_printf(s, "\t%u'%llu",
+                          (unsigned)le32_to_cpu(req->r_reassert_version.epoch),
+                          le64_to_cpu(req->r_reassert_version.version));
+               else
+                       seq_printf(s, "\t");
+
+               for (i = 0; i < num_ops; i++) {
+                       opcode = le16_to_cpu(op->op);
+                       seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
+                       op++;
+               }
+
+               seq_printf(s, "\n");
+       }
+       mutex_unlock(&osdc->request_mutex);
+       return 0;
+}
+
+CEPH_DEFINE_SHOW_FUNC(monmap_show)
+CEPH_DEFINE_SHOW_FUNC(osdmap_show)
+CEPH_DEFINE_SHOW_FUNC(monc_show)
+CEPH_DEFINE_SHOW_FUNC(osdc_show)
+
+int ceph_debugfs_init(void)
+{
+       ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
+       if (!ceph_debugfs_dir)
+               return -ENOMEM;
+       return 0;
+}
+
+void ceph_debugfs_cleanup(void)
+{
+       debugfs_remove(ceph_debugfs_dir);
+}
+
+int ceph_debugfs_client_init(struct ceph_client *client)
+{
+       int ret = -ENOMEM;
+       char name[80];
+
+       snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid,
+                client->monc.auth->global_id);
+
+       client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
+       if (!client->debugfs_dir)
+               goto out;
+
+       client->monc.debugfs_file = debugfs_create_file("monc",
+                                                     0600,
+                                                     client->debugfs_dir,
+                                                     client,
+                                                     &monc_show_fops);
+       if (!client->monc.debugfs_file)
+               goto out;
+
+       client->osdc.debugfs_file = debugfs_create_file("osdc",
+                                                     0600,
+                                                     client->debugfs_dir,
+                                                     client,
+                                                     &osdc_show_fops);
+       if (!client->osdc.debugfs_file)
+               goto out;
+
+       client->debugfs_monmap = debugfs_create_file("monmap",
+                                       0600,
+                                       client->debugfs_dir,
+                                       client,
+                                       &monmap_show_fops);
+       if (!client->debugfs_monmap)
+               goto out;
+
+       client->debugfs_osdmap = debugfs_create_file("osdmap",
+                                       0600,
+                                       client->debugfs_dir,
+                                       client,
+                                       &osdmap_show_fops);
+       if (!client->debugfs_osdmap)
+               goto out;
+
+       return 0;
+
+out:
+       ceph_debugfs_client_cleanup(client);
+       return ret;
+}
+
+void ceph_debugfs_client_cleanup(struct ceph_client *client)
+{
+       debugfs_remove(client->debugfs_osdmap);
+       debugfs_remove(client->debugfs_monmap);
+       debugfs_remove(client->osdc.debugfs_file);
+       debugfs_remove(client->monc.debugfs_file);
+       debugfs_remove(client->debugfs_dir);
+}
+
+#else  /* CONFIG_DEBUG_FS */
+
+int ceph_debugfs_init(void)
+{
+       return 0;
+}
+
+void ceph_debugfs_cleanup(void)
+{
+}
+
+int ceph_debugfs_client_init(struct ceph_client *client)
+{
+       return 0;
+}
+
+void ceph_debugfs_client_cleanup(struct ceph_client *client)
+{
+}
+
+#endif  /* CONFIG_DEBUG_FS */
+
+EXPORT_SYMBOL(ceph_debugfs_init);
+EXPORT_SYMBOL(ceph_debugfs_cleanup);
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c

new file mode 100644 (file)

index 0000000..0e8157e
--- /dev/null
+++ b/net/ceph/messenger.c
@@ -0,0 +1,2453 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/crc32c.h>
+#include <linux/ctype.h>
+#include <linux/highmem.h>
+#include <linux/inet.h>
+#include <linux/kthread.h>
+#include <linux/net.h>
+#include <linux/slab.h>
+#include <linux/socket.h>
+#include <linux/string.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <net/tcp.h>
+
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/pagelist.h>
+
+/*
+ * Ceph uses the messenger to exchange ceph_msg messages with other
+ * hosts in the system.  The messenger provides ordered and reliable
+ * delivery.  We tolerate TCP disconnects by reconnecting (with
+ * exponential backoff) in the case of a fault (disconnection, bad
+ * crc, protocol error).  Acks allow sent messages to be discarded by
+ * the sender.
+ */
+
+/* static tag bytes (protocol control messages) */
+static char tag_msg = CEPH_MSGR_TAG_MSG;
+static char tag_ack = CEPH_MSGR_TAG_ACK;
+static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
+
+#ifdef CONFIG_LOCKDEP
+static struct lock_class_key socket_class;
+#endif
+
+
+static void queue_con(struct ceph_connection *con);
+static void con_work(struct work_struct *);
+static void ceph_fault(struct ceph_connection *con);
+
+/*
+ * nicely render a sockaddr as a string.
+ */
+#define MAX_ADDR_STR 20
+#define MAX_ADDR_STR_LEN 60
+static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN];
+static DEFINE_SPINLOCK(addr_str_lock);
+static int last_addr_str;
+
+const char *ceph_pr_addr(const struct sockaddr_storage *ss)
+{
+       int i;
+       char *s;
+       struct sockaddr_in *in4 = (void *)ss;
+       struct sockaddr_in6 *in6 = (void *)ss;
+
+       spin_lock(&addr_str_lock);
+       i = last_addr_str++;
+       if (last_addr_str == MAX_ADDR_STR)
+               last_addr_str = 0;
+       spin_unlock(&addr_str_lock);
+       s = addr_str[i];
+
+       switch (ss->ss_family) {
+       case AF_INET:
+               snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%u", &in4->sin_addr,
+                        (unsigned int)ntohs(in4->sin_port));
+               break;
+
+       case AF_INET6:
+               snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%u", &in6->sin6_addr,
+                        (unsigned int)ntohs(in6->sin6_port));
+               break;
+
+       default:
+               sprintf(s, "(unknown sockaddr family %d)", (int)ss->ss_family);
+       }
+
+       return s;
+}
+EXPORT_SYMBOL(ceph_pr_addr);
+
+static void encode_my_addr(struct ceph_messenger *msgr)
+{
+       memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr));
+       ceph_encode_addr(&msgr->my_enc_addr);
+}
+
+/*
+ * work queue for all reading and writing to/from the socket.
+ */
+struct workqueue_struct *ceph_msgr_wq;
+
+/*
+ * Create the "ceph-msgr" workqueue used for all socket reading and
+ * writing.  Returns 0 on success or -ENOMEM if it cannot be created.
+ */
+int ceph_msgr_init(void)
+{
+       ceph_msgr_wq = create_workqueue("ceph-msgr");
+       /* create_workqueue() reports failure with NULL, not an ERR_PTR */
+       if (!ceph_msgr_wq) {
+               int ret = -ENOMEM;
+               pr_err("msgr_init failed to create workqueue: %d\n", ret);
+               return ret;
+       }
+       return 0;
+}
+EXPORT_SYMBOL(ceph_msgr_init);
+
+void ceph_msgr_exit(void)
+{
+       destroy_workqueue(ceph_msgr_wq);
+}
+EXPORT_SYMBOL(ceph_msgr_exit);
+
+void ceph_msgr_flush(void)
+{
+       flush_workqueue(ceph_msgr_wq);
+}
+EXPORT_SYMBOL(ceph_msgr_flush);
+
+
+/*
+ * socket callback functions
+ */
+
+/* data available on socket, or listen socket received a connect */
+static void ceph_data_ready(struct sock *sk, int count_unused)
+{
+       struct ceph_connection *con =
+               (struct ceph_connection *)sk->sk_user_data;
+       if (sk->sk_state != TCP_CLOSE_WAIT) {
+               dout("ceph_data_ready on %p state = %lu, queueing work\n",
+                    con, con->state);
+               queue_con(con);
+       }
+}
+
+/* socket has buffer space for writing */
+static void ceph_write_space(struct sock *sk)
+{
+       struct ceph_connection *con =
+               (struct ceph_connection *)sk->sk_user_data;
+
+       /* only queue to workqueue if there is data we want to write. */
+       if (test_bit(WRITE_PENDING, &con->state)) {
+               dout("ceph_write_space %p queueing write work\n", con);
+               queue_con(con);
+       } else {
+               dout("ceph_write_space %p nothing to write\n", con);
+       }
+
+       /* since we have our own write_space, clear the SOCK_NOSPACE flag */
+       clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+}
+
+/* socket's state has changed */
+static void ceph_state_change(struct sock *sk)
+{
+       struct ceph_connection *con =
+               (struct ceph_connection *)sk->sk_user_data;
+
+       dout("ceph_state_change %p state = %lu sk_state = %u\n",
+            con, con->state, sk->sk_state);
+
+       if (test_bit(CLOSED, &con->state))
+               return;
+
+       switch (sk->sk_state) {
+       case TCP_CLOSE:
+               dout("ceph_state_change TCP_CLOSE\n");
+       case TCP_CLOSE_WAIT:
+               dout("ceph_state_change TCP_CLOSE_WAIT\n");
+               if (test_and_set_bit(SOCK_CLOSED, &con->state) == 0) {
+                       if (test_bit(CONNECTING, &con->state))
+                               con->error_msg = "connection failed";
+                       else
+                               con->error_msg = "socket closed";
+                       queue_con(con);
+               }
+               break;
+       case TCP_ESTABLISHED:
+               dout("ceph_state_change TCP_ESTABLISHED\n");
+               queue_con(con);
+               break;
+       }
+}
+
+/*
+ * set up socket callbacks
+ */
+static void set_sock_callbacks(struct socket *sock,
+                              struct ceph_connection *con)
+{
+       struct sock *sk = sock->sk;
+       sk->sk_user_data = (void *)con;
+       sk->sk_data_ready = ceph_data_ready;
+       sk->sk_write_space = ceph_write_space;
+       sk->sk_state_change = ceph_state_change;
+}
+
+
+/*
+ * socket helpers
+ */
+
+/*
+ * Create a kernel TCP socket for @con and start a non-blocking connect
+ * to con->peer_addr.  On success the socket is stored in con->sock and
+ * returned; -EINPROGRESS from the connect call counts as success.  On
+ * failure the socket is released, con->sock is cleared, con->error_msg
+ * is set and an ERR_PTR is returned.
+ */
+static struct socket *ceph_tcp_connect(struct ceph_connection *con)
+{
+       struct sockaddr_storage *peer = &con->peer_addr.in_addr;
+       struct socket *sock;
+       int err;
+
+       BUG_ON(con->sock);
+       err = sock_create_kern(peer->ss_family, SOCK_STREAM, IPPROTO_TCP,
+                              &sock);
+       if (err)
+               return ERR_PTR(err);
+       con->sock = sock;
+       sock->sk->sk_allocation = GFP_NOFS;
+
+#ifdef CONFIG_LOCKDEP
+       lockdep_set_class(&sock->sk->sk_lock, &socket_class);
+#endif
+
+       set_sock_callbacks(sock, con);
+
+       dout("connect %s\n", ceph_pr_addr(peer));
+
+       err = sock->ops->connect(sock, (struct sockaddr *)peer, sizeof(*peer),
+                                O_NONBLOCK);
+       if (err == -EINPROGRESS) {
+               /* the connect is under way; not an error for us */
+               dout("connect %s EINPROGRESS sk_state = %u\n",
+                    ceph_pr_addr(peer),
+                    sock->sk->sk_state);
+               return sock;
+       }
+       if (err >= 0)
+               return sock;
+
+       pr_err("connect %s error %d\n", ceph_pr_addr(peer), err);
+       sock_release(sock);
+       con->sock = NULL;
+       con->error_msg = "connect error";
+       return ERR_PTR(err);
+}
+
+/*
+ * Non-blocking receive of up to @len bytes from @sock into @buf.
+ * Returns whatever kernel_recvmsg() returns.
+ */
+static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
+{
+       struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
+       struct kvec iov = {
+               .iov_base = buf,
+               .iov_len = len,
+       };
+
+       return kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags);
+}
+
+/*
+ * Non-blocking send of @len bytes described by the @kvlen entries of
+ * @iov.  @more is true if the caller will be sending more data shortly,
+ * in which case MSG_MORE is passed; otherwise MSG_EOR is.
+ */
+static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
+                    size_t kvlen, size_t len, int more)
+{
+       struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
+
+       /* MSG_EOR is superfluous, but what the hell */
+       msg.msg_flags |= more ? MSG_MORE : MSG_EOR;
+
+       return kernel_sendmsg(sock, &msg, iov, kvlen, len);
+}
+
+
+/*
+ * Shut down and release the socket of the given connection, if it has
+ * one.  SOCK_CLOSED is set in con->state for the duration of the
+ * teardown and cleared again afterwards.  Returns the result of the
+ * shutdown call, or 0 when there was no socket.
+ */
+static int con_close_socket(struct ceph_connection *con)
+{
+       struct socket *sock = con->sock;
+       int ret;
+
+       dout("con_close_socket on %p sock %p\n", con, sock);
+       if (sock == NULL)
+               return 0;
+
+       set_bit(SOCK_CLOSED, &con->state);
+       ret = sock->ops->shutdown(sock, SHUT_RDWR);
+       sock_release(sock);
+       con->sock = NULL;
+       clear_bit(SOCK_CLOSED, &con->state);
+
+       return ret;
+}
+
+/*
+ * Reset a connection.  Discard all incoming and outgoing messages
+ * and clear *_seq state.  This is done by reset_connection() below,
+ * using the two message-list helpers that precede it.
+ */
+static void ceph_msg_remove(struct ceph_msg *msg)
+{
+       /* unlink @msg from whatever list it is on, then put it */
+       list_del_init(&msg->list_head);
+       ceph_msg_put(msg);
+}
+/* Empty @head by removing messages from the front until none are left. */
+static void ceph_msg_remove_list(struct list_head *head)
+{
+       struct ceph_msg *first;
+
+       while (!list_empty(head)) {
+               first = list_first_entry(head, struct ceph_msg, list_head);
+               ceph_msg_remove(first);
+       }
+}
+
+/*
+ * Return @con to a pristine messaging state: empty out_queue and
+ * out_sent, put any partially received (in_msg) or partially sent
+ * (out_msg) message, clear the pending-keepalive flag and zero
+ * connect_seq, out_seq, in_seq and in_seq_acked.
+ */
+static void reset_connection(struct ceph_connection *con)
+{
+       /* reset connection, out_queue, msg_ and connect_seq */
+       /* discard existing out_queue and msg_seq */
+       ceph_msg_remove_list(&con->out_queue);
+       ceph_msg_remove_list(&con->out_sent);
+
+       if (con->in_msg) {
+               ceph_msg_put(con->in_msg);
+               con->in_msg = NULL;
+       }
+
+       con->connect_seq = 0;
+       con->out_seq = 0;
+       if (con->out_msg) {
+               ceph_msg_put(con->out_msg);
+               con->out_msg = NULL;
+       }
+       con->out_keepalive_pending = false;
+       con->in_seq = 0;
+       con->in_seq_acked = 0;
+}
+
+/*
+ * mark a peer down.  drop any open connections.
+ *
+ * Sets CLOSED and clears the STANDBY, LOSSYTX, KEEPALIVE_PENDING and
+ * WRITE_PENDING bits in con->state, then, while holding con->mutex,
+ * resets the message state, zeroes peer_global_seq and cancels the
+ * delayed work.  Finally calls queue_con() on the connection.
+ */
+void ceph_con_close(struct ceph_connection *con)
+{
+       dout("con_close %p peer %s\n", con,
+            ceph_pr_addr(&con->peer_addr.in_addr));
+       set_bit(CLOSED, &con->state);  /* in case there's queued work */
+       clear_bit(STANDBY, &con->state);  /* avoid connect_seq bump */
+       clear_bit(LOSSYTX, &con->state);  /* so we retry next connect */
+       clear_bit(KEEPALIVE_PENDING, &con->state);
+       clear_bit(WRITE_PENDING, &con->state);
+       mutex_lock(&con->mutex);
+       reset_connection(con);
+       con->peer_global_seq = 0;
+       cancel_delayed_work(&con->work);
+       mutex_unlock(&con->mutex);
+       queue_con(con);
+}
+EXPORT_SYMBOL(ceph_con_close);
+
+/*
+ * Reopen a closed connection, with a new peer address.
+ *
+ * Sets OPENING before clearing CLOSED, copies @addr into
+ * con->peer_addr, resets the backoff delay and queues the connection.
+ */
+void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr)
+{
+       dout("con_open %p %s\n", con, ceph_pr_addr(&addr->in_addr));
+       set_bit(OPENING, &con->state);
+       clear_bit(CLOSED, &con->state);
+       /* byte-wise copy; peer_addr is later compared with memcmp() */
+       memcpy(&con->peer_addr, addr, sizeof(*addr));
+       con->delay = 0;      /* reset backoff memory */
+       queue_con(con);
+}
+EXPORT_SYMBOL(ceph_con_open);
+
+/*
+ * return true if this connection ever successfully opened
+ *
+ * This is judged purely by connect_seq being greater than zero.
+ */
+bool ceph_con_opened(struct ceph_connection *con)
+{
+       return con->connect_seq > 0;
+}
+
+/*
+ * generic get/put
+ *
+ * Take a reference on @con.  Returns @con, or NULL if the reference
+ * count had already dropped to zero.
+ */
+struct ceph_connection *ceph_con_get(struct ceph_connection *con)
+{
+       dout("con_get %p nref = %d -> %d\n", con,
+            atomic_read(&con->nref), atomic_read(&con->nref) + 1);
+
+       if (!atomic_inc_not_zero(&con->nref))
+               return NULL;
+       return con;
+}
+
+/*
+ * Drop a reference on @con and free it when the last one goes away.
+ * The connection must no longer have a socket by then.
+ */
+void ceph_con_put(struct ceph_connection *con)
+{
+       dout("con_put %p nref = %d -> %d\n", con,
+            atomic_read(&con->nref), atomic_read(&con->nref) - 1);
+       BUG_ON(atomic_read(&con->nref) == 0);
+
+       if (!atomic_dec_and_test(&con->nref))
+               return;
+
+       /* that was the last reference */
+       BUG_ON(con->sock);
+       kfree(con);
+}
+
+/*
+ * initialize a new connection: zero the whole structure, then set up
+ * the owning messenger, an initial reference count of 1, the mutex,
+ * the delayed work item and the two outgoing message lists.
+ */
+void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con)
+{
+       dout("con_init %p\n", con);
+       memset(con, 0, sizeof(*con));
+
+       con->msgr = msgr;
+       atomic_set(&con->nref, 1);
+       mutex_init(&con->mutex);
+       INIT_DELAYED_WORK(&con->work, con_work);
+       INIT_LIST_HEAD(&con->out_sent);
+       INIT_LIST_HEAD(&con->out_queue);
+}
+EXPORT_SYMBOL(ceph_con_init);
+
+
+/*
+ * We maintain a global counter to order connection attempts.  Under
+ * global_seq_lock, raise the counter to at least @gt, bump it by one
+ * and return the new value, which is therefore unique and greater
+ * than @gt.
+ */
+static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
+{
+       u32 seq;
+
+       spin_lock(&msgr->global_seq_lock);
+       if (gt > msgr->global_seq)
+               msgr->global_seq = gt;
+       msgr->global_seq++;
+       seq = msgr->global_seq;
+       spin_unlock(&msgr->global_seq_lock);
+
+       return seq;
+}
+
+
+/*
+ * Queue the footer of the currently outgoing message in out_kvec[@v]
+ * and mark the message as done.  The preceding out_kvec entries are
+ * assumed to be valid already; we only append.
+ */
+static void prepare_write_message_footer(struct ceph_connection *con, int v)
+{
+       struct ceph_msg *m = con->out_msg;
+       struct kvec *kv = &con->out_kvec[v];
+
+       dout("prepare_write_message_footer %p\n", con);
+       con->out_kvec_is_msg = true;
+
+       kv->iov_base = &m->footer;
+       kv->iov_len = sizeof(m->footer);
+       con->out_kvec_left++;
+       con->out_kvec_bytes += sizeof(m->footer);
+
+       con->out_more = m->more_to_follow;
+       con->out_msg_done = true;
+}
+
+/*
+ * Prepare headers for the next outgoing message.
+ *
+ * Takes the first message on out_queue, makes it con->out_msg and fills
+ * out_kvec[] with (optionally) an ack, then tag + header + front
+ * (+ middle).  Header, front and middle CRCs are computed here; the
+ * data CRC starts at 0.  If the message carries no data payload the
+ * footer is queued as well, otherwise the page iterator is initialized
+ * so the data can be written next.  Sets WRITE_PENDING.
+ */
+static void prepare_write_message(struct ceph_connection *con)
+{
+       struct ceph_msg *m;
+       int v = 0;
+
+       con->out_kvec_bytes = 0;
+       con->out_kvec_is_msg = true;
+       con->out_msg_done = false;
+
+       /* Sneak an ack in there first?  If we can get it into the same
+        * TCP packet that's a good thing. */
+       if (con->in_seq > con->in_seq_acked) {
+               con->in_seq_acked = con->in_seq;
+               con->out_kvec[v].iov_base = &tag_ack;
+               con->out_kvec[v++].iov_len = 1;
+               con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
+               con->out_kvec[v].iov_base = &con->out_temp_ack;
+               con->out_kvec[v++].iov_len = sizeof(con->out_temp_ack);
+               con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
+       }
+
+       m = list_first_entry(&con->out_queue,
+                      struct ceph_msg, list_head);
+       con->out_msg = m;
+       if (test_bit(LOSSYTX, &con->state)) {
+               /* lossy: the message is not kept on out_sent */
+               list_del_init(&m->list_head);
+       } else {
+               /* put message on sent list */
+               ceph_msg_get(m);
+               list_move_tail(&m->list_head, &con->out_sent);
+       }
+
+       /*
+        * only assign outgoing seq # if we haven't sent this message
+        * yet.  if it is requeued, resend with its original seq.
+        */
+       if (m->needs_out_seq) {
+               m->hdr.seq = cpu_to_le64(++con->out_seq);
+               m->needs_out_seq = false;
+       }
+
+       dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n",
+            m, con->out_seq, le16_to_cpu(m->hdr.type),
+            le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
+            le32_to_cpu(m->hdr.data_len),
+            m->nr_pages);
+       BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len);
+
+       /* tag + hdr + front + middle */
+       con->out_kvec[v].iov_base = &tag_msg;
+       con->out_kvec[v++].iov_len = 1;
+       con->out_kvec[v].iov_base = &m->hdr;
+       con->out_kvec[v++].iov_len = sizeof(m->hdr);
+       con->out_kvec[v++] = m->front;
+       if (m->middle)
+               con->out_kvec[v++] = m->middle->vec;
+       con->out_kvec_left = v;
+       con->out_kvec_bytes += 1 + sizeof(m->hdr) + m->front.iov_len +
+               (m->middle ? m->middle->vec.iov_len : 0);
+       con->out_kvec_cur = con->out_kvec;
+
+       /* fill in crc (except data pages), footer */
+       con->out_msg->hdr.crc =
+               cpu_to_le32(crc32c(0, (void *)&m->hdr,
+                                     sizeof(m->hdr) - sizeof(m->hdr.crc)));
+       con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE;
+       con->out_msg->footer.front_crc =
+               cpu_to_le32(crc32c(0, m->front.iov_base, m->front.iov_len));
+       if (m->middle)
+               con->out_msg->footer.middle_crc =
+                       cpu_to_le32(crc32c(0, m->middle->vec.iov_base,
+                                          m->middle->vec.iov_len));
+       else
+               con->out_msg->footer.middle_crc = 0;
+       con->out_msg->footer.data_crc = 0;
+       /* the second value printed is the middle crc, so label it as such */
+       dout("prepare_write_message front_crc %u middle_crc %u\n",
+            le32_to_cpu(con->out_msg->footer.front_crc),
+            le32_to_cpu(con->out_msg->footer.middle_crc));
+
+       /* is there a data payload? */
+       if (le32_to_cpu(m->hdr.data_len) > 0) {
+               /* initialize page iterator */
+               con->out_msg_pos.page = 0;
+               if (m->pages)
+                       con->out_msg_pos.page_pos =
+                               le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK;
+               else
+                       con->out_msg_pos.page_pos = 0;
+               con->out_msg_pos.data_pos = 0;
+               con->out_msg_pos.did_page_crc = 0;
+               con->out_more = 1;  /* data + footer will follow */
+       } else {
+               /* no, queue up footer too and be done */
+               prepare_write_message_footer(con, v);
+       }
+
+       set_bit(WRITE_PENDING, &con->state);
+}
+
+/*
+ * Prepare an ack: record in_seq as acked and queue the ack tag
+ * followed by the little-endian sequence number in out_kvec[0..1].
+ */
+static void prepare_write_ack(struct ceph_connection *con)
+{
+       struct kvec *kv = con->out_kvec;
+
+       dout("prepare_write_ack %p %llu -> %llu\n", con,
+            con->in_seq_acked, con->in_seq);
+       con->in_seq_acked = con->in_seq;
+       con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
+
+       kv[0].iov_base = &tag_ack;
+       kv[0].iov_len = 1;
+       kv[1].iov_base = &con->out_temp_ack;
+       kv[1].iov_len = sizeof(con->out_temp_ack);
+
+       con->out_kvec_cur = kv;
+       con->out_kvec_left = 2;
+       con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
+       con->out_more = 1;  /* more will follow.. eventually.. */
+       set_bit(WRITE_PENDING, &con->state);
+}
+
+/*
+ * Prepare to write keepalive byte: a single kvec holding the
+ * keepalive tag.
+ */
+static void prepare_write_keepalive(struct ceph_connection *con)
+{
+       struct kvec *kv = con->out_kvec;
+
+       dout("prepare_write_keepalive %p\n", con);
+
+       kv->iov_base = &tag_keepalive;
+       kv->iov_len = 1;
+
+       con->out_kvec_cur = kv;
+       con->out_kvec_left = 1;
+       con->out_kvec_bytes = 1;
+       set_bit(WRITE_PENDING, &con->state);
+}
+
+/*
+ * Connection negotiation.
+ */
+
+/*
+ * Ask the connection's owner (via the optional get_authorizer op) for
+ * an authorizer and append it to the pending connect message.
+ *
+ * con->mutex is dropped around the callback and retaken afterwards.
+ * The authorizer protocol and length are recorded in out_connect and
+ * the authorizer buffer is added as the next out_kvec entry.  When no
+ * get_authorizer op is provided, a zero-length entry with a NULL base
+ * is queued.
+ */
+static void prepare_connect_authorizer(struct ceph_connection *con)
+{
+       /*
+        * Must start out NULL: without a get_authorizer op nothing
+        * assigns it, yet it is stored in out_kvec[] below.
+        */
+       void *auth_buf = NULL;
+       int auth_len = 0;
+       int auth_protocol = 0;
+
+       mutex_unlock(&con->mutex);
+       if (con->ops->get_authorizer)
+               con->ops->get_authorizer(con, &auth_buf, &auth_len,
+                                        &auth_protocol, &con->auth_reply_buf,
+                                        &con->auth_reply_buf_len,
+                                        con->auth_retry);
+       mutex_lock(&con->mutex);
+
+       con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol);
+       con->out_connect.authorizer_len = cpu_to_le32(auth_len);
+
+       con->out_kvec[con->out_kvec_left].iov_base = auth_buf;
+       con->out_kvec[con->out_kvec_left].iov_len = auth_len;
+       con->out_kvec_left++;
+       con->out_kvec_bytes += auth_len;
+}
+
+/*
+ * We connected to a peer and are saying hello: queue the banner string
+ * followed by our encoded address.
+ */
+static void prepare_write_banner(struct ceph_messenger *msgr,
+                                struct ceph_connection *con)
+{
+       struct kvec *kv = con->out_kvec;
+       int banner_len = strlen(CEPH_BANNER);
+
+       kv[0].iov_base = CEPH_BANNER;
+       kv[0].iov_len = banner_len;
+       kv[1].iov_base = &msgr->my_enc_addr;
+       kv[1].iov_len = sizeof(msgr->my_enc_addr);
+
+       con->out_kvec_cur = kv;
+       con->out_kvec_left = 2;
+       con->out_kvec_bytes = banner_len + sizeof(msgr->my_enc_addr);
+       con->out_more = 0;
+       set_bit(WRITE_PENDING, &con->state);
+}
+
+/*
+ * Fill in out_connect (features, host type, connect and global seq,
+ * protocol version chosen from the peer's entity type) and queue it,
+ * followed by the authorizer.  When @after_banner is set the connect
+ * message is appended to the out_kvec entries already queued;
+ * otherwise the kvec array is started afresh.
+ */
+static void prepare_write_connect(struct ceph_messenger *msgr,
+                                 struct ceph_connection *con,
+                                 int after_banner)
+{
+       unsigned global_seq = get_global_seq(con->msgr, 0);
+       struct kvec *kv;
+       int proto;
+
+       switch (con->peer_name.type) {
+       case CEPH_ENTITY_TYPE_MON:
+               proto = CEPH_MONC_PROTOCOL;
+               break;
+       case CEPH_ENTITY_TYPE_OSD:
+               proto = CEPH_OSDC_PROTOCOL;
+               break;
+       case CEPH_ENTITY_TYPE_MDS:
+               proto = CEPH_MDSC_PROTOCOL;
+               break;
+       default:
+               BUG();
+       }
+
+       dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
+            con->connect_seq, global_seq, proto);
+
+       con->out_connect.features = cpu_to_le64(msgr->supported_features);
+       con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
+       con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
+       con->out_connect.global_seq = cpu_to_le32(global_seq);
+       con->out_connect.protocol_version = cpu_to_le32(proto);
+       con->out_connect.flags = 0;
+
+       /* start from scratch unless we are appending to the banner */
+       if (!after_banner) {
+               con->out_kvec_left = 0;
+               con->out_kvec_bytes = 0;
+       }
+
+       kv = &con->out_kvec[con->out_kvec_left++];
+       kv->iov_base = &con->out_connect;
+       kv->iov_len = sizeof(con->out_connect);
+       con->out_kvec_bytes += sizeof(con->out_connect);
+
+       con->out_kvec_cur = con->out_kvec;
+       con->out_more = 0;
+       set_bit(WRITE_PENDING, &con->state);
+
+       prepare_connect_authorizer(con);
+}
+
+
+/*
+ * write as much of pending kvecs to the socket as we can.
+ *  1 -> done
+ *  0 -> socket full, but more to do
+ * <0 -> error
+ *
+ * Progress is tracked in out_kvec_cur / out_kvec_left / out_kvec_bytes,
+ * so a later call resumes where this one stopped.
+ */
+static int write_partial_kvec(struct ceph_connection *con)
+{
+       int ret;
+
+       dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes);
+       while (con->out_kvec_bytes > 0) {
+               ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur,
+                                      con->out_kvec_left, con->out_kvec_bytes,
+                                      con->out_more);
+               if (ret <= 0)
+                       goto out;
+               con->out_kvec_bytes -= ret;
+               if (con->out_kvec_bytes == 0)
+                       break;            /* done */
+               /* partial write: step the cursor past the bytes that went out */
+               while (ret > 0) {
+                       if (ret >= con->out_kvec_cur->iov_len) {
+                               /* this kvec was sent completely */
+                               ret -= con->out_kvec_cur->iov_len;
+                               con->out_kvec_cur++;
+                               con->out_kvec_left--;
+                       } else {
+                               /* sent only part of this kvec; trim its front */
+                               con->out_kvec_cur->iov_len -= ret;
+                               con->out_kvec_cur->iov_base += ret;
+                               ret = 0;
+                               break;
+                       }
+               }
+       }
+       con->out_kvec_left = 0;
+       con->out_kvec_is_msg = false;
+       ret = 1;
+out:
+       dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
+            con->out_kvec_bytes, con->out_kvec_left, ret);
+       return ret;  /* done! */
+}
+
+#ifdef CONFIG_BLOCK
+/*
+ * Point the (*iter, *seg) cursor at @bio and its current segment index,
+ * or clear the cursor (NULL, 0) when @bio is NULL.
+ */
+static void init_bio_iter(struct bio *bio, struct bio **iter, int *seg)
+{
+       *iter = bio;
+       *seg = bio ? bio->bi_idx : 0;
+}
+
+/*
+ * Advance the bio cursor by one segment, moving on to the next bio in
+ * the chain once the current one is exhausted.  A NULL cursor is left
+ * alone.
+ */
+static void iter_bio_next(struct bio **bio_iter, int *seg)
+{
+       struct bio *cur = *bio_iter;
+
+       if (!cur)
+               return;
+
+       BUG_ON(*seg >= cur->bi_vcnt);
+
+       if (++(*seg) == cur->bi_vcnt)
+               init_bio_iter(cur->bi_next, bio_iter, seg);
+}
+#endif
+
+/*
+ * Write as much message data payload as we can.  If we finish, queue
+ * up the footer.
+ *  1 -> done, footer is now queued in out_kvec[].
+ *  0 -> socket full, but more to do
+ * <0 -> error
+ *
+ * The payload comes from, in order of preference, the message's trail
+ * (once the non-trail data has been sent), its pages[] array, its
+ * pagelist, its bio chain, or the messenger's zero page.  Progress is
+ * kept in con->out_msg_pos so a later call resumes where we stopped.
+ *
+ * The data crc is calculated unless the messenger's nocrc option is
+ * set, in which case the footer is flagged CEPH_MSG_FOOTER_NOCRC.
+ */
+static int write_partial_msg_pages(struct ceph_connection *con)
+{
+       struct ceph_msg *msg = con->out_msg;
+       unsigned data_len = le32_to_cpu(msg->hdr.data_len);
+       size_t len;
+       /* crc is on by default; "nocrc" turns it off (was inverted before) */
+       int crc = !con->msgr->nocrc;
+       int ret;
+       int total_max_write;
+       int in_trail = 0;
+       size_t trail_len = (msg->trail ? msg->trail->length : 0);
+
+       dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n",
+            con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages,
+            con->out_msg_pos.page_pos);
+
+#ifdef CONFIG_BLOCK
+       if (msg->bio && !msg->bio_iter)
+               init_bio_iter(msg->bio, &msg->bio_iter, &msg->bio_seg);
+#endif
+
+       while (data_len > con->out_msg_pos.data_pos) {
+               struct page *page = NULL;
+               void *kaddr = NULL;
+               int max_write = PAGE_SIZE;
+               int page_shift = 0;
+
+               total_max_write = data_len - trail_len -
+                       con->out_msg_pos.data_pos;
+
+               /*
+                * if we are calculating the data crc (the default), we need
+                * to map the page.  if our pages[] has been revoked, use the
+                * zero page.
+                */
+
+               /* have we reached the trail part of the data? */
+               if (con->out_msg_pos.data_pos >= data_len - trail_len) {
+                       in_trail = 1;
+
+                       total_max_write = data_len - con->out_msg_pos.data_pos;
+
+                       page = list_first_entry(&msg->trail->head,
+                                               struct page, lru);
+                       if (crc)
+                               kaddr = kmap(page);
+                       max_write = PAGE_SIZE;
+               } else if (msg->pages) {
+                       page = msg->pages[con->out_msg_pos.page];
+                       if (crc)
+                               kaddr = kmap(page);
+               } else if (msg->pagelist) {
+                       page = list_first_entry(&msg->pagelist->head,
+                                               struct page, lru);
+                       if (crc)
+                               kaddr = kmap(page);
+#ifdef CONFIG_BLOCK
+               } else if (msg->bio) {
+                       struct bio_vec *bv;
+
+                       bv = bio_iovec_idx(msg->bio_iter, msg->bio_seg);
+                       page = bv->bv_page;
+                       page_shift = bv->bv_offset;
+                       if (crc)
+                               kaddr = kmap(page) + page_shift;
+                       max_write = bv->bv_len;
+#endif
+               } else {
+                       page = con->msgr->zero_page;
+                       if (crc)
+                               kaddr = page_address(con->msgr->zero_page);
+               }
+               len = min_t(int, max_write - con->out_msg_pos.page_pos,
+                           total_max_write);
+
+               /* fold this chunk into the data crc once, even if resent */
+               if (crc && !con->out_msg_pos.did_page_crc) {
+                       void *base = kaddr + con->out_msg_pos.page_pos;
+                       u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc);
+
+                       BUG_ON(kaddr == NULL);
+                       con->out_msg->footer.data_crc =
+                               cpu_to_le32(crc32c(tmpcrc, base, len));
+                       con->out_msg_pos.did_page_crc = 1;
+               }
+               ret = kernel_sendpage(con->sock, page,
+                                     con->out_msg_pos.page_pos + page_shift,
+                                     len,
+                                     MSG_DONTWAIT | MSG_NOSIGNAL |
+                                     MSG_MORE);
+
+               /* only pages that were kmap()ed above need unmapping */
+               if (crc &&
+                   (msg->pages || msg->pagelist || msg->bio || in_trail))
+                       kunmap(page);
+
+               if (ret <= 0)
+                       goto out;
+
+               con->out_msg_pos.data_pos += ret;
+               con->out_msg_pos.page_pos += ret;
+               if (ret == len) {
+                       /* finished this chunk; advance to the next one */
+                       con->out_msg_pos.page_pos = 0;
+                       con->out_msg_pos.page++;
+                       con->out_msg_pos.did_page_crc = 0;
+                       if (in_trail)
+                               list_move_tail(&page->lru,
+                                              &msg->trail->head);
+                       else if (msg->pagelist)
+                               list_move_tail(&page->lru,
+                                              &msg->pagelist->head);
+#ifdef CONFIG_BLOCK
+                       else if (msg->bio)
+                               iter_bio_next(&msg->bio_iter, &msg->bio_seg);
+#endif
+               }
+       }
+
+       dout("write_partial_msg_pages %p msg %p done\n", con, msg);
+
+       /* prepare and queue up footer, too */
+       if (!crc)
+               con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
+       con->out_kvec_bytes = 0;
+       con->out_kvec_left = 0;
+       con->out_kvec_cur = con->out_kvec;
+       prepare_write_message_footer(con, 0);
+       ret = 1;
+out:
+       return ret;
+}
+
+/*
+ * write some zeros: send con->out_skip bytes from the zero page, at
+ * most one page per send.
+ *  1 -> done
+ * <=0 -> result of the send that could not make progress
+ */
+static int write_partial_skip(struct ceph_connection *con)
+{
+       while (con->out_skip > 0) {
+               struct kvec zeros;
+               int sent;
+
+               zeros.iov_base = page_address(con->msgr->zero_page);
+               zeros.iov_len = min(con->out_skip, (int)PAGE_CACHE_SIZE);
+
+               sent = ceph_tcp_sendmsg(con->sock, &zeros, 1, zeros.iov_len, 1);
+               if (sent <= 0)
+                       return sent;
+               con->out_skip -= sent;
+       }
+       return 1;
+}
+
+/*
+ * Prepare to read connection handshake, or an ack.
+ *
+ * Reading the banner starts from input offset 0.
+ */
+static void prepare_read_banner(struct ceph_connection *con)
+{
+       con->in_base_pos = 0;
+       dout("prepare_read_banner %p\n", con);
+}
+
+/* Rewind the input offset before reading the connect reply. */
+static void prepare_read_connect(struct ceph_connection *con)
+{
+       con->in_base_pos = 0;
+       dout("prepare_read_connect %p\n", con);
+}
+
+/* Rewind the input offset before reading an ack. */
+static void prepare_read_ack(struct ceph_connection *con)
+{
+       con->in_base_pos = 0;
+       dout("prepare_read_ack %p\n", con);
+}
+
+/*
+ * Get ready to read the next tag: rewind the input offset and set
+ * in_tag to CEPH_MSGR_TAG_READY.
+ */
+static void prepare_read_tag(struct ceph_connection *con)
+{
+       con->in_tag = CEPH_MSGR_TAG_READY;
+       con->in_base_pos = 0;
+       dout("prepare_read_tag %p\n", con);
+}
+
+/*
+ * Prepare to read a message: there must be no message in progress.
+ * Rewinds the input offset and clears the running front, middle and
+ * data crcs.  Always returns 0.
+ */
+static int prepare_read_message(struct ceph_connection *con)
+{
+       dout("prepare_read_message %p\n", con);
+       BUG_ON(con->in_msg);
+
+       con->in_base_pos = 0;
+       con->in_data_crc = 0;
+       con->in_middle_crc = 0;
+       con->in_front_crc = 0;
+       return 0;
+}
+
+
+/*
+ * Read one @size-byte part of a multi-part input into @object.
+ *
+ * *@to is advanced by @size and then holds the cumulative end offset
+ * of this part; con->in_base_pos records how far the input has been
+ * consumed, so a part that was already read in full is skipped.
+ * Returns 1 once the part is complete, otherwise the (<= 0) result of
+ * the receive call.
+ */
+static int read_partial(struct ceph_connection *con,
+                       int *to, int size, void *object)
+{
+       int end;
+
+       *to += size;
+       end = *to;
+       while (con->in_base_pos < end) {
+               int want = end - con->in_base_pos;
+               int got = ceph_tcp_recvmsg(con->sock, object + (size - want),
+                                          want);
+
+               if (got <= 0)
+                       return got;
+               con->in_base_pos += got;
+       }
+       return 1;
+}
+
+
+/*
+ * Read all or part of the connect-side handshake on a new connection:
+ * the peer's banner, the peer's actual address, and our address as the
+ * peer sees it.  Returns 1 when everything has been read, otherwise the
+ * (<= 0) result of the read that could not complete.
+ */
+static int read_partial_banner(struct ceph_connection *con)
+{
+       int to = 0;
+       int ret;
+
+       dout("read_partial_banner %p at %d\n", con, con->in_base_pos);
+
+       /* peer's banner */
+       ret = read_partial(con, &to, strlen(CEPH_BANNER), con->in_banner);
+       if (ret <= 0)
+               return ret;
+
+       /* who the peer says it is */
+       ret = read_partial(con, &to, sizeof(con->actual_peer_addr),
+                          &con->actual_peer_addr);
+       if (ret <= 0)
+               return ret;
+
+       /* who the peer thinks we are */
+       return read_partial(con, &to, sizeof(con->peer_addr_for_me),
+                           &con->peer_addr_for_me);
+}
+
+/*
+ * Read all or part of the connect reply, followed by the authorizer
+ * reply whose length the connect reply announces.
+ *
+ * The announced authorizer length comes from the peer and is checked
+ * against the size of con->auth_reply_buf before anything is read into
+ * it; an oversized reply fails with -EINVAL.
+ *
+ * Returns 1 when everything has been read, 0 or a negative value from
+ * the read that could not complete, or -EINVAL as described above.
+ */
+static int read_partial_connect(struct ceph_connection *con)
+{
+       int ret, to = 0;
+       u32 auth_len;
+
+       dout("read_partial_connect %p at %d\n", con, con->in_base_pos);
+
+       ret = read_partial(con, &to, sizeof(con->in_reply), &con->in_reply);
+       if (ret <= 0)
+               goto out;
+
+       /* never let the peer make us write past the reply buffer */
+       auth_len = le32_to_cpu(con->in_reply.authorizer_len);
+       if (auth_len > con->auth_reply_buf_len) {
+               pr_err("connect to %s authorizer reply too big: %u > %u\n",
+                      ceph_pr_addr(&con->peer_addr.in_addr),
+                      auth_len, (unsigned)con->auth_reply_buf_len);
+               con->error_msg = "protocol error, bad authorizer reply length";
+               ret = -EINVAL;
+               goto out;
+       }
+
+       ret = read_partial(con, &to, auth_len, con->auth_reply_buf);
+       if (ret <= 0)
+               goto out;
+
+       dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n",
+            con, (int)con->in_reply.tag,
+            le32_to_cpu(con->in_reply.connect_seq),
+            le32_to_cpu(con->in_reply.global_seq));
+out:
+       return ret;
+
+}
+
+/*
+ * Verify the hello banner looks okay: it must match CEPH_BANNER.
+ * Returns 0 on a match; otherwise logs the problem, sets
+ * con->error_msg and returns -1.
+ */
+static int verify_hello(struct ceph_connection *con)
+{
+       if (!memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER)))
+               return 0;
+
+       pr_err("connect to %s got bad banner\n",
+              ceph_pr_addr(&con->peer_addr.in_addr));
+       con->error_msg = "protocol error, bad banner";
+       return -1;
+}
+
+/*
+ * Return true if @ss holds an all-zero IPv4 or IPv6 address.  Any
+ * other address family is never considered blank.
+ */
+static bool addr_is_blank(struct sockaddr_storage *ss)
+{
+       switch (ss->ss_family) {
+       case AF_INET: {
+               struct sockaddr_in *in4 = (struct sockaddr_in *)ss;
+
+               return in4->sin_addr.s_addr == 0;
+       }
+       case AF_INET6: {
+               struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)ss;
+
+               /* blank iff none of the four 32-bit words has a bit set */
+               return !(in6->sin6_addr.s6_addr32[0] |
+                        in6->sin6_addr.s6_addr32[1] |
+                        in6->sin6_addr.s6_addr32[2] |
+                        in6->sin6_addr.s6_addr32[3]);
+       }
+       }
+       return false;
+}
+
+/*
+ * Return the port of an IPv4 or IPv6 address in host byte order, or 0
+ * for any other address family.
+ */
+static int addr_port(struct sockaddr_storage *ss)
+{
+       if (ss->ss_family == AF_INET)
+               return ntohs(((struct sockaddr_in *)ss)->sin_port);
+       if (ss->ss_family == AF_INET6)
+               return ntohs(((struct sockaddr_in6 *)ss)->sin6_port);
+       return 0;
+}
+
+/*
+ * Store port @p (host byte order) into an IPv4 or IPv6 address in
+ * network byte order.  Addresses of any other family are left
+ * untouched.
+ */
+static void addr_set_port(struct sockaddr_storage *ss, int p)
+{
+       switch (ss->ss_family) {
+       case AF_INET:
+               ((struct sockaddr_in *)ss)->sin_port = htons(p);
+               break;  /* do not fall through into the IPv6 store */
+       case AF_INET6:
+               ((struct sockaddr_in6 *)ss)->sin6_port = htons(p);
+               break;
+       }
+}
+
+/*
+ * Parse an ip[:port] list into an addr array.  Use the default
+ * monitor port if a port isn't specified.
+ *
+ * @c..@end delimits the text, a comma-separated list of IPv4 or IPv6
+ * addresses; an address may be wrapped in [] and may be followed by
+ * :port.  At most @max_count entries of @addr are filled in, and the
+ * number parsed is stored in *@count when @count is non-NULL.
+ *
+ * Returns 0 on success, or -EINVAL if the text is malformed, holds a
+ * port outside 1..65535, has an empty entry (including a trailing
+ * comma), or does not fit in @max_count entries.
+ */
+int ceph_parse_ips(const char *c, const char *end,
+                  struct ceph_entity_addr *addr,
+                  int max_count, int *count)
+{
+       int i;
+       const char *p = c;
+
+       dout("parse_ips on '%.*s'\n", (int)(end-c), c);
+       for (i = 0; i < max_count; i++) {
+               const char *ipend;
+               struct sockaddr_storage *ss = &addr[i].in_addr;
+               struct sockaddr_in *in4 = (void *)ss;
+               struct sockaddr_in6 *in6 = (void *)ss;
+               int port;
+               char delim = ',';
+
+               /* never look at *p without knowing it is inside the text */
+               if (p < end && *p == '[') {
+                       delim = ']';
+                       p++;
+               }
+
+               memset(ss, 0, sizeof(*ss));
+               if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr,
+                            delim, &ipend))
+                       ss->ss_family = AF_INET;
+               else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr,
+                                 delim, &ipend))
+                       ss->ss_family = AF_INET6;
+               else
+                       goto bad;
+               p = ipend;
+
+               if (delim == ']') {
+                       if (p >= end || *p != ']') {
+                               dout("missing matching ']'\n");
+                               goto bad;
+                       }
+                       p++;
+               }
+
+               /* port? */
+               if (p < end && *p == ':') {
+                       port = 0;
+                       p++;
+                       while (p < end && *p >= '0' && *p <= '9') {
+                               port = (port * 10) + (*p - '0');
+                               /* bail out early so port cannot overflow */
+                               if (port > 65535)
+                                       goto bad;
+                               p++;
+                       }
+                       if (port == 0)
+                               goto bad;
+               } else {
+                       port = CEPH_MON_PORT;
+               }
+
+               addr_set_port(ss, port);
+
+               dout("parse_ips got %s\n", ceph_pr_addr(ss));
+
+               if (p == end)
+                       break;
+               if (*p != ',')
+                       goto bad;
+               p++;
+       }
+
+       /*
+        * Falling out of the loop without hitting the end of the text
+        * means either more entries than @max_count or a trailing ','.
+        */
+       if (i == max_count || p != end)
+               goto bad;
+
+       if (count)
+               *count = i + 1;
+       return 0;
+
+bad:
+       pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c);
+       return -EINVAL;
+}
+EXPORT_SYMBOL(ceph_parse_ips);
+
+/*
+ * Process the banner and addresses read by read_partial_banner():
+ * check the banner, decode both addresses, make sure the peer is the
+ * one we meant to reach, and learn our own address from the peer if we
+ * do not know it yet.  On success sets NEGOTIATING and prepares to
+ * read the connect reply.
+ *
+ * Returns 0 on success, or -1 (with con->error_msg set) on a bad
+ * banner or an unexpected peer.
+ */
+static int process_banner(struct ceph_connection *con)
+{
+       dout("process_banner on %p\n", con);
+
+       if (verify_hello(con) < 0)
+               return -1;
+
+       ceph_decode_addr(&con->actual_peer_addr);
+       ceph_decode_addr(&con->peer_addr_for_me);
+
+       /*
+        * Make sure the other end is who we wanted.  note that the other
+        * end may not yet know their ip address, so if it's 0.0.0.0, give
+        * them the benefit of the doubt.
+        */
+       if (memcmp(&con->peer_addr, &con->actual_peer_addr,
+                  sizeof(con->peer_addr)) != 0 &&
+           !(addr_is_blank(&con->actual_peer_addr.in_addr) &&
+             con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
+               pr_warning("wrong peer, want %s/%d, got %s/%d\n",
+                          ceph_pr_addr(&con->peer_addr.in_addr),
+                          (int)le32_to_cpu(con->peer_addr.nonce),
+                          ceph_pr_addr(&con->actual_peer_addr.in_addr),
+                          (int)le32_to_cpu(con->actual_peer_addr.nonce));
+               con->error_msg = "wrong peer at address";
+               return -1;
+       }
+
+       /*
+        * did we learn our address?
+        */
+       if (addr_is_blank(&con->msgr->inst.addr.in_addr)) {
+               /* keep our own port; only the ip comes from the peer */
+               int port = addr_port(&con->msgr->inst.addr.in_addr);
+
+               memcpy(&con->msgr->inst.addr.in_addr,
+                      &con->peer_addr_for_me.in_addr,
+                      sizeof(con->peer_addr_for_me.in_addr));
+               addr_set_port(&con->msgr->inst.addr.in_addr, port);
+               encode_my_addr(con->msgr);
+               dout("process_banner learned my addr is %s\n",
+                    ceph_pr_addr(&con->msgr->inst.addr.in_addr));
+       }
+
+       set_bit(NEGOTIATING, &con->state);
+       prepare_read_connect(con);
+       return 0;
+}
+
+/*
+ * Give up on a connection after a protocol-level failure: reset it, mark
+ * it CLOSED and tell the owner through the optional ->bad_proto callback.
+ *
+ * con->mutex is released around the callback and re-acquired before
+ * returning, so the caller is expected to hold it on entry.
+ */
+static void fail_protocol(struct ceph_connection *con)
+{
+       reset_connection(con);
+       set_bit(CLOSED, &con->state);  /* in case there's queued work */
+
+       mutex_unlock(&con->mutex);
+       if (con->ops->bad_proto)
+               con->ops->bad_proto(con);
+       mutex_lock(&con->mutex);
+}
+
+/*
+ * Act on the server's reply to our connect request (con->in_reply).
+ *
+ * Depending on the reply tag this either fails the connection (feature
+ * or protocol mismatch, repeated authorizer rejection, WAIT or an unknown
+ * tag), re-sends the connect request with adjusted state (bad authorizer,
+ * session reset, retry with a different connect_seq or global_seq), or
+ * completes the handshake (READY).
+ *
+ * con->mutex is dropped and re-taken around the ->peer_reset callback
+ * and, via fail_protocol(), around ->bad_proto.
+ *
+ * Returns 0 if the connection attempt may continue, -1 on failure.
+ */
+static int process_connect(struct ceph_connection *con)
+{
+       u64 sup_feat = con->msgr->supported_features;
+       u64 req_feat = con->msgr->required_features;
+       u64 server_feat = le64_to_cpu(con->in_reply.features);
+
+       dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
+
+       switch (con->in_reply.tag) {
+       case CEPH_MSGR_TAG_FEATURES:
+               pr_err("%s%lld %s feature set mismatch,"
+                      " my %llx < server's %llx, missing %llx\n",
+                      ENTITY_NAME(con->peer_name),
+                      ceph_pr_addr(&con->peer_addr.in_addr),
+                      sup_feat, server_feat, server_feat & ~sup_feat);
+               con->error_msg = "missing required protocol features";
+               fail_protocol(con);
+               return -1;
+
+       case CEPH_MSGR_TAG_BADPROTOVER:
+               pr_err("%s%lld %s protocol version mismatch,"
+                      " my %d != server's %d\n",
+                      ENTITY_NAME(con->peer_name),
+                      ceph_pr_addr(&con->peer_addr.in_addr),
+                      le32_to_cpu(con->out_connect.protocol_version),
+                      le32_to_cpu(con->in_reply.protocol_version));
+               con->error_msg = "protocol version mismatch";
+               fail_protocol(con);
+               return -1;
+
+       case CEPH_MSGR_TAG_BADAUTHORIZER:
+               /*
+                * The first rejection (auth_retry 0 -> 1) is retried; a
+                * rejection while auth_retry is already 1 is fatal.
+                */
+               con->auth_retry++;
+               dout("process_connect %p got BADAUTHORIZER attempt %d\n", con,
+                    con->auth_retry);
+               if (con->auth_retry == 2) {
+                       con->error_msg = "connect authorization failure";
+                       reset_connection(con);
+                       set_bit(CLOSED, &con->state);
+                       return -1;
+               }
+               con->auth_retry = 1;
+               prepare_write_connect(con->msgr, con, 0);
+               prepare_read_connect(con);
+               break;
+
+       case CEPH_MSGR_TAG_RESETSESSION:
+               /*
+                * If we connected with a large connect_seq but the peer
+                * has no record of a session with us (no connection, or
+                * connect_seq == 0), they will send RESETSESION to indicate
+                * that they must have reset their session, and may have
+                * dropped messages.
+                */
+               dout("process_connect got RESET peer seq %u\n",
+                    le32_to_cpu(con->in_reply.connect_seq));
+               pr_err("%s%lld %s connection reset\n",
+                      ENTITY_NAME(con->peer_name),
+                      ceph_pr_addr(&con->peer_addr.in_addr));
+               reset_connection(con);
+               prepare_write_connect(con->msgr, con, 0);
+               prepare_read_connect(con);
+
+               /* Tell ceph about it. */
+               mutex_unlock(&con->mutex);
+               pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name));
+               if (con->ops->peer_reset)
+                       con->ops->peer_reset(con);
+               mutex_lock(&con->mutex);
+               break;
+
+       case CEPH_MSGR_TAG_RETRY_SESSION:
+               /*
+                * If we sent a smaller connect_seq than the peer has, try
+                * again with a larger value.
+                *
+                * The peer's value is taken from the reply we are
+                * processing (in_reply), like every other field this
+                * function examines.
+                */
+               dout("process_connect got RETRY my seq = %u, peer_seq = %u\n",
+                    le32_to_cpu(con->out_connect.connect_seq),
+                    le32_to_cpu(con->in_reply.connect_seq));
+               con->connect_seq = le32_to_cpu(con->in_reply.connect_seq);
+               prepare_write_connect(con->msgr, con, 0);
+               prepare_read_connect(con);
+               break;
+
+       case CEPH_MSGR_TAG_RETRY_GLOBAL:
+               /*
+                * If we sent a smaller global_seq than the peer has, try
+                * again with a larger value.
+                */
+               dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
+                    con->peer_global_seq,
+                    le32_to_cpu(con->in_reply.global_seq));
+               get_global_seq(con->msgr,
+                              le32_to_cpu(con->in_reply.global_seq));
+               prepare_write_connect(con->msgr, con, 0);
+               prepare_read_connect(con);
+               break;
+
+       case CEPH_MSGR_TAG_READY:
+               /* the server must offer everything we require */
+               if (req_feat & ~server_feat) {
+                       pr_err("%s%lld %s protocol feature mismatch,"
+                              " my required %llx > server's %llx, need %llx\n",
+                              ENTITY_NAME(con->peer_name),
+                              ceph_pr_addr(&con->peer_addr.in_addr),
+                              req_feat, server_feat, req_feat & ~server_feat);
+                       con->error_msg = "missing required protocol features";
+                       fail_protocol(con);
+                       return -1;
+               }
+               clear_bit(CONNECTING, &con->state);
+               con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
+               con->connect_seq++;
+               con->peer_features = server_feat;
+               dout("process_connect got READY gseq %d cseq %d (%d)\n",
+                    con->peer_global_seq,
+                    le32_to_cpu(con->in_reply.connect_seq),
+                    con->connect_seq);
+               WARN_ON(con->connect_seq !=
+                       le32_to_cpu(con->in_reply.connect_seq));
+
+               if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
+                       set_bit(LOSSYTX, &con->state);
+
+               prepare_read_tag(con);
+               break;
+
+       case CEPH_MSGR_TAG_WAIT:
+               /*
+                * If there is a connection race (we are opening
+                * connections to each other), one of us may just have
+                * to WAIT.  This shouldn't happen if we are the
+                * client.
+                */
+               pr_err("process_connect peer connecting WAIT\n");
+               /* fall through - treated like any other unexpected tag */
+
+       default:
+               pr_err("connect protocol error, will retry\n");
+               con->error_msg = "protocol error, garbage tag during connect";
+               return -1;
+       }
+       return 0;
+}
+
+
+/*
+ * Read (the rest of) an ack into con->in_temp_ack.
+ *
+ * Returns whatever read_partial() returns.
+ */
+static int read_partial_ack(struct ceph_connection *con)
+{
+       int offset = 0;
+
+       return read_partial(con, &offset, sizeof(con->in_temp_ack),
+                           &con->in_temp_ack);
+}
+
+
+/*
+ * Discard every sent message the peer has now acknowledged.
+ *
+ * Walks con->out_sent from the front, removing messages whose sequence
+ * number is <= the value in con->in_temp_ack, and stops at the first
+ * message with a higher sequence number.  Afterwards the connection goes
+ * back to reading a tag.
+ */
+static void process_ack(struct ceph_connection *con)
+{
+       u64 acked = le64_to_cpu(con->in_temp_ack);
+
+       for (;;) {
+               struct ceph_msg *msg;
+               u64 msg_seq;
+
+               if (list_empty(&con->out_sent))
+                       break;
+
+               msg = list_first_entry(&con->out_sent, struct ceph_msg,
+                                      list_head);
+               msg_seq = le64_to_cpu(msg->hdr.seq);
+               if (msg_seq > acked)
+                       break;
+
+               dout("got ack for seq %llu type %d at %p\n", msg_seq,
+                    le16_to_cpu(msg->hdr.type), msg);
+               ceph_msg_remove(msg);
+       }
+       prepare_read_tag(con);
+}
+
+
+
+
+/*
+ * Read (more of) one kvec-backed section of an incoming message.
+ *
+ * section->iov_len records how many bytes have been received so far; new
+ * data is appended at iov_base + iov_len until sec_len bytes are present.
+ * At the moment the section becomes complete its crc32c is stored in *crc.
+ *
+ * Returns 1 once the section holds sec_len bytes, otherwise the <= 0
+ * value returned by ceph_tcp_recvmsg().
+ */
+static int read_partial_message_section(struct ceph_connection *con,
+                                       struct kvec *section,
+                                       unsigned int sec_len, u32 *crc)
+{
+       BUG_ON(!section);
+
+       while (section->iov_len < sec_len) {
+               char *dst;
+               int want, got;
+
+               BUG_ON(section->iov_base == NULL);
+               want = sec_len - section->iov_len;
+               dst = (char *)section->iov_base + section->iov_len;
+               got = ceph_tcp_recvmsg(con->sock, dst, want);
+               if (got <= 0)
+                       return got;
+
+               section->iov_len += got;
+               if (section->iov_len == sec_len)
+                       *crc = crc32c(0, section->iov_base,
+                                     section->iov_len);
+       }
+
+       return 1;
+}
+
+static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
+                               struct ceph_msg_header *hdr,
+                               int *skip);
+
+
+/*
+ * Receive more of the data payload into the page array of the message
+ * being read.
+ *
+ * One ceph_tcp_recvmsg() call is made, bounded by both the payload bytes
+ * still missing and the room left in the current page.  When datacrc is
+ * non-zero the received bytes are folded into con->in_data_crc.
+ * con->in_msg_pos is advanced, moving to the next page once the current
+ * one has been filled.
+ *
+ * Returns the number of bytes received, or the <= 0 result of
+ * ceph_tcp_recvmsg().
+ */
+static int read_partial_message_pages(struct ceph_connection *con,
+                                     struct page **pages,
+                                     unsigned data_len, int datacrc)
+{
+       struct page *page;
+       void *dst;
+       int want, got;
+
+       want = min((int)(data_len - con->in_msg_pos.data_pos),
+                  (int)(PAGE_SIZE - con->in_msg_pos.page_pos));
+       /* (page) data */
+       BUG_ON(pages == NULL);
+       page = pages[con->in_msg_pos.page];
+       dst = kmap(page) + con->in_msg_pos.page_pos;
+       got = ceph_tcp_recvmsg(con->sock, dst, want);
+       if (got > 0 && datacrc)
+               con->in_data_crc = crc32c(con->in_data_crc, dst, got);
+       kunmap(page);
+       if (got <= 0)
+               return got;
+
+       con->in_msg_pos.data_pos += got;
+       con->in_msg_pos.page_pos += got;
+       if (con->in_msg_pos.page_pos == PAGE_SIZE) {
+               con->in_msg_pos.page_pos = 0;
+               con->in_msg_pos.page++;
+       }
+
+       return got;
+}
+
+#ifdef CONFIG_BLOCK
+/*
+ * Receive more of the data payload into the bio attached to the message
+ * being read.
+ *
+ * One ceph_tcp_recvmsg() call is made into the current bio_vec, bounded
+ * by both the payload bytes still missing and the room left in that
+ * bio_vec.  When datacrc is non-zero the received bytes are folded into
+ * con->in_data_crc.  con->in_msg_pos is advanced, and the bio iterator is
+ * stepped once the current bio_vec has been filled.
+ *
+ * Returns the number of bytes received, the <= 0 result of
+ * ceph_tcp_recvmsg(), or PTR_ERR() of the bio_vec lookup if that failed.
+ */
+static int read_partial_message_bio(struct ceph_connection *con,
+                                   struct bio **bio_iter, int *bio_seg,
+                                   unsigned data_len, int datacrc)
+{
+       struct bio_vec *bv = bio_iovec_idx(*bio_iter, *bio_seg);
+       void *dst;
+       int want, got;
+
+       if (IS_ERR(bv))
+               return PTR_ERR(bv);
+
+       want = min((int)(data_len - con->in_msg_pos.data_pos),
+                  (int)(bv->bv_len - con->in_msg_pos.page_pos));
+
+       dst = kmap(bv->bv_page) + bv->bv_offset + con->in_msg_pos.page_pos;
+
+       got = ceph_tcp_recvmsg(con->sock, dst, want);
+       if (got > 0 && datacrc)
+               con->in_data_crc = crc32c(con->in_data_crc, dst, got);
+       kunmap(bv->bv_page);
+       if (got <= 0)
+               return got;
+
+       con->in_msg_pos.data_pos += got;
+       con->in_msg_pos.page_pos += got;
+       if (con->in_msg_pos.page_pos == bv->bv_len) {
+               con->in_msg_pos.page_pos = 0;
+               iter_bio_next(bio_iter, bio_seg);
+       }
+
+       return got;
+}
+#endif
+
+/*
+ * read (part of) a message.
+ *
+ * Progress is kept in the connection (in_base_pos, in_msg, in_msg_pos and
+ * the iov_len of each section), so the function can be re-entered after a
+ * short read.  The stages are: header (CRC checked), length and sequence
+ * validation, message allocation, front, middle, data, footer, and the
+ * final CRC comparison.
+ *
+ * Returns:
+ *   1         the whole message has been read and its CRCs match
+ *   0         either a receive call returned 0, or the message is being
+ *             skipped (old sequence number, or ceph_alloc_msg() asked to
+ *             skip); in the skip case in_base_pos is set negative and
+ *             in_tag is reset to CEPH_MSGR_TAG_READY
+ *   -EBADMSG  header/front/middle/data CRC mismatch, or a sequence number
+ *             further ahead than expected
+ *   -EIO      a length field in the header exceeds its limit
+ *   -ENOMEM   no message could be allocated
+ *   < 0       any error returned by a receive call
+ */
+static int read_partial_message(struct ceph_connection *con)
+{
+       struct ceph_msg *m = con->in_msg;
+       int ret;
+       int to, left;
+       unsigned front_len, middle_len, data_len, data_off;
+       /* data CRCs are computed and verified unless "nocrc" is set */
+       int datacrc = !con->msgr->nocrc;
+       int skip;
+       u64 seq;
+
+       dout("read_partial_message con %p msg %p\n", con, m);
+
+       /* header */
+       while (con->in_base_pos < sizeof(con->in_hdr)) {
+               left = sizeof(con->in_hdr) - con->in_base_pos;
+               ret = ceph_tcp_recvmsg(con->sock,
+                                      (char *)&con->in_hdr + con->in_base_pos,
+                                      left);
+               if (ret <= 0)
+                       return ret;
+               con->in_base_pos += ret;
+               if (con->in_base_pos == sizeof(con->in_hdr)) {
+                       /* the crc covers the header minus the crc field */
+                       u32 crc = crc32c(0, (void *)&con->in_hdr,
+                                sizeof(con->in_hdr) - sizeof(con->in_hdr.crc));
+                       if (crc != le32_to_cpu(con->in_hdr.crc)) {
+                               pr_err("read_partial_message bad hdr "
+                                      " crc %u != expected %u\n",
+                                      crc, le32_to_cpu(con->in_hdr.crc));
+                               return -EBADMSG;
+                       }
+               }
+       }
+       front_len = le32_to_cpu(con->in_hdr.front_len);
+       if (front_len > CEPH_MSG_MAX_FRONT_LEN)
+               return -EIO;
+       middle_len = le32_to_cpu(con->in_hdr.middle_len);
+       if (middle_len > CEPH_MSG_MAX_DATA_LEN)
+               return -EIO;
+       data_len = le32_to_cpu(con->in_hdr.data_len);
+       if (data_len > CEPH_MSG_MAX_DATA_LEN)
+               return -EIO;
+       data_off = le16_to_cpu(con->in_hdr.data_off);
+
+       /* verify seq# */
+       seq = le64_to_cpu(con->in_hdr.seq);
+       if ((s64)seq - (s64)con->in_seq < 1) {
+               /* not newer than what we already have: discard the payload */
+               pr_info("skipping %s%lld %s seq %lld, expected %lld\n",
+                       ENTITY_NAME(con->peer_name),
+                       ceph_pr_addr(&con->peer_addr.in_addr),
+                       seq, con->in_seq + 1);
+               con->in_base_pos = -front_len - middle_len - data_len -
+                       sizeof(m->footer);
+               con->in_tag = CEPH_MSGR_TAG_READY;
+               con->in_seq++;
+               return 0;
+       } else if ((s64)seq - (s64)con->in_seq > 1) {
+               pr_err("read_partial_message bad seq %lld expected %lld\n",
+                      seq, con->in_seq + 1);
+               con->error_msg = "bad message sequence # for incoming message";
+               return -EBADMSG;
+       }
+
+       /* allocate message? */
+       if (!con->in_msg) {
+               dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
+                    con->in_hdr.front_len, con->in_hdr.data_len);
+               skip = 0;
+               con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
+               if (skip) {
+                       /* skip this message */
+                       dout("alloc_msg said skip message\n");
+                       BUG_ON(con->in_msg);
+                       con->in_base_pos = -front_len - middle_len - data_len -
+                               sizeof(m->footer);
+                       con->in_tag = CEPH_MSGR_TAG_READY;
+                       con->in_seq++;
+                       return 0;
+               }
+               if (!con->in_msg) {
+                       con->error_msg =
+                               "error allocating memory for incoming message";
+                       return -ENOMEM;
+               }
+               m = con->in_msg;
+               m->front.iov_len = 0;    /* haven't read it yet */
+               if (m->middle)
+                       m->middle->vec.iov_len = 0;
+
+               con->in_msg_pos.page = 0;
+               if (m->pages)
+                       con->in_msg_pos.page_pos = data_off & ~PAGE_MASK;
+               else
+                       con->in_msg_pos.page_pos = 0;
+               con->in_msg_pos.data_pos = 0;
+       }
+
+       /* front */
+       ret = read_partial_message_section(con, &m->front, front_len,
+                                          &con->in_front_crc);
+       if (ret <= 0)
+               return ret;
+
+       /* middle */
+       if (m->middle) {
+               ret = read_partial_message_section(con, &m->middle->vec,
+                                                  middle_len,
+                                                  &con->in_middle_crc);
+               if (ret <= 0)
+                       return ret;
+       }
+#ifdef CONFIG_BLOCK
+       if (m->bio && !m->bio_iter)
+               init_bio_iter(m->bio, &m->bio_iter, &m->bio_seg);
+#endif
+
+       /* (page) data */
+       while (con->in_msg_pos.data_pos < data_len) {
+               if (m->pages) {
+                       ret = read_partial_message_pages(con, m->pages,
+                                                data_len, datacrc);
+                       if (ret <= 0)
+                               return ret;
+#ifdef CONFIG_BLOCK
+               } else if (m->bio) {
+
+                       ret = read_partial_message_bio(con,
+                                                &m->bio_iter, &m->bio_seg,
+                                                data_len, datacrc);
+                       if (ret <= 0)
+                               return ret;
+#endif
+               } else {
+                       BUG_ON(1);
+               }
+       }
+
+       /* footer */
+       to = sizeof(m->hdr) + sizeof(m->footer);
+       while (con->in_base_pos < to) {
+               left = to - con->in_base_pos;
+               ret = ceph_tcp_recvmsg(con->sock, (char *)&m->footer +
+                                      (con->in_base_pos - sizeof(m->hdr)),
+                                      left);
+               if (ret <= 0)
+                       return ret;
+               con->in_base_pos += ret;
+       }
+       dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n",
+            m, front_len, m->footer.front_crc, middle_len,
+            m->footer.middle_crc, data_len, m->footer.data_crc);
+
+       /* crc ok? */
+       if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) {
+               pr_err("read_partial_message %p front crc %u != exp. %u\n",
+                      m, con->in_front_crc,
+                      le32_to_cpu(m->footer.front_crc));
+               return -EBADMSG;
+       }
+       if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) {
+               pr_err("read_partial_message %p middle crc %u != exp %u\n",
+                      m, con->in_middle_crc,
+                      le32_to_cpu(m->footer.middle_crc));
+               return -EBADMSG;
+       }
+       /* the sender may also have opted out of the data crc per message */
+       if (datacrc &&
+           (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 &&
+           con->in_data_crc != le32_to_cpu(m->footer.data_crc)) {
+               pr_err("read_partial_message %p data crc %u != exp. %u\n", m,
+                      con->in_data_crc, le32_to_cpu(m->footer.data_crc));
+               return -EBADMSG;
+       }
+
+       return 1; /* done! */
+}
+
+/*
+ * Process message.  This happens in the worker thread.  The callback should
+ * be careful not to do anything that waits on other incoming messages or it
+ * may deadlock.
+ *
+ * Ownership of con->in_msg passes to the ->dispatch callback: the pointer
+ * is cleared on the connection before the callback runs.  con->mutex is
+ * released for the duration of the callback and re-acquired afterwards,
+ * so the caller is expected to hold it on entry.  On return the
+ * connection is set up to read the next tag.
+ */
+static void process_message(struct ceph_connection *con)
+{
+       struct ceph_msg *msg;
+
+       msg = con->in_msg;
+       con->in_msg = NULL;
+
+       /* if first message, set peer_name */
+       if (con->peer_name.type == 0)
+               con->peer_name = msg->hdr.src;
+
+       con->in_seq++;
+       mutex_unlock(&con->mutex);
+
+       dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
+            msg, le64_to_cpu(msg->hdr.seq),
+            ENTITY_NAME(msg->hdr.src),
+            le16_to_cpu(msg->hdr.type),
+            ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
+            le32_to_cpu(msg->hdr.front_len),
+            le32_to_cpu(msg->hdr.data_len),
+            con->in_front_crc, con->in_middle_crc, con->in_data_crc);
+       con->ops->dispatch(con, msg);
+
+       mutex_lock(&con->mutex);
+       prepare_read_tag(con);
+}
+
+
+/*
+ * Write something to the socket.  Called in a worker thread when the
+ * socket appears to be writeable and we have something ready to send.
+ *
+ * If no socket exists yet, the banner and connect request are prepared
+ * and a connection is initiated first.  After that, pending skip bytes,
+ * queued kvecs and message pages are written, and further work (queued
+ * messages, acks, keepalives) is prepared until nothing is left or a
+ * write helper makes no progress.
+ *
+ * Returns -1 if the socket could not be opened (con->error_msg is set),
+ * 0 otherwise.
+ *
+ * NOTE(review): every "goto done" path resets ret to 0, so a negative
+ * result from write_partial_skip(), write_partial_kvec() or
+ * write_partial_msg_pages() is not reported to the caller.  Confirm
+ * whether that is intended.
+ */
+static int try_write(struct ceph_connection *con)
+{
+       struct ceph_messenger *msgr = con->msgr;
+       int ret = 1;
+
+       dout("try_write start %p state %lu nref %d\n", con, con->state,
+            atomic_read(&con->nref));
+
+more:
+       dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
+
+       /* open the socket first? */
+       if (con->sock == NULL) {
+               /*
+                * if we were STANDBY and are reconnecting _this_
+                * connection, bump connect_seq now.  Always bump
+                * global_seq.
+                */
+               if (test_and_clear_bit(STANDBY, &con->state))
+                       con->connect_seq++;
+
+               prepare_write_banner(msgr, con);
+               prepare_write_connect(msgr, con, 1);
+               prepare_read_banner(con);
+               set_bit(CONNECTING, &con->state);
+               clear_bit(NEGOTIATING, &con->state);
+
+               BUG_ON(con->in_msg);
+               con->in_tag = CEPH_MSGR_TAG_READY;
+               dout("try_write initiating connect on %p new state %lu\n",
+                    con, con->state);
+               con->sock = ceph_tcp_connect(con);
+               if (IS_ERR(con->sock)) {
+                       con->sock = NULL;
+                       con->error_msg = "connect error";
+                       ret = -1;
+                       goto out;
+               }
+       }
+
+more_kvec:
+       /* kvec data queued? */
+       if (con->out_skip) {
+               ret = write_partial_skip(con);
+               /* check for errors first so that they are actually logged */
+               if (ret < 0) {
+                       dout("try_write write_partial_skip err %d\n", ret);
+                       goto done;
+               }
+               if (ret == 0)
+                       goto done;
+       }
+       if (con->out_kvec_left) {
+               ret = write_partial_kvec(con);
+               if (ret <= 0)
+                       goto done;
+       }
+
+       /* msg pages? */
+       if (con->out_msg) {
+               if (con->out_msg_done) {
+                       ceph_msg_put(con->out_msg);
+                       con->out_msg = NULL;   /* we're done with this one */
+                       goto do_next;
+               }
+
+               ret = write_partial_msg_pages(con);
+               if (ret == 1)
+                       goto more_kvec;  /* we need to send the footer, too! */
+               if (ret == 0)
+                       goto done;
+               if (ret < 0) {
+                       dout("try_write write_partial_msg_pages err %d\n",
+                            ret);
+                       goto done;
+               }
+       }
+
+do_next:
+       /* while still connecting, only the handshake may be sent */
+       if (!test_bit(CONNECTING, &con->state)) {
+               /* is anything else pending? */
+               if (!list_empty(&con->out_queue)) {
+                       prepare_write_message(con);
+                       goto more;
+               }
+               if (con->in_seq > con->in_seq_acked) {
+                       prepare_write_ack(con);
+                       goto more;
+               }
+               if (test_and_clear_bit(KEEPALIVE_PENDING, &con->state)) {
+                       prepare_write_keepalive(con);
+                       goto more;
+               }
+       }
+
+       /* Nothing to do! */
+       clear_bit(WRITE_PENDING, &con->state);
+       dout("try_write nothing else to write.\n");
+done:
+       ret = 0;
+out:
+       dout("try_write done on %p\n", con);
+       return ret;
+}
+
+
+
+/*
+ * Read what we can from the socket.
+ *
+ * Drives the receive side of the connection: banner and connect-reply
+ * handling while CONNECTING, discarding of skipped payload (negative
+ * in_base_pos), reading the next tag, and reading/processing messages
+ * and acks.  The function loops ("more") for as long as each step makes
+ * progress.
+ *
+ * Returns 0 when there is no socket, the connection is STANDBY, or a
+ * step could not make progress; -1 on a banner/connect failure or an
+ * unknown tag; -EIO when read_partial_message() reports -EBADMSG or -EIO.
+ *
+ * NOTE(review): any other non-positive result from the read helpers
+ * (including negative ones) ends up at "done", which resets ret to 0.
+ * Confirm that dropping those errors is intended.
+ */
+static int try_read(struct ceph_connection *con)
+{
+       int ret = -1;
+
+       if (!con->sock)
+               return 0;
+
+       if (test_bit(STANDBY, &con->state))
+               return 0;
+
+       dout("try_read start on %p\n", con);
+
+more:
+       dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
+            con->in_base_pos);
+       if (test_bit(CONNECTING, &con->state)) {
+               /* the banner is only read until NEGOTIATING has been set */
+               if (!test_bit(NEGOTIATING, &con->state)) {
+                       dout("try_read connecting\n");
+                       ret = read_partial_banner(con);
+                       if (ret <= 0)
+                               goto done;
+                       if (process_banner(con) < 0) {
+                               ret = -1;
+                               goto out;
+                       }
+               }
+               ret = read_partial_connect(con);
+               if (ret <= 0)
+                       goto done;
+               if (process_connect(con) < 0) {
+                       ret = -1;
+                       goto out;
+               }
+               goto more;
+       }
+
+       if (con->in_base_pos < 0) {
+               /*
+                * skipping + discarding content.
+                *
+                * FIXME: there must be a better way to do this!
+                *
+                * -in_base_pos is the number of bytes still to discard;
+                * they are read into a scratch buffer whose contents are
+                * never looked at.
+                */
+               static char buf[1024];
+               int skip = min(1024, -con->in_base_pos);
+               dout("skipping %d / %d bytes\n", skip, -con->in_base_pos);
+               ret = ceph_tcp_recvmsg(con->sock, buf, skip);
+               if (ret <= 0)
+                       goto done;
+               con->in_base_pos += ret;
+               if (con->in_base_pos)
+                       goto more;
+       }
+       if (con->in_tag == CEPH_MSGR_TAG_READY) {
+               /*
+                * what's next?
+                */
+               ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1);
+               if (ret <= 0)
+                       goto done;
+               dout("try_read got tag %d\n", (int)con->in_tag);
+               switch (con->in_tag) {
+               case CEPH_MSGR_TAG_MSG:
+                       prepare_read_message(con);
+                       break;
+               case CEPH_MSGR_TAG_ACK:
+                       prepare_read_ack(con);
+                       break;
+               case CEPH_MSGR_TAG_CLOSE:
+                       set_bit(CLOSED, &con->state);   /* fixme */
+                       goto done;
+               default:
+                       goto bad_tag;
+               }
+       }
+       if (con->in_tag == CEPH_MSGR_TAG_MSG) {
+               ret = read_partial_message(con);
+               if (ret <= 0) {
+                       switch (ret) {
+                       case -EBADMSG:
+                               con->error_msg = "bad crc";
+                               ret = -EIO;
+                               goto out;
+                       case -EIO:
+                               con->error_msg = "io error";
+                               goto out;
+                       default:
+                               goto done;
+                       }
+               }
+               /*
+                * read_partial_message() resets in_tag to READY when it
+                * decides to skip a message: nothing to dispatch then.
+                */
+               if (con->in_tag == CEPH_MSGR_TAG_READY)
+                       goto more;
+               process_message(con);
+               goto more;
+       }
+       if (con->in_tag == CEPH_MSGR_TAG_ACK) {
+               ret = read_partial_ack(con);
+               if (ret <= 0)
+                       goto done;
+               process_ack(con);
+               goto more;
+       }
+
+done:
+       ret = 0;
+out:
+       dout("try_read done on %p\n", con);
+       return ret;
+
+bad_tag:
+       pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag);
+       con->error_msg = "protocol error, garbage tag";
+       ret = -1;
+       goto out;
+}
+
+
+/*
+ * Atomically queue work on a connection.  Bump @con reference to
+ * avoid races with connection teardown.
+ *
+ * There is some trickery going on with QUEUED and BUSY because we
+ * only want a _single_ thread operating on each connection at any
+ * point in time, but we want to use all available CPUs.
+ *
+ * The worker thread only proceeds if it can atomically set BUSY.  It
+ * clears QUEUED and does its thing.  When it thinks it's done, it
+ * clears BUSY, then rechecks QUEUED.. if it's set again, it loops
+ * (tries again to set BUSY).
+ *
+ * To queue work, we first set QUEUED, _then_ if BUSY isn't set, we
+ * try to queue work.  If that fails (work is already queued, or BUSY)
+ * we give up (work also already being done or is queued) but leave QUEUED
+ * set so that the worker thread will loop if necessary.
+ *
+ * Nothing is queued if the connection is DEAD or if ->get() fails to
+ * take a reference.
+ */
+static void queue_con(struct ceph_connection *con)
+{
+       if (test_bit(DEAD, &con->state)) {
+               dout("queue_con %p ignoring: DEAD\n",
+                    con);
+               return;
+       }
+
+       if (!con->ops->get(con)) {
+               dout("queue_con %p ref count 0\n", con);
+               return;
+       }
+
+       set_bit(QUEUED, &con->state);
+       if (test_bit(BUSY, &con->state)) {
+               /* the running worker will notice QUEUED; drop our ref */
+               dout("queue_con %p - already BUSY\n", con);
+               con->ops->put(con);
+       } else if (!queue_work(ceph_msgr_wq, &con->work.work)) {
+               /* work item was already pending; drop our ref */
+               dout("queue_con %p - already queued\n", con);
+               con->ops->put(con);
+       } else {
+               /* the reference taken above is dropped by con_work() */
+               dout("queue_con %p\n", con);
+       }
+}
+
+/*
+ * Do some work on a connection.  Drop a connection ref when we're done.
+ *
+ * Only one instance works on a connection at a time: the function backs
+ * out immediately if BUSY is already set.  Otherwise it clears QUEUED,
+ * takes con->mutex and
+ *   - closes the socket and stops if the connection is CLOSED,
+ *   - closes the socket first if OPENING was set (reopen with new peer),
+ *   - then reads and writes; if SOCK_CLOSED was set or either direction
+ *     returns an error, ceph_fault() is called with the mutex released.
+ * After clearing BUSY it loops if QUEUED was set again in the meantime,
+ * unless it just faulted and the connection is not being reopened.
+ */
+static void con_work(struct work_struct *work)
+{
+       struct ceph_connection *con = container_of(work, struct ceph_connection,
+                                                  work.work);
+       int backoff = 0;        /* set once ceph_fault() has been called */
+
+more:
+       if (test_and_set_bit(BUSY, &con->state) != 0) {
+               dout("con_work %p BUSY already set\n", con);
+               goto out;
+       }
+       dout("con_work %p start, clearing QUEUED\n", con);
+       clear_bit(QUEUED, &con->state);
+
+       mutex_lock(&con->mutex);
+
+       if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
+               dout("con_work CLOSED\n");
+               con_close_socket(con);
+               goto done;
+       }
+       if (test_and_clear_bit(OPENING, &con->state)) {
+               /* reopen w/ new peer */
+               dout("con_work OPENING\n");
+               con_close_socket(con);
+       }
+
+       /* short-circuit: reading/writing is skipped once a step fails */
+       if (test_and_clear_bit(SOCK_CLOSED, &con->state) ||
+           try_read(con) < 0 ||
+           try_write(con) < 0) {
+               /* ceph_fault() takes con->mutex itself */
+               mutex_unlock(&con->mutex);
+               backoff = 1;
+               ceph_fault(con);     /* error/fault path */
+               goto done_unlocked;
+       }
+
+done:
+       mutex_unlock(&con->mutex);
+
+done_unlocked:
+       clear_bit(BUSY, &con->state);
+       dout("con->state=%lu\n", con->state);
+       if (test_bit(QUEUED, &con->state)) {
+               if (!backoff || test_bit(OPENING, &con->state)) {
+                       dout("con_work %p QUEUED reset, looping\n", con);
+                       goto more;
+               }
+               dout("con_work %p QUEUED reset, but just faulted\n", con);
+               clear_bit(QUEUED, &con->state);
+       }
+       dout("con_work %p done\n", con);
+
+out:
+       con->ops->put(con);
+}
+
+
+/*
+ * Generic error/fault handler.  A retry mechanism is used with
+ * exponential backoff
+ *
+ * Logs con->error_msg.  For a LOSSYTX connection nothing is retried.
+ * Otherwise, unless the connection is already CLOSED, the socket is
+ * closed, a partially received message is dropped and all sent but
+ * unacknowledged messages are moved back onto the out queue.  If nothing
+ * is waiting to be sent (empty queue and no keepalive pending) the
+ * connection is parked in STANDBY; otherwise a reconnect is scheduled
+ * after a delay that starts at BASE_DELAY_INTERVAL and doubles while it
+ * is below MAX_DELAY_INTERVAL.
+ *
+ * Finally the optional ->invalidate_authorizer (only if auth_retry is
+ * set) and ->fault callbacks are invoked, without con->mutex held.
+ *
+ * Takes con->mutex itself, so it must be called with the mutex released.
+ */
+static void ceph_fault(struct ceph_connection *con)
+{
+       pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
+              ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg);
+       dout("fault %p state %lu to peer %s\n",
+            con, con->state, ceph_pr_addr(&con->peer_addr.in_addr));
+
+       if (test_bit(LOSSYTX, &con->state)) {
+               dout("fault on LOSSYTX channel\n");
+               goto out;
+       }
+
+       mutex_lock(&con->mutex);
+       if (test_bit(CLOSED, &con->state))
+               goto out_unlock;
+
+       con_close_socket(con);
+
+       if (con->in_msg) {
+               ceph_msg_put(con->in_msg);
+               con->in_msg = NULL;
+       }
+
+       /* Requeue anything that hasn't been acked */
+       list_splice_init(&con->out_sent, &con->out_queue);
+
+       /*
+        * If there are no messages in the queue, place the connection
+        * in a STANDBY state (i.e., don't try to reconnect just yet).
+        * A requested keepalive counts as pending work; it is tracked
+        * by the KEEPALIVE_PENDING state bit.
+        */
+       if (list_empty(&con->out_queue) &&
+           !test_bit(KEEPALIVE_PENDING, &con->state)) {
+               dout("fault setting STANDBY\n");
+               set_bit(STANDBY, &con->state);
+       } else {
+               /* retry after a delay. */
+               if (con->delay == 0)
+                       con->delay = BASE_DELAY_INTERVAL;
+               else if (con->delay < MAX_DELAY_INTERVAL)
+                       con->delay *= 2;
+               dout("fault queueing %p delay %lu\n", con, con->delay);
+               /* the queued work owns a reference; undo if not queued */
+               con->ops->get(con);
+               if (queue_delayed_work(ceph_msgr_wq, &con->work,
+                                      round_jiffies_relative(con->delay)) == 0)
+                       con->ops->put(con);
+       }
+
+out_unlock:
+       mutex_unlock(&con->mutex);
+out:
+       /*
+        * in case we faulted due to authentication, invalidate our
+        * current tickets so that we can get new ones.
+        */
+       if (con->auth_retry && con->ops->invalidate_authorizer) {
+               dout("calling invalidate_authorizer()\n");
+               con->ops->invalidate_authorizer(con);
+       }
+
+       if (con->ops->fault)
+               con->ops->fault(con);
+}
+
+
+
+/*
+ * Create a new messenger instance.
+ *
+ * @myaddr may be NULL; when given it seeds the messenger's own address.
+ * The address type is always reset to 0 and a random nonce is chosen.
+ * Returns the new messenger, or ERR_PTR(-ENOMEM) if either the
+ * messenger or its zero page cannot be allocated.
+ */
+struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr,
+                                            u32 supported_features,
+                                            u32 required_features)
+{
+       struct ceph_messenger *m = kzalloc(sizeof(*m), GFP_KERNEL);
+
+       if (!m)
+               goto enomem;
+
+       m->supported_features = supported_features;
+       m->required_features = required_features;
+       spin_lock_init(&m->global_seq_lock);
+
+       /* a zeroed page stands in for message data when a request is
+        * "canceled" while it is being written over the socket */
+       m->zero_page = __page_cache_alloc(GFP_KERNEL | __GFP_ZERO);
+       if (!m->zero_page)
+               goto free_msgr;
+       kmap(m->zero_page);
+
+       if (myaddr)
+               m->inst.addr = *myaddr;
+
+       /* pick a random nonce for this instance */
+       m->inst.addr.type = 0;
+       get_random_bytes(&m->inst.addr.nonce, sizeof(m->inst.addr.nonce));
+       encode_my_addr(m);
+
+       dout("messenger_create %p\n", m);
+       return m;
+
+free_msgr:
+       kfree(m);
+enomem:
+       return ERR_PTR(-ENOMEM);
+}
+EXPORT_SYMBOL(ceph_messenger_create);
+
+/*
+ * Tear down a messenger: unmap and free its zero page, then free the
+ * messenger structure itself.
+ */
+void ceph_messenger_destroy(struct ceph_messenger *msgr)
+{
+       struct page *zero = msgr->zero_page;
+
+       dout("destroy %p\n", msgr);
+       kunmap(zero);
+       __free_page(zero);
+       kfree(msgr);
+       /* msgr is only printed as a pointer value here, never dereferenced */
+       dout("destroyed messenger %p\n", msgr);
+}
+EXPORT_SYMBOL(ceph_messenger_destroy);
+
+/*
+ * Queue up an outgoing message on the given connection.
+ *
+ * If the connection is CLOSED the message is dropped (one reference is
+ * released) and nothing is queued.  Otherwise the message is stamped
+ * with our entity name as source, appended to con->out_queue under
+ * con->mutex, and the connection worker is kicked if no write was
+ * already pending.
+ *
+ * NOTE(review): presumably the caller's reference on @msg is handed
+ * over to the queue -- confirm against callers.
+ */
+void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
+{
+       /* checked without con->mutex held */
+       if (test_bit(CLOSED, &con->state)) {
+               dout("con_send %p closed, dropping %p\n", con, msg);
+               ceph_msg_put(msg);
+               return;
+       }
+
+       /* set src+dst */
+       msg->hdr.src = con->msgr->inst.name;
+
+       /* the header's front_len must already agree with the front buffer */
+       BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
+
+       msg->needs_out_seq = true;
+
+       /* queue */
+       mutex_lock(&con->mutex);
+       /* a message may only sit on one list at a time */
+       BUG_ON(!list_empty(&msg->list_head));
+       list_add_tail(&msg->list_head, &con->out_queue);
+       dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg,
+            ENTITY_NAME(con->peer_name), le16_to_cpu(msg->hdr.type),
+            ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
+            le32_to_cpu(msg->hdr.front_len),
+            le32_to_cpu(msg->hdr.middle_len),
+            le32_to_cpu(msg->hdr.data_len));
+       mutex_unlock(&con->mutex);
+
+       /* if there wasn't anything waiting to send before, queue
+        * new work */
+       if (test_and_set_bit(WRITE_PENDING, &con->state) == 0)
+               queue_con(con);
+}
+EXPORT_SYMBOL(ceph_con_send);
+
+/*
+ * Revoke a message that was previously queued for send.
+ *
+ * Under con->mutex: if @msg is linked on a list it is unlinked, and if
+ * it is the message currently being sent it is detached from the
+ * connection (when the pending kvec belongs to the message, the
+ * remaining kvec byte count is recorded in con->out_skip).  Each of
+ * the two cases releases one reference on @msg.
+ *
+ * hdr.seq is cleared *before* the reference is released:
+ * ceph_msg_put() may drop the last reference and free the message, so
+ * @msg must not be written to afterwards.
+ */
+void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
+{
+       mutex_lock(&con->mutex);
+       if (!list_empty(&msg->list_head)) {
+               dout("con_revoke %p msg %p - was on queue\n", con, msg);
+               list_del_init(&msg->list_head);
+               msg->hdr.seq = 0;
+               ceph_msg_put(msg);
+       }
+       /* pointer comparison only; safe even if the put above freed msg */
+       if (con->out_msg == msg) {
+               dout("con_revoke %p msg %p - was sending\n", con, msg);
+               con->out_msg = NULL;
+               if (con->out_kvec_is_msg) {
+                       con->out_skip = con->out_kvec_bytes;
+                       con->out_kvec_is_msg = false;
+               }
+               msg->hdr.seq = 0;
+               ceph_msg_put(msg);
+       }
+       mutex_unlock(&con->mutex);
+}
+
+/*
+ * Revoke a message that we may be reading data into.
+ *
+ * If @msg is the connection's current incoming message, the read
+ * position is rewound by the full on-wire size of the message (header,
+ * front, middle, data, footer), the message reference is released and
+ * the connection goes back to waiting for a tag, counting the message
+ * as received.  Otherwise this is a no-op.
+ */
+void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)
+{
+       unsigned front_len, middle_len, data_len;
+
+       mutex_lock(&con->mutex);
+       if (!con->in_msg || con->in_msg != msg) {
+               dout("con_revoke_pages %p msg %p pages %p no-op\n",
+                    con, con->in_msg, msg);
+               goto unlock;
+       }
+
+       front_len = le32_to_cpu(con->in_hdr.front_len);
+       middle_len = le32_to_cpu(con->in_hdr.middle_len);
+       data_len = le32_to_cpu(con->in_hdr.data_len);
+
+       /* skip rest of message */
+       dout("con_revoke_pages %p msg %p revoked\n", con, msg);
+       con->in_base_pos = con->in_base_pos -
+               sizeof(struct ceph_msg_header) -
+               front_len -
+               middle_len -
+               data_len -
+               sizeof(struct ceph_msg_footer);
+       ceph_msg_put(con->in_msg);
+       con->in_msg = NULL;
+       con->in_tag = CEPH_MSGR_TAG_READY;
+       con->in_seq++;
+
+unlock:
+       mutex_unlock(&con->mutex);
+}
+
+/*
+ * Queue a keepalive byte to ensure the tcp connection is alive.
+ *
+ * The connection worker is only kicked when neither a keepalive nor a
+ * write was already pending.
+ */
+void ceph_con_keepalive(struct ceph_connection *con)
+{
+       /* a keepalive is already pending: nothing more to do */
+       if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) != 0)
+               return;
+       /* a write is already pending, so no new work needs queueing */
+       if (test_and_set_bit(WRITE_PENDING, &con->state) != 0)
+               return;
+       queue_con(con);
+}
+EXPORT_SYMBOL(ceph_con_keepalive);
+
+
+/*
+ * Construct a new message with the given type and front size.
+ * The new msg has a ref count of 1.
+ *
+ * @type:      message type (stored little-endian in the header)
+ * @front_len: size in bytes of the front buffer to allocate (may be 0)
+ * @flags:     allocation flags for the message and its front buffer
+ *
+ * A front larger than PAGE_CACHE_SIZE is vmalloc'd, anything smaller is
+ * kmalloc'd.  Returns the new message, or NULL if allocation fails.
+ */
+struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags)
+{
+       struct ceph_msg *m;
+
+       m = kmalloc(sizeof(*m), flags);
+       if (m == NULL)
+               goto out;
+       kref_init(&m->kref);
+       INIT_LIST_HEAD(&m->list_head);
+
+       m->hdr.tid = 0;
+       m->hdr.type = cpu_to_le16(type);
+       m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);
+       m->hdr.version = 0;
+       m->hdr.front_len = cpu_to_le32(front_len);
+       m->hdr.middle_len = 0;
+       m->hdr.data_len = 0;
+       m->hdr.data_off = 0;
+       m->hdr.reserved = 0;
+       m->footer.front_crc = 0;
+       m->footer.middle_crc = 0;
+       m->footer.data_crc = 0;
+       m->footer.flags = 0;
+       m->front_max = front_len;
+       m->front_is_vmalloc = false;
+       m->more_to_follow = false;
+       m->pool = NULL;
+
+       /*
+        * The struct came from kmalloc() and is not zeroed.  Everything
+        * the release path (ceph_msg_put -> ceph_msg_last_put) looks at
+        * must therefore be initialised before the first point where we
+        * can bail out through ceph_msg_put() below.
+        */
+
+       /* middle */
+       m->middle = NULL;
+
+       /* data */
+       m->nr_pages = 0;
+       m->pages = NULL;
+       m->pagelist = NULL;
+       m->bio = NULL;
+       m->bio_iter = NULL;
+       m->bio_seg = 0;
+       m->trail = NULL;
+
+       /* front */
+       if (front_len) {
+               if (front_len > PAGE_CACHE_SIZE) {
+                       m->front.iov_base = __vmalloc(front_len, flags,
+                                                     PAGE_KERNEL);
+                       m->front_is_vmalloc = true;
+               } else {
+                       m->front.iov_base = kmalloc(front_len, flags);
+               }
+               if (m->front.iov_base == NULL) {
+                       pr_err("msg_new can't allocate %d bytes\n",
+                            front_len);
+                       goto out2;
+               }
+       } else {
+               m->front.iov_base = NULL;
+       }
+       m->front.iov_len = front_len;
+
+       dout("ceph_msg_new %p front %d\n", m, front_len);
+       return m;
+
+out2:
+       /* front.iov_base is NULL here, so freeing it is a no-op */
+       ceph_msg_put(m);
+out:
+       pr_err("msg_new can't create type %d front %d\n", type, front_len);
+       return NULL;
+}
+EXPORT_SYMBOL(ceph_msg_new);
+
+/*
+ * Allocate the "middle" portion of a message when it is needed and
+ * alloc_msg did not already provide one.  Deferring this lets us read a
+ * small fixed-size per-type header in the front first and then fail
+ * gracefully (i.e., propagate the error to the caller based on info in
+ * the front) when the middle is too large.
+ *
+ * Returns 0 on success or -ENOMEM.  The header must carry a non-zero
+ * middle_len and the message must not have a middle yet.
+ */
+static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
+{
+       int middle_len = le32_to_cpu(msg->hdr.middle_len);
+       int type = le16_to_cpu(msg->hdr.type);
+
+       dout("alloc_middle %p type %d %s middle_len %d\n", msg, type,
+            ceph_msg_type_name(type), middle_len);
+       BUG_ON(!middle_len);
+       BUG_ON(msg->middle);
+
+       msg->middle = ceph_buffer_new(middle_len, GFP_NOFS);
+       return msg->middle ? 0 : -ENOMEM;
+}
+
+/*
+ * Generic message allocator, for incoming messages.
+ *
+ * If the connection provides an alloc_msg op it is asked for the
+ * message; otherwise a plain message sized from the header is created.
+ * The received header (con->in_hdr) is copied into the result and a
+ * middle buffer is allocated if the header announces one and the
+ * message has none yet.
+ *
+ * Returns the message, or NULL on allocation failure or when the op
+ * set *skip.
+ *
+ * NOTE(review): con->mutex is released around the alloc_msg callback
+ * and re-taken afterwards, so the caller is expected to hold it on
+ * entry -- confirm against the call site.
+ */
+static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
+                               struct ceph_msg_header *hdr,
+                               int *skip)
+{
+       int type = le16_to_cpu(hdr->type);
+       int front_len = le32_to_cpu(hdr->front_len);
+       int middle_len = le32_to_cpu(hdr->middle_len);
+       struct ceph_msg *msg = NULL;
+       int ret;
+
+       if (con->ops->alloc_msg) {
+               /* call out without the connection mutex held */
+               mutex_unlock(&con->mutex);
+               msg = con->ops->alloc_msg(con, hdr, skip);
+               mutex_lock(&con->mutex);
+               /* when an op is installed, its failure (or a skip request)
+                * is final: there is no fallback to the generic path */
+               if (!msg || *skip)
+                       return NULL;
+       }
+       /* only reached with msg == NULL when no alloc_msg op exists */
+       if (!msg) {
+               *skip = 0;
+               msg = ceph_msg_new(type, front_len, GFP_NOFS);
+               if (!msg) {
+                       pr_err("unable to allocate msg type %d len %d\n",
+                              type, front_len);
+                       return NULL;
+               }
+       }
+       /* the header is taken from con->in_hdr, not from the hdr argument */
+       memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
+
+       if (middle_len && !msg->middle) {
+               ret = ceph_alloc_middle(con, msg);
+               if (ret < 0) {
+                       ceph_msg_put(msg);
+                       return NULL;
+               }
+       }
+
+       return msg;
+}
+
+
+/*
+ * Free a generically kmalloc'd message: release the front buffer with
+ * the allocator that matches front_is_vmalloc, then the message itself.
+ */
+void ceph_msg_kfree(struct ceph_msg *m)
+{
+       void *front = m->front.iov_base;
+
+       dout("msg_kfree %p\n", m);
+       if (!m->front_is_vmalloc)
+               kfree(front);
+       else
+               vfree(front);
+       kfree(m);
+}
+
+/*
+ * Drop a msg ref.  Destroy as needed.
+ *
+ * Takes the kref embedded in a ceph_msg (presumably this is the kref
+ * release callback run when the last reference goes away -- confirm in
+ * the ceph_msg_put() definition).  Releases the middle buffer and the
+ * pagelist, then either hands the message back to its msgpool or frees
+ * it with ceph_msg_kfree().
+ */
+void ceph_msg_last_put(struct kref *kref)
+{
+       struct ceph_msg *m = container_of(kref, struct ceph_msg, kref);
+
+       dout("ceph_msg_put last one on %p\n", m);
+       /* a message being destroyed should no longer be on any list */
+       WARN_ON(!list_empty(&m->list_head));
+
+       /* drop middle, data, if any */
+       if (m->middle) {
+               ceph_buffer_put(m->middle);
+               m->middle = NULL;
+       }
+       /* the page array is only forgotten here, not freed */
+       m->nr_pages = 0;
+       m->pages = NULL;
+
+       if (m->pagelist) {
+               ceph_pagelist_release(m->pagelist);
+               kfree(m->pagelist);
+               m->pagelist = NULL;
+       }
+
+       m->trail = NULL;
+
+       /* pooled messages go back to their pool; others are freed outright */
+       if (m->pool)
+               ceph_msgpool_put(m->pool, m);
+       else
+               ceph_msg_kfree(m);
+}
+EXPORT_SYMBOL(ceph_msg_last_put);
+
+/*
+ * Debug helper: hex-dump a message's header, front, middle (if present)
+ * and footer at KERN_DEBUG level.  The data pages are not dumped.
+ */
+void ceph_msg_dump(struct ceph_msg *msg)
+{
+       pr_debug("msg_dump %p (front_max %d nr_pages %d)\n", msg,
+                msg->front_max, msg->nr_pages);
+       print_hex_dump(KERN_DEBUG, "header: ",
+                      DUMP_PREFIX_OFFSET, 16, 1,
+                      &msg->hdr, sizeof(msg->hdr), true);
+       print_hex_dump(KERN_DEBUG, " front: ",
+                      DUMP_PREFIX_OFFSET, 16, 1,
+                      msg->front.iov_base, msg->front.iov_len, true);
+       if (msg->middle)
+               print_hex_dump(KERN_DEBUG, "middle: ",
+                              DUMP_PREFIX_OFFSET, 16, 1,
+                              msg->middle->vec.iov_base,
+                              msg->middle->vec.iov_len, true);
+       print_hex_dump(KERN_DEBUG, "footer: ",
+                      DUMP_PREFIX_OFFSET, 16, 1,
+                      &msg->footer, sizeof(msg->footer), true);
+}
+EXPORT_SYMBOL(ceph_msg_dump);
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c

new file mode 100644 (file)

index 0000000..8a07939
--- /dev/null
+++ b/net/ceph/mon_client.c
@@ -0,0 +1,1027 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/sched.h>
+
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/decode.h>
+
+#include <linux/ceph/auth.h>
+
+/*
+ * Interact with Ceph monitor cluster.  Handle requests for new map
+ * versions, and periodically resend as needed.  Also implement
+ * statfs() and umount().
+ *
+ * A small cluster of Ceph "monitors" is responsible for managing critical
+ * cluster configuration and state information.  An odd number (e.g., 3, 5)
+ * of cmon daemons use a modified version of the Paxos part-time parliament
+ * algorithm to manage the MDS map (mds cluster membership), OSD map, and
+ * list of clients who have mounted the file system.
+ *
+ * We maintain an open, active session with a monitor at all times in order to
+ * receive timely MDSMap updates.  We periodically send a keepalive byte on the
+ * TCP socket to ensure we detect a failure.  If the connection does break, we
+ * randomly hunt for a new monitor.  Once the connection is reestablished, we
+ * resend any outstanding requests.
+ */
+
+static const struct ceph_connection_operations mon_con_ops;
+
+static int __validate_auth(struct ceph_mon_client *monc);
+
+/*
+ * Decode a monmap blob (e.g., during mount).
+ *
+ * @p:   start of the encoded blob
+ * @end: one past the last valid byte
+ *
+ * Wire layout as consumed below: u32 length, u16 version, fsid,
+ * u32 epoch, u32 num_mon, then num_mon monitor instances.
+ *
+ * Returns a newly kmalloc'd monmap (caller frees with kfree), or
+ * ERR_PTR(-EINVAL) if the blob is truncated or names CEPH_MAX_MON or
+ * more monitors, or ERR_PTR(-ENOMEM).
+ */
+struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
+{
+       struct ceph_monmap *m = NULL;
+       int i, err = -EINVAL;
+       struct ceph_fsid fsid;
+       u32 epoch, num_mon;
+       u16 version;
+       u32 len;
+
+       /* the *_safe and *_need helpers jump to "bad" on short input */
+       ceph_decode_32_safe(&p, end, len, bad);
+       ceph_decode_need(&p, end, len, bad);
+
+       dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p));
+
+       /* version is decoded to advance p but is not used afterwards */
+       ceph_decode_16_safe(&p, end, version, bad);
+
+       ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
+       ceph_decode_copy(&p, &fsid, sizeof(fsid));
+       epoch = ceph_decode_32(&p);
+
+       num_mon = ceph_decode_32(&p);
+       ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad);
+
+       /* reject oversized maps before allocating or copying anything */
+       if (num_mon >= CEPH_MAX_MON)
+               goto bad;
+       m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS);
+       if (m == NULL)
+               return ERR_PTR(-ENOMEM);
+       m->fsid = fsid;
+       m->epoch = epoch;
+       m->num_mon = num_mon;
+       ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0]));
+       /* fix up each copied address in place */
+       for (i = 0; i < num_mon; i++)
+               ceph_decode_addr(&m->mon_inst[i].addr);
+
+       dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
+            m->num_mon);
+       for (i = 0; i < m->num_mon; i++)
+               dout("monmap_decode  mon%d is %s\n", i,
+                    ceph_pr_addr(&m->mon_inst[i].addr.in_addr));
+       return m;
+
+bad:
+       dout("monmap_decode failed with %d\n", err);
+       /* every jump to "bad" happens before the allocation, so m is NULL */
+       kfree(m);
+       return ERR_PTR(err);
+}
+
+/*
+ * Return 1 if *addr matches (byte for byte) the address of any monitor
+ * in the monmap, 0 otherwise.
+ */
+int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr)
+{
+       int i = 0;
+
+       while (i < m->num_mon) {
+               if (!memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)))
+                       return 1;
+               i++;
+       }
+       return 0;
+}
+
+/*
+ * Send an auth request.
+ *
+ * The payload has already been written into monc->m_auth's front
+ * buffer; @len is its length.  Marks an auth exchange as pending and
+ * (re)queues the single preallocated m_auth message on the monitor
+ * connection.
+ *
+ * NOTE(review): @len is used as-is; a negative value from the caller
+ * would be stored as the front length -- confirm callers check it.
+ */
+static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
+{
+       monc->pending_auth = 1;
+       monc->m_auth->front.iov_len = len;
+       monc->m_auth->hdr.front_len = cpu_to_le32(len);
+       /* pull back any earlier queued copy before queueing it again */
+       ceph_con_revoke(monc->con, monc->m_auth);
+       ceph_msg_get(monc->m_auth);  /* keep our ref */
+       ceph_con_send(monc->con, monc->m_auth);
+}
+
+/*
+ * Close monitor session, if any: revoke the queued auth message, close
+ * the connection and reset the session/auth state.  Does nothing when
+ * no connection has been set up.
+ */
+static void __close_session(struct ceph_mon_client *monc)
+{
+       if (!monc->con)
+               return;
+
+       dout("__close_session closing mon%d\n", monc->cur_mon);
+       ceph_con_revoke(monc->con, monc->m_auth);
+       ceph_con_close(monc->con);
+       monc->cur_mon = -1;
+       monc->pending_auth = 0;
+       ceph_auth_reset(monc->auth);
+}
+
+/*
+ * Open a session with a (new) monitor.
+ *
+ * When no monitor is currently selected, pick one at random from the
+ * monmap, open the connection to it and send the initial auth hello.
+ * Always returns 0.
+ */
+static int __open_session(struct ceph_mon_client *monc)
+{
+       char r;
+       int ret;
+
+       if (monc->cur_mon >= 0) {
+               dout("open_session mon%d already open\n", monc->cur_mon);
+               return 0;
+       }
+
+       get_random_bytes(&r, 1);
+       monc->cur_mon = r % monc->monmap->num_mon;
+       dout("open_session num=%d r=%d -> mon%d\n",
+            monc->monmap->num_mon, r, monc->cur_mon);
+       monc->sub_sent = 0;
+       monc->sub_renew_after = jiffies;  /* i.e., expired */
+       /* collapse "requested" (2) back to "wanted" (1) for the new session */
+       monc->want_next_osdmap = !!monc->want_next_osdmap;
+
+       dout("open_session mon%d opening\n", monc->cur_mon);
+       monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON;
+       monc->con->peer_name.num = cpu_to_le64(monc->cur_mon);
+       ceph_con_open(monc->con,
+                     &monc->monmap->mon_inst[monc->cur_mon].addr);
+
+       /* initiate authentication handshake */
+       ret = ceph_auth_build_hello(monc->auth,
+                                   monc->m_auth->front.iov_base,
+                                   monc->m_auth->front_max);
+       __send_prepared_auth_request(monc, ret);
+       return 0;
+}
+
+/*
+ * True once jiffies has reached monc->sub_renew_after, i.e. the current
+ * subscription is due for renewal.
+ */
+static bool __sub_expired(struct ceph_mon_client *monc)
+{
+       return time_after_eq(jiffies, monc->sub_renew_after);
+}
+
+/*
+ * Reschedule delayed work timer: 10 seconds out while there is no
+ * current monitor or the subscription has expired, 20 seconds otherwise.
+ */
+static void __schedule_delayed(struct ceph_mon_client *monc)
+{
+       bool soon = monc->cur_mon < 0 || __sub_expired(monc);
+       unsigned delay = soon ? 10 * HZ : 20 * HZ;
+
+       dout("__schedule_delayed after %u\n", delay);
+       schedule_delayed_work(&monc->delayed_work, delay);
+}
+
+/*
+ * Send subscribe request for mdsmap and/or osdmap.
+ *
+ * A request goes out when the subscription has expired and none is
+ * outstanding, or when an osdmap is wanted but not yet requested
+ * (want_next_osdmap == 1).  The preallocated monc->m_subscribe message
+ * is re-encoded in place and (re)queued on the monitor connection.
+ * A "monmap" item is always included.
+ */
+static void __send_subscribe(struct ceph_mon_client *monc)
+{
+       dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n",
+            (unsigned)monc->sub_sent, __sub_expired(monc),
+            monc->want_next_osdmap);
+       if ((__sub_expired(monc) && !monc->sub_sent) ||
+           monc->want_next_osdmap == 1) {
+               struct ceph_msg *msg = monc->m_subscribe;
+               struct ceph_mon_subscribe_item *i;
+               void *p, *end;
+               int num;
+
+               p = msg->front.iov_base;
+               end = p + msg->front_max;
+
+               /* item count: monmap plus whichever optional maps we want */
+               num = 1 + !!monc->want_next_osdmap + !!monc->want_mdsmap;
+               ceph_encode_32(&p, num);
+
+               /* each item is a name string followed by a fixed-size
+                * ceph_mon_subscribe_item written directly at p
+                * NOTE(review): the item writes are not checked against
+                * end -- presumably front_max is sized for the worst case */
+               if (monc->want_next_osdmap) {
+                       dout("__send_subscribe to 'osdmap' %u\n",
+                            (unsigned)monc->have_osdmap);
+                       ceph_encode_string(&p, end, "osdmap", 6);
+                       i = p;
+                       i->have = cpu_to_le64(monc->have_osdmap);
+                       i->onetime = 1;
+                       p += sizeof(*i);
+                       monc->want_next_osdmap = 2;  /* requested */
+               }
+               if (monc->want_mdsmap) {
+                       dout("__send_subscribe to 'mdsmap' %u+\n",
+                            (unsigned)monc->have_mdsmap);
+                       ceph_encode_string(&p, end, "mdsmap", 6);
+                       i = p;
+                       i->have = cpu_to_le64(monc->have_mdsmap);
+                       i->onetime = 0;
+                       p += sizeof(*i);
+               }
+               ceph_encode_string(&p, end, "monmap", 6);
+               i = p;
+               i->have = 0;
+               i->onetime = 0;
+               p += sizeof(*i);
+
+               /* front length = bytes actually encoded */
+               msg->front.iov_len = p - msg->front.iov_base;
+               msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+               /* pull back any copy still queued, then send with a new ref */
+               ceph_con_revoke(monc->con, msg);
+               ceph_con_send(monc->con, ceph_msg_get(msg));
+
+               /* 0 is reserved to mean "no subscribe outstanding" */
+               monc->sub_sent = jiffies | 1;  /* never 0 */
+       }
+}
+
+/*
+ * Handle the monitor's subscribe ack: note that the session is
+ * established (ending any hunt) and compute when the subscription must
+ * be renewed.  A message too short to hold the ack structure is logged
+ * and dumped.
+ */
+static void handle_subscribe_ack(struct ceph_mon_client *monc,
+                                struct ceph_msg *msg)
+{
+       unsigned seconds;
+       struct ceph_mon_subscribe_ack *h = msg->front.iov_base;
+
+       /* validate the length before reading any field through h */
+       if (msg->front.iov_len < sizeof(*h))
+               goto bad;
+       seconds = le32_to_cpu(h->duration);
+
+       mutex_lock(&monc->mutex);
+       if (monc->hunting) {
+               pr_info("mon%d %s session established\n",
+                       monc->cur_mon,
+                       ceph_pr_addr(&monc->con->peer_addr.in_addr));
+               monc->hunting = false;
+       }
+       dout("handle_subscribe_ack after %d seconds\n", seconds);
+       /* renew once half of the granted duration has passed since we sent */
+       monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1;
+       /* no subscribe outstanding any more */
+       monc->sub_sent = 0;
+       mutex_unlock(&monc->mutex);
+       return;
+bad:
+       pr_err("got corrupt subscribe-ack msg\n");
+       ceph_msg_dump(msg);
+}
+
+/*
+ * Keep track of which maps we have.
+ *
+ * Records @got as the mdsmap value we hold (it is sent back to the
+ * monitor as the "have" field on the next subscribe).  Always returns 0.
+ */
+int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got)
+{
+       mutex_lock(&monc->mutex);
+       monc->have_mdsmap = got;
+       mutex_unlock(&monc->mutex);
+       return 0;
+}
+EXPORT_SYMBOL(ceph_monc_got_mdsmap);
+
+/*
+ * Record @got as the osdmap value we hold and clear the "want next
+ * osdmap" state.  Always returns 0.
+ */
+int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got)
+{
+       mutex_lock(&monc->mutex);
+       monc->have_osdmap = got;
+       monc->want_next_osdmap = 0;
+       mutex_unlock(&monc->mutex);
+       return 0;
+}
+
+/*
+ * Register interest in the next osdmap.
+ *
+ * want_next_osdmap is moved from 0 (not wanted) to 1 (wanted); a
+ * subscribe is attempted unless one has already been requested (2).
+ */
+void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
+{
+       /* debug print only; have_osdmap is read here without the mutex */
+       dout("request_next_osdmap have %u\n", monc->have_osdmap);
+       mutex_lock(&monc->mutex);
+       if (!monc->want_next_osdmap)
+               monc->want_next_osdmap = 1;
+       if (monc->want_next_osdmap < 2)
+               __send_subscribe(monc);
+       mutex_unlock(&monc->mutex);
+}
+
+/*
+ * Open a monitor session, allocating and initialising the monitor
+ * connection on first use, and arm the delayed-work timer.
+ *
+ * Returns 0 on success or -ENOMEM if the connection cannot be
+ * allocated.
+ */
+int ceph_monc_open_session(struct ceph_mon_client *monc)
+{
+       /* NOTE(review): the connection is created before monc->mutex is
+        * taken -- presumably callers do not race here; confirm */
+       if (!monc->con) {
+               monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL);
+               if (!monc->con)
+                       return -ENOMEM;
+               ceph_con_init(monc->client->msgr, monc->con);
+               monc->con->private = monc;
+               monc->con->ops = &mon_con_ops;
+       }
+
+       mutex_lock(&monc->mutex);
+       /* return value ignored: __open_session() always returns 0 */
+       __open_session(monc);
+       __schedule_delayed(monc);
+       mutex_unlock(&monc->mutex);
+       return 0;
+}
+EXPORT_SYMBOL(ceph_monc_open_session);
+
+/*
+ * Handle an incoming monmap message: decode it, check that its fsid is
+ * acceptable, install it as the current monmap and free the previous
+ * one.  Waiters on client->auth_wq are woken whether or not the new map
+ * was accepted.
+ */
+static void ceph_monc_handle_map(struct ceph_mon_client *monc,
+                                struct ceph_msg *msg)
+{
+       struct ceph_client *client = monc->client;
+       /* NOTE(review): "old" is sampled before monc->mutex is taken --
+        * presumably safe because maps are only replaced here; confirm */
+       struct ceph_monmap *monmap = NULL, *old = monc->monmap;
+       void *p, *end;
+
+       mutex_lock(&monc->mutex);
+
+       dout("handle_monmap\n");
+       p = msg->front.iov_base;
+       end = p + msg->front.iov_len;
+
+       monmap = ceph_monmap_decode(p, end);
+       if (IS_ERR(monmap)) {
+               pr_err("problem decoding monmap, %d\n",
+                      (int)PTR_ERR(monmap));
+               goto out;
+       }
+
+       /* a map whose fsid fails the check is discarded */
+       if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) {
+               kfree(monmap);
+               goto out;
+       }
+
+       /* written via client->monc; presumably the same object as monc */
+       client->monc.monmap = monmap;
+       kfree(old);
+
+out:
+       mutex_unlock(&monc->mutex);
+       wake_up_all(&client->auth_wq);
+}
+
+/*
+ * generic requests (e.g., statfs, poolop)
+ */
+/*
+ * Find the pending generic request with the given tid in the monitor
+ * client's rbtree.  Returns NULL when there is none.
+ */
+static struct ceph_mon_generic_request *__lookup_generic_req(
+       struct ceph_mon_client *monc, u64 tid)
+{
+       struct rb_node *cur;
+
+       for (cur = monc->generic_request_tree.rb_node; cur; ) {
+               struct ceph_mon_generic_request *candidate =
+                       rb_entry(cur, struct ceph_mon_generic_request, node);
+
+               if (tid == candidate->tid)
+                       return candidate;
+               /* smaller tids live to the left, larger to the right */
+               cur = (tid < candidate->tid) ? cur->rb_left : cur->rb_right;
+       }
+       return NULL;
+}
+
+/*
+ * Link a new generic request into the rbtree, keyed by tid.  Inserting
+ * a tid that is already present is a bug.
+ */
+static void __insert_generic_request(struct ceph_mon_client *monc,
+                           struct ceph_mon_generic_request *new)
+{
+       struct rb_node **link = &monc->generic_request_tree.rb_node;
+       struct rb_node *parent = NULL;
+
+       /* walk down to the empty slot where the new node belongs */
+       while (*link) {
+               struct ceph_mon_generic_request *cur;
+
+               parent = *link;
+               cur = rb_entry(parent, struct ceph_mon_generic_request, node);
+               if (new->tid == cur->tid)
+                       BUG();
+               link = (new->tid < cur->tid) ? &parent->rb_left
+                                            : &parent->rb_right;
+       }
+
+       rb_link_node(&new->node, parent, link);
+       rb_insert_color(&new->node, &monc->generic_request_tree);
+}
+
+/*
+ * kref release function for a generic request: drop the references on
+ * the reply and request messages (either may be absent) and free the
+ * request itself.
+ */
+static void release_generic_request(struct kref *kref)
+{
+       struct ceph_mon_generic_request *req =
+               container_of(kref, struct ceph_mon_generic_request, kref);
+
+       if (req->reply)
+               ceph_msg_put(req->reply);
+       if (req->request)
+               ceph_msg_put(req->request);
+
+       kfree(req);
+}
+
+/* Drop one reference; the request is released when the count hits zero. */
+static void put_generic_request(struct ceph_mon_generic_request *req)
+{
+       kref_put(&req->kref, release_generic_request);
+}
+
+/* Take an additional reference on a generic request. */
+static void get_generic_request(struct ceph_mon_generic_request *req)
+{
+       kref_get(&req->kref);
+}
+
+/*
+ * Find the preallocated reply message for an incoming generic reply,
+ * matched by the tid in @hdr.
+ *
+ * Returns the request's reply message with an extra reference taken,
+ * or NULL with *skip set to 1 when no request with that tid is pending.
+ */
+static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
+                                        struct ceph_msg_header *hdr,
+                                        int *skip)
+{
+       struct ceph_mon_client *monc = con->private;
+       u64 tid = le64_to_cpu(hdr->tid);
+       struct ceph_mon_generic_request *req;
+       struct ceph_msg *reply = NULL;
+
+       mutex_lock(&monc->mutex);
+       req = __lookup_generic_req(monc, tid);
+       if (req) {
+               dout("get_generic_reply %lld got %p\n", tid, req->reply);
+               /*
+                * no need to remember which connection is reading into
+                * this reply: only one connection is ever open at a
+                * time.
+                */
+               reply = ceph_msg_get(req->reply);
+       } else {
+               dout("get_generic_reply %lld dne\n", tid);
+               *skip = 1;
+       }
+       mutex_unlock(&monc->mutex);
+       return reply;
+}
+
+/*
+ * Register a generic request, send it to the monitor and wait
+ * (interruptibly) for its completion.
+ *
+ * Returns req->result once the request completes, or the non-zero value
+ * from wait_for_completion_interruptible() if the wait was interrupted.
+ * The request is removed from the tree in either case.
+ */
+static int do_generic_request(struct ceph_mon_client *monc,
+                             struct ceph_mon_generic_request *req)
+{
+       int err;
+
+       /* register request */
+       mutex_lock(&monc->mutex);
+       req->tid = ++monc->last_tid;
+       req->request->hdr.tid = cpu_to_le64(req->tid);
+       __insert_generic_request(monc, req);
+       monc->num_generic_requests++;
+       /* the connection gets its own reference on the request message */
+       ceph_con_send(monc->con, ceph_msg_get(req->request));
+       mutex_unlock(&monc->mutex);
+
+       /* sleep without the mutex so reply handlers can take it */
+       err = wait_for_completion_interruptible(&req->completion);
+
+       /* unregister, whether we completed or were interrupted */
+       mutex_lock(&monc->mutex);
+       rb_erase(&req->node, &monc->generic_request_tree);
+       monc->num_generic_requests--;
+       mutex_unlock(&monc->mutex);
+
+       if (!err)
+               err = req->result;
+       return err;
+}
+
+/*
+ * statfs
+ *
+ * Handle a statfs reply: look up the pending request by tid, copy the
+ * returned statistics into the requester's buffer and wake the waiter.
+ * A reply whose front is not exactly sizeof(struct
+ * ceph_mon_statfs_reply) is logged and dumped; a reply for an unknown
+ * tid is silently ignored.
+ */
+static void handle_statfs_reply(struct ceph_mon_client *monc,
+                               struct ceph_msg *msg)
+{
+       struct ceph_mon_generic_request *req;
+       struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
+       u64 tid = le64_to_cpu(msg->hdr.tid);
+
+       if (msg->front.iov_len != sizeof(*reply))
+               goto bad;
+       dout("handle_statfs_reply %p tid %llu\n", msg, tid);
+
+       mutex_lock(&monc->mutex);
+       req = __lookup_generic_req(monc, tid);
+       if (req) {
+               *(struct ceph_statfs *)req->buf = reply->st;
+               req->result = 0;
+               /* hold a ref so req stays valid after the mutex is dropped */
+               get_generic_request(req);
+       }
+       mutex_unlock(&monc->mutex);
+       if (req) {
+               complete_all(&req->completion);
+               put_generic_request(req);
+       }
+       return;
+
+bad:
+       pr_err("corrupt generic reply, tid %llu\n", tid);
+       ceph_msg_dump(msg);
+}
+
+/*
+ * Do a synchronous statfs().
+ *
+ * @buf receives the statistics on success.  Returns 0 on success,
+ * -ENOMEM if the request or its messages cannot be allocated, or the
+ * error from do_generic_request() (e.g. an interrupted wait).
+ */
+int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
+{
+       struct ceph_mon_generic_request *req;
+       struct ceph_mon_statfs *h;
+       int err;
+
+       /* zeroed, so request/reply start out NULL for the release path */
+       req = kzalloc(sizeof(*req), GFP_NOFS);
+       if (!req)
+               return -ENOMEM;
+
+       kref_init(&req->kref);
+       req->buf = buf;
+       req->buf_len = sizeof(*buf);
+       init_completion(&req->completion);
+
+       err = -ENOMEM;
+       req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS);
+       if (!req->request)
+               goto out;
+       /* the reply is preallocated with a fixed 1024-byte front */
+       req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS);
+       if (!req->reply)
+               goto out;
+
+       /* fill out request */
+       h = req->request->front.iov_base;
+       h->monhdr.have_version = 0;
+       h->monhdr.session_mon = cpu_to_le16(-1);
+       h->monhdr.session_mon_tid = 0;
+       h->fsid = monc->monmap->fsid;
+
+       err = do_generic_request(monc, req);
+
+out:
+       /* drop the initial reference (success and failure alike) */
+       kref_put(&req->kref, release_generic_request);
+       return err;
+}
+EXPORT_SYMBOL(ceph_monc_do_statfs);
+
+/*
+ * pool ops
+ */
+/*
+ * Extract the payload of a pool-op reply.
+ *
+ * @src must hold exactly a little-endian u32 length followed by that
+ * many bytes, and the length must equal @dst_len; the payload is then
+ * copied verbatim into @dst.  Returns 0 on success, -EINVAL on any
+ * size mismatch.
+ */
+static int get_poolop_reply_buf(const char *src, size_t src_len,
+                               char *dst, size_t dst_len)
+{
+       int ret = -EINVAL;
+
+       /* only look at the length prefix once we know src is big enough */
+       if (src_len == sizeof(u32) + dst_len) {
+               u32 buf_len = le32_to_cpu(*(u32 *)src);
+
+               if (buf_len == dst_len) {
+                       memcpy(dst, src + sizeof(u32), dst_len);
+                       ret = 0;
+               }
+       }
+       return ret;
+}
+
+/*
+ * Handle a pool-op reply: look up the pending request by tid, copy any
+ * expected payload into the requester's buffer, record the reply code
+ * as the result and wake the waiter.  Malformed replies are logged and
+ * dumped; a reply for an unknown tid is silently ignored.
+ */
+static void handle_poolop_reply(struct ceph_mon_client *monc,
+                               struct ceph_msg *msg)
+{
+       struct ceph_mon_generic_request *req;
+       struct ceph_mon_poolop_reply *reply = msg->front.iov_base;
+       u64 tid = le64_to_cpu(msg->hdr.tid);
+
+       if (msg->front.iov_len < sizeof(*reply))
+               goto bad;
+       dout("handle_poolop_reply %p tid %llu\n", msg, tid);
+
+       mutex_lock(&monc->mutex);
+       req = __lookup_generic_req(monc, tid);
+       if (req) {
+               /* a payload is only expected when the caller gave a buffer;
+                * it follows the fixed reply structure in the front */
+               if (req->buf_len &&
+                   get_poolop_reply_buf(msg->front.iov_base + sizeof(*reply),
+                                    msg->front.iov_len - sizeof(*reply),
+                                    req->buf, req->buf_len) < 0) {
+                       /* NOTE(review): the request is not completed on
+                        * this path, so its waiter keeps waiting */
+                       mutex_unlock(&monc->mutex);
+                       goto bad;
+               }
+               req->result = le32_to_cpu(reply->reply_code);
+               /* hold a ref so req stays valid after the mutex is dropped */
+               get_generic_request(req);
+       }
+       mutex_unlock(&monc->mutex);
+       if (req) {
+               complete(&req->completion);
+               put_generic_request(req);
+       }
+       return;
+
+bad:
+       pr_err("corrupt generic reply, tid %llu\n", tid);
+       ceph_msg_dump(msg);
+}
+
+/*
+ * Do a synchronous pool op.
+ *
+ * @op:     pool operation code sent to the monitor
+ * @pool:   pool id the operation applies to
+ * @snapid: snapshot id argument for the operation
+ * @buf:    optional buffer receiving the reply payload
+ * @len:    size of @buf; 0 when no payload is expected
+ *
+ * Returns the monitor's reply code when the request completes, -ENOMEM
+ * on allocation failure, or the error from do_generic_request().
+ */
+int ceph_monc_do_poolop(struct ceph_mon_client *monc, u32 op,
+                       u32 pool, u64 snapid,
+                       char *buf, int len)
+{
+       struct ceph_mon_generic_request *req;
+       struct ceph_mon_poolop *h;
+       int err;
+
+       /* zeroed, so request/reply start out NULL for the release path */
+       req = kzalloc(sizeof(*req), GFP_NOFS);
+       if (!req)
+               return -ENOMEM;
+
+       kref_init(&req->kref);
+       req->buf = buf;
+       req->buf_len = len;
+       init_completion(&req->completion);
+
+       err = -ENOMEM;
+       req->request = ceph_msg_new(CEPH_MSG_POOLOP, sizeof(*h), GFP_NOFS);
+       if (!req->request)
+               goto out;
+       /* the reply is preallocated with a fixed 1024-byte front */
+       req->reply = ceph_msg_new(CEPH_MSG_POOLOP_REPLY, 1024, GFP_NOFS);
+       if (!req->reply)
+               goto out;
+
+       /* fill out request */
+       req->request->hdr.version = cpu_to_le16(2);
+       h = req->request->front.iov_base;
+       h->monhdr.have_version = 0;
+       h->monhdr.session_mon = cpu_to_le16(-1);
+       h->monhdr.session_mon_tid = 0;
+       h->fsid = monc->monmap->fsid;
+       h->pool = cpu_to_le32(pool);
+       h->op = cpu_to_le32(op);
+       h->auid = 0;
+       h->snapid = cpu_to_le64(snapid);
+       h->name_len = 0;
+
+       err = do_generic_request(monc, req);
+
+out:
+       /* drop the initial reference (success and failure alike) */
+       kref_put(&req->kref, release_generic_request);
+       return err;
+}
+
+/*
+ * Send a POOL_OP_CREATE_UNMANAGED_SNAP pool op for @pool; the reply
+ * payload (sizeof(*snapid) bytes) is copied into *@snapid.
+ */
+int ceph_monc_create_snapid(struct ceph_mon_client *monc,
+                           u32 pool, u64 *snapid)
+{
+       char *reply_buf = (char *)snapid;
+
+       return ceph_monc_do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP,
+                                  pool, 0, reply_buf, sizeof(*snapid));
+}
+EXPORT_SYMBOL(ceph_monc_create_snapid);
+
+/*
+ * Ask the monitor to delete unmanaged snapshot @snapid of @pool.
+ *
+ * No reply payload is expected, so no buffer is passed.  The original
+ * sent POOL_OP_CREATE_UNMANAGED_SNAP here, which made "delete" issue a
+ * create instead.
+ */
+int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
+                           u32 pool, u64 snapid)
+{
+       return ceph_monc_do_poolop(monc, POOL_OP_DELETE_UNMANAGED_SNAP,
+                                  pool, snapid, NULL, 0);
+}
+
+/*
+ * Walk the tree of pending generic requests and queue each request
+ * message on the monitor connection again.
+ */
+static void __resend_generic_request(struct ceph_mon_client *monc)
+{
+       struct rb_node *n = rb_first(&monc->generic_request_tree);
+
+       while (n) {
+               struct ceph_mon_generic_request *req =
+                       rb_entry(n, struct ceph_mon_generic_request, node);
+
+               /* revoke first, then send with a fresh message reference */
+               ceph_con_revoke(monc->con, req->request);
+               ceph_con_send(monc->con, ceph_msg_get(req->request));
+               n = rb_next(n);
+       }
+}
+
+/*
+ * Periodic work.  While hunting for a monitor, close the current
+ * session and open a new one.  Otherwise send a keepalive, revalidate
+ * authentication and, once authenticated, (re)send the subscription.
+ * Always reschedules itself.
+ */
+static void delayed_work(struct work_struct *work)
+{
+       struct ceph_mon_client *monc;
+
+       monc = container_of(work, struct ceph_mon_client, delayed_work.work);
+       dout("monc delayed_work\n");
+
+       mutex_lock(&monc->mutex);
+       if (!monc->hunting) {
+               ceph_con_keepalive(monc->con);
+
+               __validate_auth(monc);
+
+               if (monc->auth->ops->is_authenticated(monc->auth))
+                       __send_subscribe(monc);
+       } else {
+               /* continue hunting */
+               __close_session(monc);
+               __open_session(monc);
+       }
+       __schedule_delayed(monc);
+       mutex_unlock(&monc->mutex);
+}
+
+/*
+ * Build a temporary monmap from the monitor addresses in the client
+ * options.  Each entry gets a zero nonce and is named mon<i>.
+ * Returns 0, or -ENOMEM if the map cannot be allocated.
+ */
+static int build_initial_monmap(struct ceph_mon_client *monc)
+{
+       struct ceph_options *opt = monc->client->options;
+       int num_mon = opt->num_mon;
+       size_t map_size;
+       int i = 0;
+
+       /* header plus one mon_inst slot per configured monitor */
+       map_size = sizeof(*monc->monmap) +
+                  num_mon * sizeof(monc->monmap->mon_inst[0]);
+       monc->monmap = kzalloc(map_size, GFP_KERNEL);
+       if (!monc->monmap)
+               return -ENOMEM;
+
+       while (i < num_mon) {
+               monc->monmap->mon_inst[i].addr = opt->mon_addr[i];
+               monc->monmap->mon_inst[i].addr.nonce = 0;
+               monc->monmap->mon_inst[i].name.type = CEPH_ENTITY_TYPE_MON;
+               monc->monmap->mon_inst[i].name.num = cpu_to_le64(i);
+               i++;
+       }
+       monc->monmap->num_mon = num_mon;
+       monc->have_fsid = false;
+       return 0;
+}
+
+/*
+ * Initialize a monitor client: build the initial monmap, set up the
+ * auth handle and preallocate the subscribe/auth messages.
+ *
+ * Returns 0 on success or a negative errno.  On failure everything
+ * allocated here is released again; the original leaked the monmap when
+ * ceph_auth_init() failed and never destroyed the auth handle when a
+ * later message allocation failed.
+ */
+int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
+{
+       int err = 0;
+
+       dout("init\n");
+       memset(monc, 0, sizeof(*monc));
+       monc->client = cl;
+       monc->monmap = NULL;
+       mutex_init(&monc->mutex);
+
+       err = build_initial_monmap(monc);
+       if (err)
+               goto out;
+
+       monc->con = NULL;
+
+       /* authentication */
+       monc->auth = ceph_auth_init(cl->options->name,
+                                   cl->options->secret);
+       if (IS_ERR(monc->auth)) {
+               err = PTR_ERR(monc->auth);
+               goto out_monmap;
+       }
+       monc->auth->want_keys =
+               CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
+               CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
+
+       /* msgs */
+       err = -ENOMEM;
+       monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK,
+                                    sizeof(struct ceph_mon_subscribe_ack),
+                                    GFP_NOFS);
+       if (!monc->m_subscribe_ack)
+               goto out_auth;
+
+       monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS);
+       if (!monc->m_subscribe)
+               goto out_subscribe_ack;
+
+       monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS);
+       if (!monc->m_auth_reply)
+               goto out_subscribe;
+
+       monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS);
+       monc->pending_auth = 0;
+       if (!monc->m_auth)
+               goto out_auth_reply;
+
+       monc->cur_mon = -1;
+       monc->hunting = true;
+       monc->sub_renew_after = jiffies;
+       monc->sub_sent = 0;
+
+       INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
+       monc->generic_request_tree = RB_ROOT;
+       monc->num_generic_requests = 0;
+       monc->last_tid = 0;
+
+       monc->have_mdsmap = 0;
+       monc->have_osdmap = 0;
+       monc->want_next_osdmap = 1;
+       return 0;
+
+       /* unwind in reverse order of acquisition */
+out_auth_reply:
+       ceph_msg_put(monc->m_auth_reply);
+out_subscribe:
+       ceph_msg_put(monc->m_subscribe);
+out_subscribe_ack:
+       ceph_msg_put(monc->m_subscribe_ack);
+out_auth:
+       ceph_auth_destroy(monc->auth);
+out_monmap:
+       kfree(monc->monmap);
+out:
+       return err;
+}
+EXPORT_SYMBOL(ceph_monc_init);
+
+/*
+ * Shut the monitor client down: cancel the delayed work, close the
+ * session, detach and drop the connection, then release the auth
+ * handle, the preallocated messages and the monmap.
+ */
+void ceph_monc_stop(struct ceph_mon_client *monc)
+{
+       struct ceph_connection *con;
+
+       dout("stop\n");
+       cancel_delayed_work_sync(&monc->delayed_work);
+
+       mutex_lock(&monc->mutex);
+       __close_session(monc);
+       con = monc->con;
+       if (con) {
+               /* detach first; dispatch() and mon_fault() test ->private */
+               con->private = NULL;
+               con->ops->put(con);
+               monc->con = NULL;
+       }
+       mutex_unlock(&monc->mutex);
+
+       ceph_auth_destroy(monc->auth);
+
+       ceph_msg_put(monc->m_auth);
+       ceph_msg_put(monc->m_auth_reply);
+       ceph_msg_put(monc->m_subscribe);
+       ceph_msg_put(monc->m_subscribe_ack);
+
+       kfree(monc->monmap);
+}
+EXPORT_SYMBOL(ceph_monc_stop);
+
+/*
+ * Process an auth reply.  A negative result is recorded in the client
+ * and auth waiters are woken; a positive result is the length of a
+ * follow-up auth request to send.  On the transition to authenticated,
+ * set our entity name, subscribe and resend pending generic requests.
+ */
+static void handle_auth_reply(struct ceph_mon_client *monc,
+                             struct ceph_msg *msg)
+{
+       int was_auth = 0;
+       int ret;
+
+       mutex_lock(&monc->mutex);
+       if (monc->auth->ops)
+               was_auth = monc->auth->ops->is_authenticated(monc->auth);
+       monc->pending_auth = 0;
+       ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
+                                    msg->front.iov_len,
+                                    monc->m_auth->front.iov_base,
+                                    monc->m_auth->front_max);
+       if (ret < 0) {
+               monc->client->auth_err = ret;
+               wake_up_all(&monc->client->auth_wq);
+               goto out;
+       }
+       if (ret > 0) {
+               __send_prepared_auth_request(monc, ret);
+               goto out;
+       }
+
+       /* only act when we have just become authenticated */
+       if (was_auth || !monc->auth->ops->is_authenticated(monc->auth))
+               goto out;
+
+       dout("authenticated, starting session\n");
+
+       monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
+       monc->client->msgr->inst.name.num =
+                               cpu_to_le64(monc->auth->global_id);
+
+       __send_subscribe(monc);
+       __resend_generic_request(monc);
+out:
+       mutex_unlock(&monc->mutex);
+}
+
+/*
+ * Build and send an auth request if one is needed and none is pending.
+ * Returns 0 when nothing had to be done or a request was sent, or the
+ * (non-positive) result of ceph_build_auth() otherwise.
+ */
+static int __validate_auth(struct ceph_mon_client *monc)
+{
+       int len;
+
+       if (monc->pending_auth)
+               return 0;
+
+       len = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base,
+                             monc->m_auth->front_max);
+       if (len > 0) {
+               __send_prepared_auth_request(monc, len);
+               return 0;
+       }
+
+       /* either an error, or no need to authenticate */
+       return len;
+}
+
+/*
+ * Locked wrapper around __validate_auth(): takes monc->mutex for the
+ * duration of the call and returns its result.
+ */
+int ceph_monc_validate_auth(struct ceph_mon_client *monc)
+{
+       int err;
+
+       mutex_lock(&monc->mutex);
+       err = __validate_auth(monc);
+       mutex_unlock(&monc->mutex);
+
+       return err;
+}
+EXPORT_SYMBOL(ceph_monc_validate_auth);
+
+/*
+ * Route an incoming monitor message to its handler by type.  Types not
+ * handled here are offered to the client's extra_mon_dispatch hook, if
+ * set.  The message reference is dropped before returning, except when
+ * the connection has already been detached from its monitor client.
+ */
+static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
+{
+       struct ceph_mon_client *monc = con->private;
+       int type;
+
+       if (!monc)
+               return;
+
+       type = le16_to_cpu(msg->hdr.type);
+       switch (type) {
+       case CEPH_MSG_AUTH_REPLY:
+               handle_auth_reply(monc, msg);
+               break;
+
+       case CEPH_MSG_MON_SUBSCRIBE_ACK:
+               handle_subscribe_ack(monc, msg);
+               break;
+
+       case CEPH_MSG_STATFS_REPLY:
+               handle_statfs_reply(monc, msg);
+               break;
+
+       case CEPH_MSG_POOLOP_REPLY:
+               handle_poolop_reply(monc, msg);
+               break;
+
+       case CEPH_MSG_MON_MAP:
+               ceph_monc_handle_map(monc, msg);
+               break;
+
+       case CEPH_MSG_OSD_MAP:
+               ceph_osdc_handle_map(&monc->client->osdc, msg);
+               break;
+
+       default:
+               /* complain unless the chained handler accepts it */
+               if (!monc->client->extra_mon_dispatch ||
+                   monc->client->extra_mon_dispatch(monc->client, msg) != 0)
+                       pr_err("received unknown message type %d %s\n", type,
+                              ceph_msg_type_name(type));
+               break;
+       }
+       ceph_msg_put(msg);
+}
+
+/*
+ * Pick the message that will receive an incoming monitor message.
+ * Subscribe acks and auth replies reuse the preallocated messages,
+ * statfs/poolop replies come from the matching generic request, and
+ * maps get a freshly allocated message sized from the header.  If no
+ * message is available, *skip is set to 1 and NULL is returned.
+ */
+static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
+                                     struct ceph_msg_header *hdr,
+                                     int *skip)
+{
+       struct ceph_mon_client *monc = con->private;
+       int type = le16_to_cpu(hdr->type);
+       int front_len = le32_to_cpu(hdr->front_len);
+       struct ceph_msg *m;
+
+       *skip = 0;
+
+       switch (type) {
+       case CEPH_MSG_POOLOP_REPLY:
+       case CEPH_MSG_STATFS_REPLY:
+               return get_generic_reply(con, hdr, skip);
+       case CEPH_MSG_MON_SUBSCRIBE_ACK:
+               m = ceph_msg_get(monc->m_subscribe_ack);
+               break;
+       case CEPH_MSG_AUTH_REPLY:
+               m = ceph_msg_get(monc->m_auth_reply);
+               break;
+       case CEPH_MSG_MON_MAP:
+       case CEPH_MSG_MDS_MAP:
+       case CEPH_MSG_OSD_MAP:
+               m = ceph_msg_new(type, front_len, GFP_NOFS);
+               break;
+       default:
+               m = NULL;
+               break;
+       }
+
+       if (m)
+               return m;
+
+       pr_info("alloc_msg unknown type %d\n", type);
+       *skip = 1;
+       return NULL;
+}
+
+/*
+ * Connection fault callback.  Close the session; if we were not already
+ * hunting, start hunting and open a new session right away, otherwise
+ * just schedule the delayed work.
+ */
+static void mon_fault(struct ceph_connection *con)
+{
+       struct ceph_mon_client *monc = con->private;
+
+       if (!monc)
+               return;
+
+       dout("mon_fault\n");
+       mutex_lock(&monc->mutex);
+
+       /* recheck under the lock; ->private is cleared on stop */
+       if (!con->private)
+               goto out;
+
+       if (monc->con && !monc->hunting)
+               pr_info("mon%d %s session lost, "
+                       "hunting for new mon\n", monc->cur_mon,
+                       ceph_pr_addr(&monc->con->peer_addr.in_addr));
+
+       __close_session(monc);
+       if (monc->hunting) {
+               /* already hunting, let's wait a bit */
+               __schedule_delayed(monc);
+       } else {
+               /* start hunting */
+               monc->hunting = true;
+               __open_session(monc);
+       }
+out:
+       mutex_unlock(&monc->mutex);
+}
+
+/* connection callbacks used for the monitor connection */
+static const struct ceph_connection_operations mon_con_ops = {
+       /* message handling */
+       .alloc_msg = mon_alloc_msg,
+       .dispatch = dispatch,
+       .fault = mon_fault,
+       /* reference counting */
+       .get = ceph_con_get,
+       .put = ceph_con_put,
+};
diff --git a/net/ceph/msgpool.c b/net/ceph/msgpool.c

new file mode 100644 (file)

index 0000000..d5f2d97
--- /dev/null
+++ b/net/ceph/msgpool.c
@@ -0,0 +1,64 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/err.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/vmalloc.h>
+
+#include <linux/ceph/msgpool.h>
+
+/*
+ * mempool allocation callback: create a message of type 0 whose front
+ * is pool->front_len bytes.  @arg is the owning ceph_msgpool.
+ */
+static void *alloc_fn(gfp_t gfp_mask, void *arg)
+{
+       struct ceph_msgpool *pool = arg;
+       struct ceph_msg *msg = ceph_msg_new(0, pool->front_len, gfp_mask);
+
+       if (msg)
+               return msg;
+
+       pr_err("msgpool %s alloc failed\n", pool->name);
+       return NULL;
+}
+
+/* mempool free callback: drop a reference on the message; @arg is unused */
+static void free_fn(void *element, void *arg)
+{
+       struct ceph_msg *msg = element;
+
+       ceph_msg_put(msg);
+}
+
+/*
+ * Set up a pool of @size preallocated messages with @front_len byte
+ * fronts.  @name is used in log messages; @blocking is not used here.
+ * Returns 0, or -ENOMEM if the mempool cannot be created.
+ */
+int ceph_msgpool_init(struct ceph_msgpool *pool,
+                     int front_len, int size, bool blocking, const char *name)
+{
+       /*
+        * Fill in every field alloc_fn() reads before creating the
+        * mempool: alloc_fn() is the pool's allocation callback and
+        * prints pool->name when an allocation fails.
+        */
+       pool->front_len = front_len;
+       pool->name = name;
+       pool->pool = mempool_create(size, alloc_fn, free_fn, pool);
+       if (!pool->pool)
+               return -ENOMEM;
+       return 0;
+}
+
+/* Tear down the mempool created by ceph_msgpool_init(). */
+void ceph_msgpool_destroy(struct ceph_msgpool *pool)
+{
+       mempool_t *mp = pool->pool;
+
+       mempool_destroy(mp);
+}
+
+/*
+ * Get a message with room for @front_len bytes of front.  Requests that
+ * fit come from the mempool; larger ones are logged, trigger a warning
+ * and fall back to a fresh allocation.
+ */
+struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool,
+                                 int front_len)
+{
+       if (front_len <= pool->front_len)
+               return mempool_alloc(pool->pool, GFP_NOFS);
+
+       pr_err("msgpool_get pool %s need front %d, pool size is %d\n",
+              pool->name, front_len, pool->front_len);
+       WARN_ON(1);
+
+       /* try to alloc a fresh message */
+       return ceph_msg_new(0, front_len, GFP_NOFS);
+}
+
+/*
+ * Return @msg to @pool: restore the front length the pool was created
+ * with, re-arm the reference count and hand the message back to the
+ * mempool.
+ *
+ * The original stopped after kref_init(), so the message was never
+ * returned to the mempool.
+ *
+ * NOTE(review): free_fn() ends in ceph_msg_put(); confirm that a final
+ * put issued from there cannot re-enter this function for the same
+ * message.
+ */
+void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
+{
+       /* reset msg front_len; user may have changed it */
+       msg->front.iov_len = pool->front_len;
+       msg->hdr.front_len = cpu_to_le32(pool->front_len);
+
+       kref_init(&msg->kref);  /* retake single ref */
+       mempool_free(msg, pool->pool);
+}
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c

new file mode 100644 (file)

index 0000000..7939199
--- /dev/null
+++ b/net/ceph/osd_client.c
@@ -0,0 +1,1773 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/module.h>
+#include <linux/err.h>
+#include <linux/highmem.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#ifdef CONFIG_BLOCK
+#include <linux/bio.h>
+#endif
+
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/osd_client.h>
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/pagelist.h>
+
+#define OSD_OP_FRONT_LEN       4096
+#define OSD_OPREPLY_FRONT_LEN  512
+
+static const struct ceph_connection_operations osd_con_ops;
+static int __kick_requests(struct ceph_osd_client *osdc,
+                         struct ceph_osd *kickosd);
+
+static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd);
+
+/*
+ * Return 1 for the opcodes handled with a trailing pagelist in
+ * osd_req_encode_op() (xattr get/set/cmp and class calls), else 0.
+ */
+static int op_needs_trail(int op)
+{
+       return op == CEPH_OSD_OP_GETXATTR ||
+              op == CEPH_OSD_OP_SETXATTR ||
+              op == CEPH_OSD_OP_CMPXATTR ||
+              op == CEPH_OSD_OP_CALL;
+}
+
+/* Return 1 if @op is a read or a write (the ops that carry an extent). */
+static int op_has_extent(int op)
+{
+       switch (op) {
+       case CEPH_OSD_OP_READ:
+       case CEPH_OSD_OP_WRITE:
+               return 1;
+       default:
+               return 0;
+       }
+}
+
+/*
+ * Map the file extent @off~*@plen onto an object and record the result
+ * in @req and @op.
+ *
+ * *@plen and *@bno are filled in by ceph_calc_file_object_mapping();
+ * the snapid is stored in the request head, the object extent in @op
+ * (for extent ops), the page count in @req, and the payload length in
+ * @op for writes.
+ */
+void ceph_calc_raw_layout(struct ceph_osd_client *osdc,
+                       struct ceph_file_layout *layout,
+                       u64 snapid,
+                       u64 off, u64 *plen, u64 *bno,
+                       struct ceph_osd_request *req,
+                       struct ceph_osd_req_op *op)
+{
+       struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
+       u64 objoff, objlen;    /* extent in object */
+       u64 requested = *plen;
+
+       reqhead->snapid = cpu_to_le64(snapid);
+
+       /* object extent? */
+       ceph_calc_file_object_mapping(layout, off, plen, bno,
+                                     &objoff, &objlen);
+       if (requested > *plen)
+               dout(" skipping last %llu, final file extent %llu~%llu\n",
+                    requested - *plen, off, *plen);
+
+       if (op_has_extent(op->op)) {
+               op->extent.offset = objoff;
+               op->extent.length = objlen;
+       }
+       if (op->op == CEPH_OSD_OP_WRITE)
+               op->payload_len = *plen;
+       req->r_num_pages = calc_pages_for(off, *plen);
+
+       dout("calc_layout bno=%llx %llu~%llu (%d pages)\n",
+            *bno, objoff, objlen, req->r_num_pages);
+}
+EXPORT_SYMBOL(ceph_calc_raw_layout);
+
+/*
+ * Implement client access to distributed object storage cluster.
+ *
+ * All data objects are stored within a cluster/cloud of OSDs, or
+ * "object storage devices."  (Note that Ceph OSDs have _nothing_ to
+ * do with the T10 OSD extensions to SCSI.)  Ceph OSDs are simply
+ * remote daemons serving up and coordinating consistent and safe
+ * access to storage.
+ *
+ * Cluster membership and the mapping of data objects onto storage devices
+ * are described by the osd map.
+ *
+ * We keep track of pending OSD requests (read, write), resubmit
+ * requests to different OSDs when the cluster topology/data layout
+ * change, or retry the affected requests when the communications
+ * channel with an OSD is reset.
+ */
+
+/*
+ * Compute the raw layout for a file extent (which may shorten *@plen)
+ * and derive the object name "<ino>.<bno>" (both hex, bno zero-padded
+ * to 8 digits) into req->r_oid / req->r_oid_len.
+ */
+static void calc_layout(struct ceph_osd_client *osdc,
+                       struct ceph_vino vino,
+                       struct ceph_file_layout *layout,
+                       u64 off, u64 *plen,
+                       struct ceph_osd_request *req,
+                       struct ceph_osd_req_op *op)
+{
+       u64 objno;
+
+       ceph_calc_raw_layout(osdc, layout, vino.snap, off,
+                            plen, &objno, req, op);
+
+       /* sprintf() returns the string length, excluding the NUL */
+       req->r_oid_len = sprintf(req->r_oid, "%llx.%08llx", vino.ino, objno);
+}
+
+/*
+ * requests
+ */
+/*
+ * kref release callback for an osd request: drop the request and reply
+ * messages, revoke the reply from a connection that is still filling
+ * it, release owned pages, bio, snap context and trail, then free the
+ * request to the mempool or the slab it came from.
+ *
+ * The reply is revoked from the connection before our reference on it
+ * is dropped; the original put r_reply first and then passed the same
+ * pointer to ceph_con_revoke_message().
+ */
+void ceph_osdc_release_request(struct kref *kref)
+{
+       struct ceph_osd_request *req = container_of(kref,
+                                                   struct ceph_osd_request,
+                                                   r_kref);
+
+       if (req->r_request)
+               ceph_msg_put(req->r_request);
+       if (req->r_con_filling_msg) {
+               dout("release_request revoking pages %p from con %p\n",
+                    req->r_pages, req->r_con_filling_msg);
+               ceph_con_revoke_message(req->r_con_filling_msg,
+                                     req->r_reply);
+               ceph_con_put(req->r_con_filling_msg);
+       }
+       if (req->r_reply)
+               ceph_msg_put(req->r_reply);
+       if (req->r_own_pages)
+               ceph_release_page_vector(req->r_pages,
+                                        req->r_num_pages);
+#ifdef CONFIG_BLOCK
+       if (req->r_bio)
+               bio_put(req->r_bio);
+#endif
+       ceph_put_snap_context(req->r_snapc);
+       if (req->r_trail) {
+               ceph_pagelist_release(req->r_trail);
+               kfree(req->r_trail);
+       }
+       if (req->r_mempool)
+               mempool_free(req, req->r_osdc->req_mempool);
+       else
+               kfree(req);
+}
+EXPORT_SYMBOL(ceph_osdc_release_request);
+
+/*
+ * Count the ops in @ops, an array terminated by an entry whose op is 0.
+ * If @needs_trail is non-NULL it is set to 1 when any op needs a
+ * trailing pagelist, 0 otherwise.
+ */
+static int get_num_ops(struct ceph_osd_req_op *ops, int *needs_trail)
+{
+       int count;
+
+       if (needs_trail)
+               *needs_trail = 0;
+
+       for (count = 0; ops[count].op; count++) {
+               if (needs_trail && op_needs_trail(ops[count].op))
+                       *needs_trail = 1;
+       }
+
+       return count;
+}
+
+/*
+ * Allocate an osd request together with its reply message, its request
+ * message and, when any op needs one, a trailing pagelist.
+ *
+ * With @use_mempool the request and both messages come from the osd
+ * client's pools; otherwise they are allocated with @gfp_flags.  @pages
+ * is stored in the request and a reference is taken on @bio (if any).
+ * Returns the request, or NULL on allocation failure.
+ */
+struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
+                                              int flags,
+                                              struct ceph_snap_context *snapc,
+                                              struct ceph_osd_req_op *ops,
+                                              bool use_mempool,
+                                              gfp_t gfp_flags,
+                                              struct page **pages,
+                                              struct bio *bio)
+{
+       struct ceph_osd_request *req;
+       struct ceph_msg *msg;
+       int needs_trail;
+       int num_op = get_num_ops(ops, &needs_trail);
+       size_t msg_size = sizeof(struct ceph_osd_request_head);
+
+       msg_size += num_op*sizeof(struct ceph_osd_op);
+
+       if (use_mempool) {
+               req = mempool_alloc(osdc->req_mempool, gfp_flags);
+               /* check before zeroing: the allocation may have failed */
+               if (req)
+                       memset(req, 0, sizeof(*req));
+       } else {
+               req = kzalloc(sizeof(*req), gfp_flags);
+       }
+       if (req == NULL)
+               return NULL;
+
+       req->r_osdc = osdc;
+       req->r_mempool = use_mempool;
+
+       kref_init(&req->r_kref);
+       init_completion(&req->r_completion);
+       init_completion(&req->r_safe_completion);
+       INIT_LIST_HEAD(&req->r_unsafe_item);
+       req->r_flags = flags;
+
+       WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
+
+       /* create reply message */
+       if (use_mempool)
+               msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
+       else
+               msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
+                                  OSD_OPREPLY_FRONT_LEN, gfp_flags);
+       if (!msg) {
+               ceph_osdc_put_request(req);
+               return NULL;
+       }
+       req->r_reply = msg;
+
+       /* allocate space for the trailing data */
+       if (needs_trail) {
+               req->r_trail = kmalloc(sizeof(struct ceph_pagelist), gfp_flags);
+               if (!req->r_trail) {
+                       ceph_osdc_put_request(req);
+                       return NULL;
+               }
+               ceph_pagelist_init(req->r_trail);
+       }
+       /* create request message; allow space for oid */
+       msg_size += 40;
+       if (snapc)
+               msg_size += sizeof(u64) * snapc->num_snaps;
+       if (use_mempool)
+               msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
+       else
+               msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp_flags);
+       if (!msg) {
+               ceph_osdc_put_request(req);
+               return NULL;
+       }
+
+       msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
+       memset(msg->front.iov_base, 0, msg->front.iov_len);
+
+       req->r_request = msg;
+       req->r_pages = pages;
+#ifdef CONFIG_BLOCK
+       if (bio) {
+               req->r_bio = bio;
+               bio_get(req->r_bio);
+       }
+#endif
+
+       return req;
+}
+EXPORT_SYMBOL(ceph_osdc_alloc_request);
+
+/*
+ * Encode one in-memory op (@src, CPU byte order) into its wire form
+ * (@dst, little-endian).  Ops with variable-length arguments append
+ * them to the request's trailing pagelist, which must already exist.
+ *
+ * The opcode switch uses the CPU-order value in @src.  The original
+ * switched on dst->op after cpu_to_le16(), which matches none of the
+ * cases on big-endian hosts.
+ */
+static void osd_req_encode_op(struct ceph_osd_request *req,
+                             struct ceph_osd_op *dst,
+                             struct ceph_osd_req_op *src)
+{
+       dst->op = cpu_to_le16(src->op);
+
+       switch (src->op) {
+       case CEPH_OSD_OP_READ:
+       case CEPH_OSD_OP_WRITE:
+               dst->extent.offset =
+                       cpu_to_le64(src->extent.offset);
+               dst->extent.length =
+                       cpu_to_le64(src->extent.length);
+               dst->extent.truncate_size =
+                       cpu_to_le64(src->extent.truncate_size);
+               dst->extent.truncate_seq =
+                       cpu_to_le32(src->extent.truncate_seq);
+               break;
+
+       case CEPH_OSD_OP_GETXATTR:
+       case CEPH_OSD_OP_SETXATTR:
+       case CEPH_OSD_OP_CMPXATTR:
+               BUG_ON(!req->r_trail);
+
+               dst->xattr.name_len = cpu_to_le32(src->xattr.name_len);
+               dst->xattr.value_len = cpu_to_le32(src->xattr.value_len);
+               dst->xattr.cmp_op = src->xattr.cmp_op;
+               dst->xattr.cmp_mode = src->xattr.cmp_mode;
+               /* name, then value, go into the trail */
+               ceph_pagelist_append(req->r_trail, src->xattr.name,
+                                    src->xattr.name_len);
+               ceph_pagelist_append(req->r_trail, src->xattr.val,
+                                    src->xattr.value_len);
+               break;
+       case CEPH_OSD_OP_CALL:
+               BUG_ON(!req->r_trail);
+
+               dst->cls.class_len = src->cls.class_len;
+               dst->cls.method_len = src->cls.method_len;
+               dst->cls.indata_len = cpu_to_le32(src->cls.indata_len);
+
+               /* class name, method name, then input data */
+               ceph_pagelist_append(req->r_trail, src->cls.class_name,
+                                    src->cls.class_len);
+               ceph_pagelist_append(req->r_trail, src->cls.method_name,
+                                    src->cls.method_len);
+               ceph_pagelist_append(req->r_trail, src->cls.indata,
+                                    src->cls.indata_len);
+               break;
+       case CEPH_OSD_OP_ROLLBACK:
+               dst->snap.snapid = cpu_to_le64(src->snap.snapid);
+               break;
+       case CEPH_OSD_OP_STARTSYNC:
+               break;
+       default:
+               pr_err("unrecognized osd opcode %d\n", src->op);
+               WARN_ON(1);
+               break;
+       }
+       dst->payload_len = cpu_to_le32(src->payload_len);
+}
+
+/*
+ * Fill in the request message of @req.
+ *
+ * Front layout: request head, one encoded op per entry in @src_ops
+ * (terminated by op == 0), the object name, then the snapshot ids from
+ * @snapc.  The front length is trimmed to what was actually written.
+ */
+void ceph_osdc_build_request(struct ceph_osd_request *req,
+                            u64 off, u64 *plen,
+                            struct ceph_osd_req_op *src_ops,
+                            struct ceph_snap_context *snapc,
+                            struct timespec *mtime,
+                            const char *oid,
+                            int oid_len)
+{
+       struct ceph_msg *msg = req->r_request;
+       struct ceph_osd_request_head *head = msg->front.iov_base;
+       int num_op = get_num_ops(src_ops, NULL);
+       struct ceph_osd_op *op = (void *)(head + 1);
+       void *p = (void *)(op + num_op);
+       int flags = req->r_flags;
+       u64 data_len = 0;
+       size_t msg_size;
+       int i;
+
+       req->r_snapc = ceph_get_snap_context(snapc);
+
+       head->client_inc = cpu_to_le32(1); /* always, for now. */
+       head->flags = cpu_to_le32(flags);
+       if (flags & CEPH_OSD_FLAG_WRITE)
+               ceph_encode_timespec(&head->mtime, mtime);
+       head->num_ops = cpu_to_le16(num_op);
+
+       /* fill in oid */
+       head->object_len = cpu_to_le32(oid_len);
+       memcpy(p, oid, oid_len);
+       p += oid_len;
+
+       /* encode each op into its slot after the head */
+       for (i = 0; i < num_op; i++)
+               osd_req_encode_op(req, &op[i], &src_ops[i]);
+
+       if (req->r_trail)
+               data_len += req->r_trail->length;
+
+       if (snapc) {
+               head->snap_seq = cpu_to_le64(snapc->seq);
+               head->num_snaps = cpu_to_le32(snapc->num_snaps);
+               for (i = 0; i < snapc->num_snaps; i++) {
+                       put_unaligned_le64(snapc->snaps[i], p);
+                       p += sizeof(u64);
+               }
+       }
+
+       if (flags & CEPH_OSD_FLAG_WRITE) {
+               req->r_request->hdr.data_off = cpu_to_le16(off);
+               req->r_request->hdr.data_len = cpu_to_le32(*plen + data_len);
+       } else if (data_len) {
+               req->r_request->hdr.data_off = 0;
+               req->r_request->hdr.data_len = cpu_to_le32(data_len);
+       }
+
+       /* we must not have written past the end of the front buffer */
+       BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
+       msg_size = p - msg->front.iov_base;
+       msg->front.iov_len = msg_size;
+       msg->hdr.front_len = cpu_to_le32(msg_size);
+}
+EXPORT_SYMBOL(ceph_osdc_build_request);
+
+/*
+ * build new request AND message, calculate layout, and adjust file
+ * extent as needed.
+ *
+ * if the file was recently truncated, we include information about its
+ * old and new size so that the object can be updated appropriately.  (we
+ * avoid synchronously deleting truncated objects because it's slow.)
+ *
+ * if @do_sync, include a 'startsync' command so that the osd will flush
+ * data quickly.
+ *
+ * Returns the new request, or NULL if it could not be allocated.
+ * *@plen may be shortened by the layout calculation.
+ */
+struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
+                                              struct ceph_file_layout *layout,
+                                              struct ceph_vino vino,
+                                              u64 off, u64 *plen,
+                                              int opcode, int flags,
+                                              struct ceph_snap_context *snapc,
+                                              int do_sync,
+                                              u32 truncate_seq,
+                                              u64 truncate_size,
+                                              struct timespec *mtime,
+                                              bool use_mempool, int num_reply)
+{
+       struct ceph_osd_req_op ops[3];
+       struct ceph_osd_request *req;
+
+       /* op list is terminated by an entry with op == 0 */
+       ops[0].op = opcode;
+       ops[0].extent.truncate_seq = truncate_seq;
+       ops[0].extent.truncate_size = truncate_size;
+       ops[0].payload_len = 0;
+
+       if (do_sync) {
+               ops[1].op = CEPH_OSD_OP_STARTSYNC;
+               ops[1].payload_len = 0;
+               ops[2].op = 0;
+       } else
+               ops[1].op = 0;
+
+       req = ceph_osdc_alloc_request(osdc, flags,
+                                        snapc, ops,
+                                        use_mempool,
+                                        GFP_NOFS, NULL, NULL);
+       /*
+        * ceph_osdc_alloc_request() reports failure with NULL, not an
+        * ERR_PTR, so an IS_ERR() check would let NULL through to
+        * calc_layout() below.
+        */
+       if (!req)
+               return NULL;
+
+       /* calculate max write size */
+       calc_layout(osdc, vino, layout, off, plen, req, ops);
+       req->r_file_layout = *layout;  /* keep a copy */
+
+       ceph_osdc_build_request(req, off, plen, ops,
+                               snapc,
+                               mtime,
+                               req->r_oid, req->r_oid_len);
+
+       return req;
+}
+EXPORT_SYMBOL(ceph_osdc_new_request);
+
+/*
+ * Insert @new into the osd client's request rbtree, which is keyed by
+ * ->r_tid.  A duplicate tid triggers BUG().
+ */
+static void __insert_request(struct ceph_osd_client *osdc,
+                            struct ceph_osd_request *new)
+{
+       struct rb_node **link = &osdc->requests.rb_node;
+       struct rb_node *parent = NULL;
+
+       while (*link) {
+               struct ceph_osd_request *cur;
+
+               parent = *link;
+               cur = rb_entry(parent, struct ceph_osd_request, r_node);
+               if (new->r_tid == cur->r_tid)
+                       BUG();
+
+               /* smaller tids go left, larger ones right */
+               if (new->r_tid < cur->r_tid)
+                       link = &parent->rb_left;
+               else
+                       link = &parent->rb_right;
+       }
+
+       rb_link_node(&new->r_node, parent, link);
+       rb_insert_color(&new->r_node, &osdc->requests);
+}
+
+/* Find the request whose ->r_tid equals @tid; NULL if there is none. */
+static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc,
+                                                u64 tid)
+{
+       struct rb_node *n = osdc->requests.rb_node;
+
+       while (n) {
+               struct ceph_osd_request *req =
+                       rb_entry(n, struct ceph_osd_request, r_node);
+
+               if (tid == req->r_tid)
+                       return req;
+               n = (tid < req->r_tid) ? n->rb_left : n->rb_right;
+       }
+       return NULL;
+}
+
+/*
+ * Return the registered request with the smallest ->r_tid that is
+ * >= @tid, or NULL if every registered tid is below @tid.
+ */
+static struct ceph_osd_request *
+__lookup_request_ge(struct ceph_osd_client *osdc,
+                   u64 tid)
+{
+       struct ceph_osd_request *req;
+       struct ceph_osd_request *best = NULL;
+       struct rb_node *n = osdc->requests.rb_node;
+
+       while (n) {
+               req = rb_entry(n, struct ceph_osd_request, r_node);
+               if (tid < req->r_tid) {
+                       /*
+                        * req qualifies.  Remember it before going left:
+                        * the left subtree may hold only tids below @tid,
+                        * in which case the walk ends on a NULL right
+                        * child and req is the answer.
+                        */
+                       best = req;
+                       n = n->rb_left;
+               } else if (tid > req->r_tid) {
+                       n = n->rb_right;
+               } else {
+                       return req;
+               }
+       }
+       return best;
+}
+
+
+/*
+ * Connection-reset hook.  When the session with an osd drops, kick
+ * (resubmit) the requests bound to that osd.  Nothing is done when the
+ * connection carries no osd in ->private.
+ */
+static void osd_reset(struct ceph_connection *con)
+{
+       struct ceph_osd *osd = con->private;
+
+       if (osd) {
+               struct ceph_osd_client *osdc;
+
+               dout("osd_reset osd%d\n", osd->o_osd);
+               osdc = osd->o_osdc;
+
+               /* kick_requests() wants map_sem held for read */
+               down_read(&osdc->map_sem);
+               kick_requests(osdc, osd);
+               up_read(&osdc->map_sem);
+       }
+}
+
+/*
+ * Track open sessions with osds.
+ *
+ * Allocate and initialise a session object for @osdc.  The new osd
+ * starts with a reference count of 1, incarnation 1, empty list heads
+ * and an initialised (not yet opened) connection.  Returns NULL if the
+ * allocation fails.
+ */
+static struct ceph_osd *create_osd(struct ceph_osd_client *osdc)
+{
+       struct ceph_osd *new_osd = kzalloc(sizeof(*new_osd), GFP_NOFS);
+
+       if (new_osd == NULL)
+               return NULL;
+
+       /* identity and initial reference */
+       new_osd->o_osdc = osdc;
+       new_osd->o_incarnation = 1;
+       atomic_set(&new_osd->o_ref, 1);
+
+       /* not on any list yet */
+       INIT_LIST_HEAD(&new_osd->o_requests);
+       INIT_LIST_HEAD(&new_osd->o_osd_lru);
+       INIT_LIST_HEAD(&new_osd->o_keepalive_item);
+
+       /* connection fields are filled in after ceph_con_init() */
+       ceph_con_init(osdc->client->msgr, &new_osd->o_con);
+       new_osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD;
+       new_osd->o_con.ops = &osd_con_ops;
+       new_osd->o_con.private = new_osd;
+
+       return new_osd;
+}
+
+/*
+ * Take a reference on @osd unless its count has already reached zero.
+ * Returns @osd on success, NULL if no reference could be taken.
+ */
+static struct ceph_osd *get_osd(struct ceph_osd *osd)
+{
+       if (!atomic_inc_not_zero(&osd->o_ref)) {
+               dout("get_osd %p FAIL\n", osd);
+               return NULL;
+       }
+
+       dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1,
+            atomic_read(&osd->o_ref));
+       return osd;
+}
+
+/*
+ * Drop a reference on @osd.  When the count reaches zero, destroy the
+ * authorizer (if one is attached) and free the osd.
+ */
+static void put_osd(struct ceph_osd *osd)
+{
+       struct ceph_auth_client *ac;
+
+       dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
+            atomic_read(&osd->o_ref) - 1);
+
+       if (!atomic_dec_and_test(&osd->o_ref))
+               return;
+
+       /* last reference gone: tear down */
+       ac = osd->o_osdc->client->monc.auth;
+       if (osd->o_authorizer)
+               ac->ops->destroy_authorizer(ac, osd->o_authorizer);
+       kfree(osd);
+}
+
+/*
+ * remove an osd from our map
+ *
+ * The osd must have no requests queued on it (BUG otherwise).  It is
+ * erased from the osdc->osds tree, taken off the idle lru, its
+ * connection is closed, and one reference is dropped via put_osd().
+ */
+static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
+{
+       dout("__remove_osd %p\n", osd);
+       BUG_ON(!list_empty(&osd->o_requests));
+       rb_erase(&osd->o_node, &osdc->osds);
+       list_del_init(&osd->o_osd_lru);
+       ceph_con_close(&osd->o_con);
+       put_osd(osd);
+}
+
+/*
+ * Park an idle osd at the tail of osdc->osd_lru and stamp the time
+ * (now + osd_idle_ttl seconds) after which remove_old_osds() may drop
+ * it.  The osd must not already be on the lru.
+ */
+static void __move_osd_to_lru(struct ceph_osd_client *osdc,
+                             struct ceph_osd *osd)
+{
+       dout("__move_osd_to_lru %p\n", osd);
+       BUG_ON(!list_empty(&osd->o_osd_lru));
+
+       osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl * HZ;
+       list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
+}
+
+/*
+ * Take @osd off the idle lru if it is currently on it.
+ */
+static void __remove_osd_from_lru(struct ceph_osd *osd)
+{
+       dout("__remove_osd_from_lru %p\n", osd);
+
+       if (list_empty(&osd->o_osd_lru))
+               return;
+       list_del_init(&osd->o_osd_lru);
+}
+
+/*
+ * Walk the idle lru under request_mutex and remove osds whose lru_ttl
+ * has passed.  The walk stops at the first entry that has not expired
+ * yet, unless @remove_all is set, in which case every entry is removed.
+ */
+static void remove_old_osds(struct ceph_osd_client *osdc, int remove_all)
+{
+       struct ceph_osd *cur, *tmp;
+
+       dout("__remove_old_osds %p\n", osdc);
+       mutex_lock(&osdc->request_mutex);
+       list_for_each_entry_safe(cur, tmp, &osdc->osd_lru, o_osd_lru) {
+               if (remove_all || !time_before(jiffies, cur->lru_ttl)) {
+                       __remove_osd(osdc, cur);
+                       continue;
+               }
+               /* first unexpired entry: nothing further to do */
+               break;
+       }
+       mutex_unlock(&osdc->request_mutex);
+}
+
+/*
+ * reset osd connect
+ *
+ * - No requests pending on the osd: remove the osd entirely.
+ * - Address in the osdmap unchanged and the connection was never
+ *   opened: leave the retry to the messenger, refresh r_stamp on every
+ *   pending request, and return -EAGAIN.
+ * - Otherwise: close the connection, reopen it at the address found in
+ *   the current osdmap, and bump o_incarnation.
+ *
+ * Returns 0, or -EAGAIN as described above.
+ */
+static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
+{
+       struct ceph_osd_request *req;
+       int ret = 0;
+
+       dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
+       if (list_empty(&osd->o_requests)) {
+               __remove_osd(osdc, osd);
+       } else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd],
+                         &osd->o_con.peer_addr,
+                         sizeof(osd->o_con.peer_addr)) == 0 &&
+                  !ceph_con_opened(&osd->o_con)) {
+               dout(" osd addr hasn't changed and connection never opened,"
+                    " letting msgr retry\n");
+               /* touch each r_stamp for handle_timeout()'s benefit */
+               list_for_each_entry(req, &osd->o_requests, r_osd_item)
+                       req->r_stamp = jiffies;
+               ret = -EAGAIN;
+       } else {
+               ceph_con_close(&osd->o_con);
+               ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]);
+               osd->o_incarnation++;
+       }
+       return ret;
+}
+
+/*
+ * Link @new into osdc->osds, an rbtree keyed by ->o_osd.  An osd with
+ * the same id already in the tree is a BUG().
+ */
+static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new)
+{
+       struct rb_node **link = &osdc->osds.rb_node;
+       struct rb_node *above = NULL;
+
+       while (*link) {
+               struct ceph_osd *cur;
+
+               above = *link;
+               cur = rb_entry(above, struct ceph_osd, o_node);
+
+               if (new->o_osd == cur->o_osd)
+                       BUG();
+               else if (new->o_osd < cur->o_osd)
+                       link = &above->rb_left;
+               else
+                       link = &above->rb_right;
+       }
+
+       rb_link_node(&new->o_node, above, link);
+       rb_insert_color(&new->o_node, &osdc->osds);
+}
+
+/*
+ * Find the osd session with id @o in osdc->osds.  Returns NULL if there
+ * is none.
+ */
+static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
+{
+       struct rb_node *node;
+
+       for (node = osdc->osds.rb_node; node; ) {
+               struct ceph_osd *cur = rb_entry(node, struct ceph_osd, o_node);
+
+               if (o == cur->o_osd)
+                       return cur;
+               node = o < cur->o_osd ? node->rb_left : node->rb_right;
+       }
+       return NULL;
+}
+
+/*
+ * Queue timeout_work to run osd_keepalive_timeout seconds from now.
+ */
+static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
+{
+       unsigned long delay =
+               osdc->client->options->osd_keepalive_timeout * HZ;
+
+       schedule_delayed_work(&osdc->timeout_work, delay);
+}
+
+static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
+{
+       cancel_delayed_work(&osdc->timeout_work);  /* undo __schedule_osd_timeout() */
+}
+
+/*
+ * Register a request: assign the next tid, insert it into the tid tree
+ * and take a reference for the registration.  The first registered
+ * request also arms the timeout work.  Takes request_mutex itself.
+ */
+static void register_request(struct ceph_osd_client *osdc,
+                            struct ceph_osd_request *req)
+{
+       mutex_lock(&osdc->request_mutex);
+
+       /* hand out the next tid and stamp it into the message header */
+       osdc->last_tid++;
+       req->r_tid = osdc->last_tid;
+       req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
+       INIT_LIST_HEAD(&req->r_req_lru_item);
+
+       dout("register_request %p tid %lld\n", req, req->r_tid);
+       __insert_request(osdc, req);
+       ceph_osdc_get_request(req);
+
+       /* the 0 -> 1 transition arms the timeout */
+       if (osdc->num_requests++ == 0) {
+               dout(" first request, scheduling timeout\n");
+               __schedule_osd_timeout(osdc);
+       }
+
+       mutex_unlock(&osdc->request_mutex);
+}
+
+/*
+ * Remove a request from the tid tree, detach it from its osd (revoking
+ * the queued message) and from the timeout lru, and drop the reference
+ * taken by register_request().  Cancels the timeout work once no
+ * requests remain.
+ *
+ * called under osdc->request_mutex
+ */
+static void __unregister_request(struct ceph_osd_client *osdc,
+                                struct ceph_osd_request *req)
+{
+       dout("__unregister_request %p tid %lld\n", req, req->r_tid);
+       rb_erase(&req->r_node, &osdc->requests);
+       osdc->num_requests--;
+
+       if (req->r_osd) {
+               /* make sure the original request isn't in flight. */
+               ceph_con_revoke(&req->r_osd->o_con, req->r_request);
+
+               list_del_init(&req->r_osd_item);
+               if (list_empty(&req->r_osd->o_requests))
+                       __move_osd_to_lru(osdc, req->r_osd);
+               req->r_osd = NULL;
+       }
+
+       /*
+        * Unlink from the lru before dropping the registration ref, so
+        * that req is never touched after a put that may have released
+        * the last reference.
+        */
+       list_del_init(&req->r_req_lru_item);
+       ceph_osdc_put_request(req);
+
+       if (osdc->num_requests == 0) {
+               dout(" no requests, canceling timeout\n");
+               __cancel_osd_timeout(osdc);
+       }
+}
+
+/*
+ * Cancel a previously queued request message: revoke it from the osd's
+ * connection if it was sent, clear r_sent, and take the request off the
+ * timeout lru.
+ */
+static void __cancel_request(struct ceph_osd_request *req)
+{
+       struct ceph_osd *osd = req->r_osd;
+
+       if (req->r_sent && osd) {
+               ceph_con_revoke(&osd->o_con, req->r_request);
+               req->r_sent = 0;
+       }
+
+       list_del_init(&req->r_req_lru_item);
+}
+
+/*
+ * Pick an osd (the first 'up' osd in the pg), allocate the osd struct
+ * (as needed), and set the request r_osd appropriately.  If there is
+ * no up osd, set r_osd to NULL.
+ *
+ * The full acting set is recorded in r_pg_osds/r_num_pg_osds.  When the
+ * target changes, any message already queued on the old osd is
+ * cancelled and the request is moved to the new osd's o_requests list.
+ *
+ * Return 0 if unchanged, 1 if changed, or negative on error (layout
+ * calculation failure, or -ENOMEM when a new osd cannot be allocated).
+ *
+ * Caller should hold map_sem for read and request_mutex.
+ */
+static int __map_osds(struct ceph_osd_client *osdc,
+                     struct ceph_osd_request *req)
+{
+       struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
+       struct ceph_pg pgid;
+       int acting[CEPH_PG_MAX_SIZE];
+       int o = -1, num = 0;
+       int err;
+
+       dout("map_osds %p tid %lld\n", req, req->r_tid);
+       err = ceph_calc_object_layout(&reqhead->layout, req->r_oid,
+                                     &req->r_file_layout, osdc->osdmap);
+       if (err)
+               return err;
+       pgid = reqhead->layout.ol_pgid;
+       req->r_pgid = pgid;
+
+       err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting);
+       if (err > 0) {  /* positive return is the acting set size */
+               o = acting[0];
+               num = err;
+       }
+
+       if ((req->r_osd && req->r_osd->o_osd == o &&
+            req->r_sent >= req->r_osd->o_incarnation &&
+            req->r_num_pg_osds == num &&
+            memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) ||
+           (req->r_osd == NULL && o == -1))
+               return 0;  /* no change */
+
+       dout("map_osds tid %llu pgid %d.%x osd%d (was osd%d)\n",
+            req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o,
+            req->r_osd ? req->r_osd->o_osd : -1);
+
+       /* record full pg acting set */
+       memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num);
+       req->r_num_pg_osds = num;
+
+       if (req->r_osd) {  /* detach from the previous osd */
+               __cancel_request(req);
+               list_del_init(&req->r_osd_item);
+               req->r_osd = NULL;
+       }
+
+       req->r_osd = __lookup_osd(osdc, o);  /* NULL if no session with osd o yet */
+       if (!req->r_osd && o >= 0) {
+               err = -ENOMEM;
+               req->r_osd = create_osd(osdc);
+               if (!req->r_osd)
+                       goto out;
+
+               dout("map_osds osd %p is osd%d\n", req->r_osd, o);
+               req->r_osd->o_osd = o;
+               req->r_osd->o_con.peer_name.num = cpu_to_le64(o);
+               __insert_osd(osdc, req->r_osd);
+
+               ceph_con_open(&req->r_osd->o_con, &osdc->osdmap->osd_addr[o]);
+       }
+
+       if (req->r_osd) {  /* osd is in use again: off the idle lru */
+               __remove_osd_from_lru(req->r_osd);
+               list_add(&req->r_osd_item, &req->r_osd->o_requests);
+       }
+       err = 1;   /* osd or pg changed */
+
+out:
+       return err;
+}
+
+/*
+ * Map the request to an osd and queue its message on that osd's
+ * connection.  If the pg currently has no up osd, nothing is sent and a
+ * newer osdmap is requested from the monitor instead.
+ *
+ * Returns 0 on success (including the "no up osd" case) or the negative
+ * error from __map_osds().
+ *
+ * caller should hold map_sem (for read) and request_mutex
+ */
+static int __send_request(struct ceph_osd_client *osdc,
+                         struct ceph_osd_request *req)
+{
+       struct ceph_osd_request_head *head;
+       struct ceph_osd *target;
+       int ret;
+
+       ret = __map_osds(osdc, req);
+       if (ret < 0)
+               return ret;
+
+       target = req->r_osd;
+       if (!target) {
+               dout("send_request %p no up osds in pg\n", req);
+               ceph_monc_request_next_osdmap(&osdc->client->monc);
+               return 0;
+       }
+
+       dout("send_request %p tid %llu to osd%d flags %d\n",
+            req, req->r_tid, target->o_osd, req->r_flags);
+
+       /* refresh the wire header for this attempt */
+       head = req->r_request->front.iov_base;
+       head->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch);
+       head->flags |= cpu_to_le32(req->r_flags);  /* e.g., RETRY */
+       head->reassert_version = req->r_reassert_version;
+
+       /* restart the timeout clock; newest send goes last on the lru */
+       req->r_stamp = jiffies;
+       list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
+
+       ceph_msg_get(req->r_request); /* send consumes a ref */
+       ceph_con_send(&target->o_con, req->r_request);
+       req->r_sent = target->o_incarnation;
+       return 0;
+}
+
+/*
+ * Timeout callback, called every N seconds when 1 or more osd
+ * requests has been active for more than N seconds.  When this
+ * happens, we ping all OSDs with requests who have timed out to
+ * ensure any communications channel reset is detected.  Reset the
+ * request timeouts another N seconds in the future as we go.
+ * Reschedule the timeout event another N seconds in future (unless
+ * there are no open requests).
+ *
+ * Three passes are made under map_sem (read) and request_mutex:
+ * resend requests flagged r_resend, kick osds whose oldest request
+ * exceeded osd_timeout, then send keepalives to osds whose requests
+ * exceeded osd_keepalive_timeout.
+ */
+static void handle_timeout(struct work_struct *work)
+{
+       struct ceph_osd_client *osdc =
+               container_of(work, struct ceph_osd_client, timeout_work.work);
+       struct ceph_osd_request *req, *last_req = NULL;
+       struct ceph_osd *osd;
+       unsigned long timeout = osdc->client->options->osd_timeout * HZ;
+       unsigned long keepalive =
+               osdc->client->options->osd_keepalive_timeout * HZ;
+       unsigned long last_stamp = 0;
+       struct rb_node *p;
+       struct list_head slow_osds;
+
+       dout("timeout\n");
+       down_read(&osdc->map_sem);
+
+       ceph_monc_request_next_osdmap(&osdc->client->monc);
+
+       mutex_lock(&osdc->request_mutex);
+       for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
+               req = rb_entry(p, struct ceph_osd_request, r_node);
+
+               if (req->r_resend) {  /* an earlier send or mapping failed */
+                       int err;
+
+                       dout("osdc resending prev failed %lld\n", req->r_tid);
+                       err = __send_request(osdc, req);
+                       if (err)
+                               dout("osdc failed again on %lld\n", req->r_tid);
+                       else
+                               req->r_resend = false;
+                       continue;
+               }
+       }
+
+       /*
+        * reset osds that appear to be _really_ unresponsive.  this
+        * is a failsafe measure.. we really shouldn't be getting to
+        * this point if the system is working properly.  the monitors
+        * should mark the osd as failed and we should find out about
+        * it from an updated osd map.
+        */
+       while (timeout && !list_empty(&osdc->req_lru)) {  /* osd_timeout of 0 skips this pass */
+               req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
+                                r_req_lru_item);
+
+               if (time_before(jiffies, req->r_stamp + timeout))
+                       break;
+
+               BUG_ON(req == last_req && req->r_stamp == last_stamp);  /* same head, same stamp: no progress */
+               last_req = req;
+               last_stamp = req->r_stamp;
+
+               osd = req->r_osd;
+               BUG_ON(!osd);
+               pr_warning(" tid %llu timed out on osd%d, will reset osd\n",
+                          req->r_tid, osd->o_osd);
+               __kick_requests(osdc, osd);
+       }
+
+       /*
+        * ping osds that are a bit slow.  this ensures that if there
+        * is a break in the TCP connection we will notice, and reopen
+        * a connection with that osd (from the fault callback).
+        */
+       INIT_LIST_HEAD(&slow_osds);
+       list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
+               if (time_before(jiffies, req->r_stamp + keepalive))
+                       break;
+
+               osd = req->r_osd;
+               BUG_ON(!osd);
+               dout(" tid %llu is slow, will send keepalive on osd%d\n",
+                    req->r_tid, osd->o_osd);
+               list_move_tail(&osd->o_keepalive_item, &slow_osds);  /* each osd is queued once */
+       }
+       while (!list_empty(&slow_osds)) {
+               osd = list_entry(slow_osds.next, struct ceph_osd,
+                                o_keepalive_item);
+               list_del_init(&osd->o_keepalive_item);
+               ceph_con_keepalive(&osd->o_con);
+       }
+
+       __schedule_osd_timeout(osdc);
+       mutex_unlock(&osdc->request_mutex);
+
+       up_read(&osdc->map_sem);
+}
+
+/*
+ * Periodic work: drop osd sessions that have sat idle on the lru past
+ * their ttl, then re-arm to run again after a quarter of osd_idle_ttl.
+ */
+static void handle_osds_timeout(struct work_struct *work)
+{
+       struct ceph_osd_client *osdc;
+       unsigned long rearm;
+
+       osdc = container_of(work, struct ceph_osd_client,
+                           osds_timeout_work.work);
+       rearm = (osdc->client->options->osd_idle_ttl * HZ) >> 2;
+
+       dout("osds timeout\n");
+
+       down_read(&osdc->map_sem);
+       remove_old_osds(osdc, 0);
+       up_read(&osdc->map_sem);
+
+       schedule_delayed_work(&osdc->osds_timeout_work,
+                             round_jiffies_relative(rearm));
+}
+
+/*
+ * handle osd op reply.  either call the callback if it is specified,
+ * or do the completion to wake up the waiting thread.
+ *
+ * The front of the message must be exactly a reply head followed by
+ * the object name and num_ops ops; anything else is logged as corrupt
+ * and dumped.  A reply whose tid is not registered is ignored.  The
+ * request is unregistered on error, on an ONDISK reply, or when the
+ * reply is not for a write.
+ */
+static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
+                        struct ceph_connection *con)
+{
+       struct ceph_osd_reply_head *rhead = msg->front.iov_base;
+       struct ceph_osd_request *req;
+       u64 tid;
+       int numops, object_len, flags;
+       s32 result;
+
+       tid = le64_to_cpu(msg->hdr.tid);
+       if (msg->front.iov_len < sizeof(*rhead))
+               goto bad;
+       numops = le32_to_cpu(rhead->num_ops);
+       object_len = le32_to_cpu(rhead->object_len);
+       result = le32_to_cpu(rhead->result);
+       if (msg->front.iov_len != sizeof(*rhead) + object_len +
+           numops * sizeof(struct ceph_osd_op))
+               goto bad;
+       dout("handle_reply %p tid %llu result %d\n", msg, tid, (int)result);
+
+       /* lookup */
+       mutex_lock(&osdc->request_mutex);
+       req = __lookup_request(osdc, tid);
+       if (req == NULL) {
+               dout("handle_reply tid %llu dne\n", tid);
+               mutex_unlock(&osdc->request_mutex);
+               return;
+       }
+       ceph_osdc_get_request(req);  /* held across the unlocked callbacks; dropped at done: */
+       flags = le32_to_cpu(rhead->flags);
+
+       /*
+        * if this connection filled our message, drop our reference now, to
+        * avoid a (safe but slower) revoke later.
+        */
+       if (req->r_con_filling_msg == con && req->r_reply == msg) {
+               dout(" dropping con_filling_msg ref %p\n", con);
+               req->r_con_filling_msg = NULL;
+               ceph_con_put(con);
+       }
+
+       if (!req->r_got_reply) {
+               unsigned bytes;
+
+               req->r_result = le32_to_cpu(rhead->result);
+               bytes = le32_to_cpu(msg->hdr.data_len);
+               dout("handle_reply result %d bytes %d\n", req->r_result,
+                    bytes);
+               if (req->r_result == 0)  /* success: report the data length instead */
+                       req->r_result = bytes;
+
+               /* in case this is a write and we need to replay, */
+               req->r_reassert_version = rhead->reassert_version;
+
+               req->r_got_reply = 1;
+       } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
+               dout("handle_reply tid %llu dup ack\n", tid);
+               mutex_unlock(&osdc->request_mutex);
+               goto done;
+       }
+
+       dout("handle_reply tid %llu flags %d\n", tid, flags);
+
+       /* either this is a read, or we got the safe response */
+       if (result < 0 ||
+           (flags & CEPH_OSD_FLAG_ONDISK) ||
+           ((flags & CEPH_OSD_FLAG_WRITE) == 0))
+               __unregister_request(osdc, req);
+
+       mutex_unlock(&osdc->request_mutex);
+
+       if (req->r_callback)
+               req->r_callback(req, msg);
+       else
+               complete_all(&req->r_completion);
+
+       if (flags & CEPH_OSD_FLAG_ONDISK) {
+               if (req->r_safe_callback)
+                       req->r_safe_callback(req, msg);
+               complete_all(&req->r_safe_completion);  /* fsync waiter */
+       }
+
+done:
+       ceph_osdc_put_request(req);
+       return;
+
+bad:
+       pr_err("corrupt osd_op_reply got %d %d expected %d\n",
+              (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len),
+              (int)sizeof(*rhead));
+       ceph_msg_dump(msg);
+}
+
+/*
+ * Reset osd sessions as needed and resubmit the affected requests.
+ *
+ * With @kickosd set, that osd is reset via __reset_osd(); if that
+ * reports -EAGAIN nothing is resent and 1 is returned.  With @kickosd
+ * NULL, every known osd that is no longer up, or whose address differs
+ * from the one in the current osdmap, is reset.
+ *
+ * Each registered request is then resent if it has r_resend set, is
+ * bound to @kickosd, or __map_osds() reports that its mapping changed.
+ * A mapping error or failed send sets r_resend, which handle_timeout()
+ * picks up later.
+ *
+ * Returns the number of requests that map to no osd (or 1 in the
+ * -EAGAIN case); kick_requests() treats non-zero as "need a newer map".
+ */
+static int __kick_requests(struct ceph_osd_client *osdc,
+                         struct ceph_osd *kickosd)
+{
+       struct ceph_osd_request *req;
+       struct rb_node *p, *n;
+       int needmap = 0;
+       int err;
+
+       dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1);
+       if (kickosd) {
+               err = __reset_osd(osdc, kickosd);
+               if (err == -EAGAIN)
+                       return 1;
+       } else {
+               for (p = rb_first(&osdc->osds); p; p = n) {
+                       struct ceph_osd *osd =
+                               rb_entry(p, struct ceph_osd, o_node);
+
+                       n = rb_next(p);  /* fetched first: __reset_osd() may remove osd */
+                       if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
+                           memcmp(&osd->o_con.peer_addr,
+                                  ceph_osd_addr(osdc->osdmap,
+                                                osd->o_osd),
+                                  sizeof(struct ceph_entity_addr)) != 0)
+                               __reset_osd(osdc, osd);
+               }
+       }
+
+       for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
+               req = rb_entry(p, struct ceph_osd_request, r_node);
+
+               if (req->r_resend) {
+                       dout(" r_resend set on tid %llu\n", req->r_tid);
+                       __cancel_request(req);
+                       goto kick;
+               }
+               if (req->r_osd && kickosd == req->r_osd) {
+                       __cancel_request(req);
+                       goto kick;
+               }
+
+               err = __map_osds(osdc, req);
+               if (err == 0)
+                       continue;  /* no change */
+               if (err < 0) {
+                       /*
+                        * FIXME: really, we should set the request
+                        * error and fail if this isn't a 'nofail'
+                        * request, but that's a fair bit more
+                        * complicated to do.  So retry!
+                        */
+                       dout(" setting r_resend on %llu\n", req->r_tid);
+                       req->r_resend = true;
+                       continue;
+               }
+               if (req->r_osd == NULL) {
+                       dout("tid %llu maps to no valid osd\n", req->r_tid);
+                       needmap++;  /* request a newer map */
+                       continue;
+               }
+
+kick:
+               dout("kicking %p tid %llu osd%d\n", req, req->r_tid,
+                    req->r_osd ? req->r_osd->o_osd : -1);
+               req->r_flags |= CEPH_OSD_FLAG_RETRY;
+               err = __send_request(osdc, req);
+               if (err) {
+                       dout(" setting r_resend on %llu\n", req->r_tid);
+                       req->r_resend = true;
+               }
+       }
+
+       return needmap;
+}
+
+/*
+ * Resubmit osd requests whose osd or osd address has changed.  Request
+ * a new osd map if osds are down, or we are otherwise unable to determine
+ * how to direct a request.
+ *
+ * Close connections to down osds.
+ *
+ * If @kickosd is specified, resubmit requests for that specific osd.
+ *
+ * Caller should hold map_sem for read; request_mutex is taken here.
+ */
+static void kick_requests(struct ceph_osd_client *osdc,
+                         struct ceph_osd *kickosd)
+{
+       int nr_unmapped;
+
+       mutex_lock(&osdc->request_mutex);
+       nr_unmapped = __kick_requests(osdc, kickosd);
+       mutex_unlock(&osdc->request_mutex);
+
+       if (!nr_unmapped)
+               return;
+
+       /* some requests have nowhere to go: ask the monitor for a newer map */
+       dout("%d requests for down osds, need new map\n", nr_unmapped);
+       ceph_monc_request_next_osdmap(&osdc->client->monc);
+}
+/*
+ * Process updated osd map.
+ *
+ * The message contains any number of incremental and full maps, normally
+ * indicating some sort of topology change in the cluster.  Kick requests
+ * off to different OSDs as needed.
+ *
+ * map_sem is held for write while maps are decoded and installed, then
+ * downgraded to read for kicking requests.  A message too short to
+ * carry the fsid is rejected before map_sem is taken, so that path
+ * leaves through bad_unlocked rather than bad.
+ */
+void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
+{
+       void *p, *end, *next;
+       u32 nr_maps, maplen;
+       u32 epoch;
+       struct ceph_osdmap *newmap = NULL, *oldmap;
+       int err;
+       struct ceph_fsid fsid;
+
+       dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
+       p = msg->front.iov_base;
+       end = p + msg->front.iov_len;
+
+       /* verify fsid; map_sem is not held yet */
+       ceph_decode_need(&p, end, sizeof(fsid), bad_unlocked);
+       ceph_decode_copy(&p, &fsid, sizeof(fsid));
+       if (ceph_check_fsid(osdc->client, &fsid) < 0)
+               return;
+
+       down_write(&osdc->map_sem);
+
+       /* incremental maps */
+       ceph_decode_32_safe(&p, end, nr_maps, bad);
+       dout(" %d inc maps\n", nr_maps);
+       while (nr_maps > 0) {
+               ceph_decode_need(&p, end, 2*sizeof(u32), bad);
+               epoch = ceph_decode_32(&p);
+               maplen = ceph_decode_32(&p);
+               ceph_decode_need(&p, end, maplen, bad);
+               next = p + maplen;
+               /* only the increment that directly follows our epoch applies */
+               if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) {
+                       dout("applying incremental map %u len %d\n",
+                            epoch, maplen);
+                       newmap = osdmap_apply_incremental(&p, next,
+                                                         osdc->osdmap,
+                                                         osdc->client->msgr);
+                       if (IS_ERR(newmap)) {
+                               err = PTR_ERR(newmap);
+                               goto bad;
+                       }
+                       BUG_ON(!newmap);
+                       if (newmap != osdc->osdmap) {
+                               ceph_osdmap_destroy(osdc->osdmap);
+                               osdc->osdmap = newmap;
+                       }
+               } else {
+                       dout("ignoring incremental map %u len %d\n",
+                            epoch, maplen);
+               }
+               p = next;
+               nr_maps--;
+       }
+       if (newmap)
+               goto done;
+
+       /* full maps */
+       ceph_decode_32_safe(&p, end, nr_maps, bad);
+       dout(" %d full maps\n", nr_maps);
+       while (nr_maps) {
+               ceph_decode_need(&p, end, 2*sizeof(u32), bad);
+               epoch = ceph_decode_32(&p);
+               maplen = ceph_decode_32(&p);
+               ceph_decode_need(&p, end, maplen, bad);
+               if (nr_maps > 1) {
+                       dout("skipping non-latest full map %u len %d\n",
+                            epoch, maplen);
+               } else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) {
+                       dout("skipping full map %u len %d, "
+                            "older than our %u\n", epoch, maplen,
+                            osdc->osdmap->epoch);
+               } else {
+                       dout("taking full map %u len %d\n", epoch, maplen);
+                       newmap = osdmap_decode(&p, p+maplen);
+                       if (IS_ERR(newmap)) {
+                               err = PTR_ERR(newmap);
+                               goto bad;
+                       }
+                       BUG_ON(!newmap);
+                       oldmap = osdc->osdmap;
+                       osdc->osdmap = newmap;
+                       if (oldmap)
+                               ceph_osdmap_destroy(oldmap);
+               }
+               p += maplen;
+               nr_maps--;
+       }
+
+done:
+       downgrade_write(&osdc->map_sem);
+       ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch);
+       if (newmap)
+               kick_requests(osdc, NULL);
+       up_read(&osdc->map_sem);
+       wake_up_all(&osdc->client->auth_wq);
+       return;
+
+bad:
+       /* reached only with map_sem held for write */
+       up_write(&osdc->map_sem);
+bad_unlocked:
+       pr_err("osdc handle_map corrupt msg\n");
+       ceph_msg_dump(msg);
+       return;
+}
+
+/*
+ * Register request, send initial attempt.
+ *
+ * If the send fails and @nofail is set, the request stays registered
+ * with r_resend set and 0 is returned; otherwise the request is
+ * unregistered and the error is returned.
+ */
+int ceph_osdc_start_request(struct ceph_osd_client *osdc,
+                           struct ceph_osd_request *req,
+                           bool nofail)
+{
+       struct ceph_msg *out = req->r_request;
+       int rc = 0;
+
+       /* attach the data payload to the outgoing message */
+       out->pages = req->r_pages;
+       out->nr_pages = req->r_num_pages;
+#ifdef CONFIG_BLOCK
+       out->bio = req->r_bio;
+#endif
+       out->trail = req->r_trail;
+
+       register_request(osdc, req);
+
+       down_read(&osdc->map_sem);
+       mutex_lock(&osdc->request_mutex);
+       /*
+        * register_request() dropped request_mutex before we retook it,
+        * so a racing kick_requests() may already have sent the message
+        * for us.  Only send if the request hasn't been touched yet.
+        */
+       if (req->r_sent == 0) {
+               rc = __send_request(osdc, req);
+               if (rc && nofail) {
+                       dout("osdc_start_request failed send, "
+                            " marking %lld\n", req->r_tid);
+                       req->r_resend = true;
+                       rc = 0;
+               } else if (rc) {
+                       __unregister_request(osdc, req);
+               }
+       }
+       mutex_unlock(&osdc->request_mutex);
+       up_read(&osdc->map_sem);
+       return rc;
+}
+EXPORT_SYMBOL(ceph_osdc_start_request);
+
+/*
+ * wait for a request to complete
+ *
+ * Returns the request's r_result once it completes.  If the wait is
+ * interrupted, the request is cancelled and unregistered and the
+ * negative value from the wait is returned.
+ */
+int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
+                          struct ceph_osd_request *req)
+{
+       int rc = wait_for_completion_interruptible(&req->r_completion);
+
+       if (rc >= 0) {
+               dout("wait_request tid %llu result %d\n", req->r_tid,
+                    req->r_result);
+               return req->r_result;
+       }
+
+       /* interrupted: abandon the request */
+       mutex_lock(&osdc->request_mutex);
+       __cancel_request(req);
+       __unregister_request(osdc, req);
+       mutex_unlock(&osdc->request_mutex);
+       dout("wait_request tid %llu canceled/timed out\n", req->r_tid);
+       return rc;
+}
+EXPORT_SYMBOL(ceph_osdc_wait_request);
+
+/*
+ * sync - wait for all in-flight requests to flush.  avoid starvation.
+ *
+ * Only write requests registered up to the tid that was current on
+ * entry are waited for, so requests submitted afterwards cannot keep
+ * the caller here.  request_mutex is dropped while waiting on each
+ * request's safe completion.
+ */
+void ceph_osdc_sync(struct ceph_osd_client *osdc)
+{
+       struct ceph_osd_request *req;
+       u64 last_tid, next_tid = 0;
+
+       mutex_lock(&osdc->request_mutex);
+       last_tid = osdc->last_tid;
+
+       for (req = __lookup_request_ge(osdc, next_tid);
+            req && req->r_tid <= last_tid;
+            req = __lookup_request_ge(osdc, next_tid)) {
+               next_tid = req->r_tid + 1;
+
+               /* reads have nothing to flush */
+               if (!(req->r_flags & CEPH_OSD_FLAG_WRITE))
+                       continue;
+
+               /* pin req so it survives while the mutex is dropped */
+               ceph_osdc_get_request(req);
+               mutex_unlock(&osdc->request_mutex);
+               dout("sync waiting on tid %llu (last is %llu)\n",
+                    req->r_tid, last_tid);
+               wait_for_completion(&req->r_safe_completion);
+               mutex_lock(&osdc->request_mutex);
+               ceph_osdc_put_request(req);
+       }
+
+       mutex_unlock(&osdc->request_mutex);
+       dout("sync done (thru tid %llu)\n", last_tid);
+}
+EXPORT_SYMBOL(ceph_osdc_sync);
+
+/*
+ * init, shutdown
+ *
+ * Initialise @osdc for @client: locks, empty request/osd trees and lru
+ * lists, the two delayed works, the request mempool and the op/op_reply
+ * message pools.  Returns 0 on success or a negative errno, in which
+ * case everything allocated here has been released and no work is left
+ * scheduled.
+ */
+int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
+{
+       int err;
+
+       dout("init\n");
+       osdc->client = client;
+       osdc->osdmap = NULL;
+       init_rwsem(&osdc->map_sem);
+       init_completion(&osdc->map_waiters);
+       osdc->last_requested_map = 0;
+       mutex_init(&osdc->request_mutex);
+       osdc->last_tid = 0;
+       osdc->osds = RB_ROOT;
+       INIT_LIST_HEAD(&osdc->osd_lru);
+       osdc->requests = RB_ROOT;
+       INIT_LIST_HEAD(&osdc->req_lru);
+       osdc->num_requests = 0;
+       INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
+       INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
+
+       err = -ENOMEM;
+       osdc->req_mempool = mempool_create_kmalloc_pool(10,
+                                       sizeof(struct ceph_osd_request));
+       if (!osdc->req_mempool)
+               goto out;
+
+       err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true,
+                               "osd_op");
+       if (err < 0)
+               goto out_mempool;
+       err = ceph_msgpool_init(&osdc->msgpool_op_reply,
+                               OSD_OPREPLY_FRONT_LEN, 10, true,
+                               "osd_op_reply");
+       if (err < 0)
+               goto out_msgpool;
+
+       /*
+        * Arm the idle-osd reaper only once nothing can fail any more,
+        * so an error return never leaves delayed work pending on an
+        * osdc the caller is about to discard.
+        */
+       schedule_delayed_work(&osdc->osds_timeout_work,
+          round_jiffies_relative(osdc->client->options->osd_idle_ttl * HZ));
+       return 0;
+
+out_msgpool:
+       ceph_msgpool_destroy(&osdc->msgpool_op);
+out_mempool:
+       mempool_destroy(osdc->req_mempool);
+out:
+       return err;
+}
+EXPORT_SYMBOL(ceph_osdc_init);
+
+/*
+ * Shut @osdc down: stop both delayed works, destroy the current osdmap
+ * (if any), drop the osds via remove_old_osds(), then release the
+ * request mempool and both message pools.
+ */
+void ceph_osdc_stop(struct ceph_osd_client *osdc)
+{
+       /* quiesce the timers first so neither handler runs during teardown */
+       cancel_delayed_work_sync(&osdc->timeout_work);
+       cancel_delayed_work_sync(&osdc->osds_timeout_work);
+       if (osdc->osdmap) {
+               ceph_osdmap_destroy(osdc->osdmap);
+               osdc->osdmap = NULL;
+       }
+       remove_old_osds(osdc, 1);
+       mempool_destroy(osdc->req_mempool);
+       ceph_msgpool_destroy(&osdc->msgpool_op);
+       ceph_msgpool_destroy(&osdc->msgpool_op_reply);
+}
+EXPORT_SYMBOL(ceph_osdc_stop);
+
+/*
+ * Synchronously read a contiguous extent into @pages.
+ *
+ * *plen is handed to ceph_osdc_new_request(), which may shorten it when
+ * the extent crosses an object/stripe boundary.  Returns the result of
+ * the completed request (bytes read) or a negative error.
+ */
+int ceph_osdc_readpages(struct ceph_osd_client *osdc,
+                       struct ceph_vino vino, struct ceph_file_layout *layout,
+                       u64 off, u64 *plen,
+                       u32 truncate_seq, u64 truncate_size,
+                       struct page **pages, int num_pages)
+{
+       struct ceph_osd_request *rd;
+       int ret;
+
+       dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
+            vino.snap, off, *plen);
+       rd = ceph_osdc_new_request(osdc, layout, vino, off, plen,
+                                  CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
+                                  NULL, 0, truncate_seq, truncate_size, NULL,
+                                  false, 1);
+       if (rd == NULL)
+               return -ENOMEM;
+
+       /* the extent may have been trimmed to the object boundary */
+       rd->r_pages = pages;
+
+       dout("readpages  final extent is %llu~%llu (%d pages)\n",
+            off, *plen, rd->r_num_pages);
+
+       ret = ceph_osdc_start_request(osdc, rd, false);
+       if (ret == 0)
+               ret = ceph_osdc_wait_request(osdc, rd);
+
+       ceph_osdc_put_request(rd);
+       dout("readpages result %d\n", ret);
+       return ret;
+}
+EXPORT_SYMBOL(ceph_osdc_readpages);
+
+/*
+ * Synchronously write @len bytes at @off from @pages.
+ *
+ * Only the head (CEPH_NOSNAP) may be written.  The request is always
+ * issued with CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE in addition to
+ * @flags.  Returns the (possibly shortened) length when the request
+ * result is 0, otherwise the request result or a negative error.
+ */
+int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
+                        struct ceph_file_layout *layout,
+                        struct ceph_snap_context *snapc,
+                        u64 off, u64 len,
+                        u32 truncate_seq, u64 truncate_size,
+                        struct timespec *mtime,
+                        struct page **pages, int num_pages,
+                        int flags, int do_sync, bool nofail)
+{
+       struct ceph_osd_request *wr;
+       int ret;
+
+       BUG_ON(vino.snap != CEPH_NOSNAP);
+       wr = ceph_osdc_new_request(osdc, layout, vino, off, &len,
+                                  CEPH_OSD_OP_WRITE,
+                                  flags | CEPH_OSD_FLAG_ONDISK |
+                                          CEPH_OSD_FLAG_WRITE,
+                                  snapc, do_sync,
+                                  truncate_seq, truncate_size, mtime,
+                                  nofail, 1);
+       if (wr == NULL)
+               return -ENOMEM;
+
+       /* len may have been trimmed to the object boundary */
+       wr->r_pages = pages;
+       dout("writepages %llu~%llu (%d pages)\n", off, len,
+            wr->r_num_pages);
+
+       ret = ceph_osdc_start_request(osdc, wr, nofail);
+       if (ret == 0)
+               ret = ceph_osdc_wait_request(osdc, wr);
+
+       ceph_osdc_put_request(wr);
+       if (!ret)
+               ret = len;
+       dout("writepages result %d\n", ret);
+       return ret;
+}
+EXPORT_SYMBOL(ceph_osdc_writepages);
+
+/*
+ * Messenger callback for a fully received message on an osd connection.
+ * Routes osd maps and op replies to their handlers, logs anything else,
+ * and always drops the reference on @msg before returning.
+ */
+static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
+{
+       struct ceph_osd *osd = con->private;
+       int type = le16_to_cpu(msg->hdr.type);
+
+       if (osd) {
+               struct ceph_osd_client *osdc = osd->o_osdc;
+
+               if (type == CEPH_MSG_OSD_MAP)
+                       ceph_osdc_handle_map(osdc, msg);
+               else if (type == CEPH_MSG_OSD_OPREPLY)
+                       handle_reply(osdc, msg, con);
+               else
+                       pr_err("received unknown message type %d %s\n", type,
+                              ceph_msg_type_name(type));
+       }
+
+       ceph_msg_put(msg);
+}
+
+/*
+ * lookup and return message for incoming reply.  set up reply message
+ * pages.
+ *
+ * The request is found by the tid in @hdr, under request_mutex.  On
+ * success a new reference on the request's reply message is returned,
+ * *skip is cleared and @con is recorded as the connection filling it.
+ *
+ * Returns NULL with *skip = 1 when the tid is unknown or the reply
+ * carries more data pages than the request provided, and NULL with
+ * *skip untouched when a larger reply message cannot be allocated.
+ */
+static struct ceph_msg *get_reply(struct ceph_connection *con,
+                                 struct ceph_msg_header *hdr,
+                                 int *skip)
+{
+       struct ceph_osd *osd = con->private;
+       struct ceph_osd_client *osdc = osd->o_osdc;
+       struct ceph_msg *m;
+       struct ceph_osd_request *req;
+       int front = le32_to_cpu(hdr->front_len);
+       int data_len = le32_to_cpu(hdr->data_len);
+       u64 tid;
+
+       tid = le64_to_cpu(hdr->tid);
+       mutex_lock(&osdc->request_mutex);
+       req = __lookup_request(osdc, tid);
+       if (!req) {
+               *skip = 1;
+               m = NULL;
+               pr_info("get_reply unknown tid %llu from osd%d\n", tid,
+                       osd->o_osd);
+               goto out;
+       }
+
+       /* another connection was still filling this reply: take it back */
+       if (req->r_con_filling_msg) {
+               dout("get_reply revoking msg %p from old con %p\n",
+                    req->r_reply, req->r_con_filling_msg);
+               ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
+               ceph_con_put(req->r_con_filling_msg);
+               req->r_con_filling_msg = NULL;
+       }
+
+       /* preallocated front too small: swap in a bigger reply message */
+       if (front > req->r_reply->front.iov_len) {
+               pr_warning("get_reply front %d > preallocated %d\n",
+                          front, (int)req->r_reply->front.iov_len);
+               m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS);
+               if (!m)
+                       goto out;
+               ceph_msg_put(req->r_reply);
+               req->r_reply = m;
+       }
+       m = ceph_msg_get(req->r_reply);
+
+       if (data_len > 0) {
+               unsigned data_off = le16_to_cpu(hdr->data_off);
+               int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
+
+               if (unlikely(req->r_num_pages < want)) {
+                       /*
+                        * Report the page count the request actually
+                        * provides; m->nr_pages has not been set up yet
+                        * at this point.
+                        */
+                       pr_warning("tid %lld reply %d > expected %d pages\n",
+                                  tid, want, req->r_num_pages);
+                       *skip = 1;
+                       ceph_msg_put(m);
+                       m = NULL;
+                       goto out;
+               }
+               m->pages = req->r_pages;
+               m->nr_pages = req->r_num_pages;
+#ifdef CONFIG_BLOCK
+               m->bio = req->r_bio;
+#endif
+       }
+       *skip = 0;
+       req->r_con_filling_msg = ceph_con_get(con);
+       dout("get_reply tid %lld %p\n", tid, m);
+
+out:
+       mutex_unlock(&osdc->request_mutex);
+       return m;
+
+}
+
+/*
+ * Messenger callback: provide a message to receive the incoming frame
+ * described by @hdr.  Osd maps get a freshly allocated message, op
+ * replies are matched to their request by get_reply(), and every other
+ * type is skipped (*skip = 1, NULL returned).
+ */
+static struct ceph_msg *alloc_msg(struct ceph_connection *con,
+                                 struct ceph_msg_header *hdr,
+                                 int *skip)
+{
+       struct ceph_osd *osd;
+       int type = le16_to_cpu(hdr->type);
+
+       if (type == CEPH_MSG_OSD_OPREPLY)
+               return get_reply(con, hdr, skip);
+
+       if (type == CEPH_MSG_OSD_MAP)
+               return ceph_msg_new(type, le32_to_cpu(hdr->front_len),
+                                   GFP_NOFS);
+
+       osd = con->private;
+       pr_info("alloc_msg unexpected msg type %d from osd%d\n", type,
+               osd->o_osd);
+       *skip = 1;
+       return NULL;
+}
+
+/*
+ * Wrappers to refcount containing ceph_osd struct
+ */
+/* Pin the osd that embeds @con; yields @con, or NULL if get_osd() fails. */
+static struct ceph_connection *get_osd_con(struct ceph_connection *con)
+{
+       struct ceph_osd *owner = con->private;
+
+       return get_osd(owner) ? con : NULL;
+}
+
+/* Release the reference on the osd that owns @con. */
+static void put_osd_con(struct ceph_connection *con)
+{
+       struct ceph_osd *owner = con->private;
+
+       put_osd(owner);
+}
+
+/*
+ * authentication
+ */
+/*
+ * Hand the messenger the authorizer buffers for this osd connection.
+ *
+ * An authorizer is created on first use and cached on the osd; with
+ * @force_new the cached one is destroyed and rebuilt first.  Returns 0
+ * and fills in the protocol and buffer out-parameters, or the error
+ * from create_authorizer().
+ */
+static int get_authorizer(struct ceph_connection *con,
+                         void **buf, int *len, int *proto,
+                         void **reply_buf, int *reply_len, int force_new)
+{
+       struct ceph_osd *o = con->private;
+       struct ceph_auth_client *ac = o->o_osdc->client->monc.auth;
+
+       if (o->o_authorizer && force_new) {
+               ac->ops->destroy_authorizer(ac, o->o_authorizer);
+               o->o_authorizer = NULL;
+       }
+
+       if (!o->o_authorizer) {
+               int err = ac->ops->create_authorizer(
+                       ac, CEPH_ENTITY_TYPE_OSD,
+                       &o->o_authorizer,
+                       &o->o_authorizer_buf,
+                       &o->o_authorizer_buf_len,
+                       &o->o_authorizer_reply_buf,
+                       &o->o_authorizer_reply_buf_len);
+
+               if (err)
+                       return err;
+       }
+
+       *proto = ac->protocol;
+       *buf = o->o_authorizer_buf;
+       *len = o->o_authorizer_buf_len;
+       *reply_buf = o->o_authorizer_reply_buf;
+       *reply_len = o->o_authorizer_reply_buf_len;
+       return 0;
+}
+
+
+/*
+ * Pass the peer's @len byte authorizer reply to the auth client's
+ * verify_authorizer_reply op, using the authorizer cached on the osd.
+ */
+static int verify_authorizer_reply(struct ceph_connection *con, int len)
+{
+       struct ceph_osd *osd = con->private;
+       struct ceph_auth_client *auth = osd->o_osdc->client->monc.auth;
+
+       return auth->ops->verify_authorizer_reply(auth, osd->o_authorizer,
+                                                 len);
+}
+
+/*
+ * Invalidate the OSD authorizer, when the auth backend implements that
+ * op, and then ask the monitor client to validate our auth state.
+ */
+static int invalidate_authorizer(struct ceph_connection *con)
+{
+       struct ceph_osd *osd = con->private;
+       struct ceph_client *client = osd->o_osdc->client;
+       struct ceph_auth_client *auth = client->monc.auth;
+
+       if (auth->ops->invalidate_authorizer)
+               auth->ops->invalidate_authorizer(auth, CEPH_ENTITY_TYPE_OSD);
+
+       return ceph_monc_validate_auth(&client->monc);
+}
+
+/*
+ * Connection callbacks for osd connections, wiring the helpers in this
+ * file into the ceph_connection_operations interface.
+ */
+static const struct ceph_connection_operations osd_con_ops = {
+       .get = get_osd_con,
+       .put = put_osd_con,
+       .dispatch = dispatch,
+       .get_authorizer = get_authorizer,
+       .verify_authorizer_reply = verify_authorizer_reply,
+       .invalidate_authorizer = invalidate_authorizer,
+       .alloc_msg = alloc_msg,
+       .fault = osd_reset,
+};
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c

new file mode 100644 (file)

index 0000000..d73f3f6
--- /dev/null
+++ b/net/ceph/osdmap.c
@@ -0,0 +1,1128 @@
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <asm/div64.h>
+
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/osdmap.h>
+#include <linux/ceph/decode.h>
+#include <linux/crush/hash.h>
+#include <linux/crush/mapper.h>
+
+/*
+ * Format the osd state bits in @state as text into @str, which holds
+ * @len bytes, and return @str.
+ *
+ * Produces "doesn't exist" for a zero state, otherwise "exists", "up"
+ * or "exists, up" depending on CEPH_OSD_EXISTS / CEPH_OSD_UP (an empty
+ * string if neither of those bits is set).  Nothing is written when
+ * @len is 0.
+ *
+ * The text is chosen first and written with a single snprintf(); the
+ * buffer is never passed as both destination and source argument, which
+ * is undefined for overlapping objects.
+ */
+char *ceph_osdmap_state_str(char *str, int len, int state)
+{
+       const char *desc;
+
+       if (!len)
+               return str;
+
+       if (!state)
+               desc = "doesn't exist";
+       else if ((state & CEPH_OSD_EXISTS) && (state & CEPH_OSD_UP))
+               desc = "exists, up";
+       else if (state & CEPH_OSD_EXISTS)
+               desc = "exists";
+       else if (state & CEPH_OSD_UP)
+               desc = "up";
+       else
+               desc = "";      /* only bits we do not name are set */
+
+       snprintf(str, len, "%s", desc);
+       return str;
+}
+
+/* maps */
+
+/* Number of significant bits in @t, i.e. 0 for 0, 1 for 1, 8 for 255. */
+static int calc_bits_of(unsigned t)
+{
+       int bits;
+
+       for (bits = 0; t != 0; t >>= 1)
+               bits++;
+       return bits;
+}
+
+/*
+ * Derive the four pg masks from the pool's pg counts: each foo_mask is
+ * the smallest value of the form 2^n-1 that covers foo-1.
+ */
+static void calc_pg_masks(struct ceph_pg_pool_info *pi)
+{
+       u32 pg = le32_to_cpu(pi->v.pg_num);
+       u32 pgp = le32_to_cpu(pi->v.pgp_num);
+       u32 lpg = le32_to_cpu(pi->v.lpg_num);
+       u32 lpgp = le32_to_cpu(pi->v.lpgp_num);
+
+       pi->pg_num_mask = (1 << calc_bits_of(pg - 1)) - 1;
+       pi->pgp_num_mask = (1 << calc_bits_of(pgp - 1)) - 1;
+       pi->lpg_num_mask = (1 << calc_bits_of(lpg - 1)) - 1;
+       pi->lpgp_num_mask = (1 << calc_bits_of(lpgp - 1)) - 1;
+}
+
+/*
+ * decode crush map
+ */
+/*
+ * Decode the uniform-bucket specific part of a bucket: a single u32
+ * item weight.  Returns 0, or -EINVAL if the buffer is too short.
+ */
+static int crush_decode_uniform_bucket(void **p, void *end,
+                                      struct crush_bucket_uniform *b)
+{
+       dout("crush_decode_uniform_bucket %p to %p\n", *p, end);
+       /*
+        * NOTE(review): this demands 1 + h.size u32s although only one is
+        * consumed below -- confirm against the encoder.
+        */
+       ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad);
+       b->item_weight = ceph_decode_32(p);
+       return 0;
+bad:
+       return -EINVAL;
+}
+
+/*
+ * Decode the list-bucket specific part of a bucket: h.size pairs of
+ * (item weight, running weight sum).  Both arrays are allocated here
+ * and stay attached to @b even on failure.  Returns 0, -ENOMEM or
+ * -EINVAL.
+ */
+static int crush_decode_list_bucket(void **p, void *end,
+                                   struct crush_bucket_list *b)
+{
+       int idx;
+
+       dout("crush_decode_list_bucket %p to %p\n", *p, end);
+
+       b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
+       if (!b->item_weights)
+               return -ENOMEM;
+
+       b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
+       if (!b->sum_weights)
+               return -ENOMEM;
+
+       ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
+       idx = 0;
+       while (idx < b->h.size) {
+               b->item_weights[idx] = ceph_decode_32(p);
+               b->sum_weights[idx] = ceph_decode_32(p);
+               idx++;
+       }
+       return 0;
+
+bad:
+       return -EINVAL;
+}
+
+/*
+ * Decode the tree-bucket specific part of a bucket: a u32 node count
+ * followed by that many u32 node weights.  Returns 0, -ENOMEM or
+ * -EINVAL.
+ */
+static int crush_decode_tree_bucket(void **p, void *end,
+                                   struct crush_bucket_tree *b)
+{
+       int idx;
+
+       dout("crush_decode_tree_bucket %p to %p\n", *p, end);
+       ceph_decode_32_safe(p, end, b->num_nodes, bad);
+
+       b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS);
+       if (!b->node_weights)
+               return -ENOMEM;
+
+       ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad);
+       idx = 0;
+       while (idx < b->num_nodes) {
+               b->node_weights[idx] = ceph_decode_32(p);
+               idx++;
+       }
+       return 0;
+
+bad:
+       return -EINVAL;
+}
+
+/*
+ * Decode the straw-bucket specific part of a bucket: h.size pairs of
+ * (item weight, straw).  Both arrays are allocated here and stay
+ * attached to @b even on failure.  Returns 0, -ENOMEM or -EINVAL.
+ */
+static int crush_decode_straw_bucket(void **p, void *end,
+                                    struct crush_bucket_straw *b)
+{
+       int idx;
+
+       dout("crush_decode_straw_bucket %p to %p\n", *p, end);
+
+       b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
+       if (!b->item_weights)
+               return -ENOMEM;
+
+       b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
+       if (!b->straws)
+               return -ENOMEM;
+
+       ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
+       idx = 0;
+       while (idx < b->h.size) {
+               b->item_weights[idx] = ceph_decode_32(p);
+               b->straws[idx] = ceph_decode_32(p);
+               idx++;
+       }
+       return 0;
+
+bad:
+       return -EINVAL;
+}
+
+/*
+ * Decode an encoded crush map spanning [@pbyval, @end).
+ *
+ * Layout consumed here: magic, max_buckets, max_rules, max_devices,
+ * then max_buckets bucket slots (an alg of 0 marks an empty slot) and
+ * max_rules rule slots (a zero flag marks an empty slot).  Trailing
+ * name maps are ignored.
+ *
+ * Returns the new map, or ERR_PTR(-EINVAL) for truncated or malformed
+ * input and ERR_PTR(-ENOMEM) on allocation failure.  A partially built
+ * map is released with crush_destroy().
+ *
+ * Error labels: "badmem" and "bad" set the errno themselves, "fail"
+ * keeps the value already in err (used for helper return codes).  This
+ * matters because err is overwritten with 0 by every successful bucket
+ * helper; a later "goto bad" must not end up returning ERR_PTR(0).
+ */
+static struct crush_map *crush_decode(void *pbyval, void *end)
+{
+       struct crush_map *c;
+       int err = -EINVAL;
+       int i, j;
+       void **p = &pbyval;
+       void *start = pbyval;
+       u32 magic;
+
+       dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
+
+       c = kzalloc(sizeof(*c), GFP_NOFS);
+       if (c == NULL)
+               return ERR_PTR(-ENOMEM);
+
+       ceph_decode_need(p, end, 4*sizeof(u32), bad);
+       magic = ceph_decode_32(p);
+       if (magic != CRUSH_MAGIC) {
+               pr_err("crush_decode magic %x != current %x\n",
+                      (unsigned)magic, (unsigned)CRUSH_MAGIC);
+               goto bad;
+       }
+       c->max_buckets = ceph_decode_32(p);
+       c->max_rules = ceph_decode_32(p);
+       c->max_devices = ceph_decode_32(p);
+
+       c->device_parents = kcalloc(c->max_devices, sizeof(u32), GFP_NOFS);
+       if (c->device_parents == NULL)
+               goto badmem;
+       c->bucket_parents = kcalloc(c->max_buckets, sizeof(u32), GFP_NOFS);
+       if (c->bucket_parents == NULL)
+               goto badmem;
+
+       c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS);
+       if (c->buckets == NULL)
+               goto badmem;
+       c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS);
+       if (c->rules == NULL)
+               goto badmem;
+
+       /* buckets */
+       for (i = 0; i < c->max_buckets; i++) {
+               int size = 0;
+               u32 alg;
+               struct crush_bucket *b;
+
+               ceph_decode_32_safe(p, end, alg, bad);
+               if (alg == 0) {
+                       c->buckets[i] = NULL;
+                       continue;
+               }
+               dout("crush_decode bucket %d off %x %p to %p\n",
+                    i, (int)(*p-start), *p, end);
+
+               switch (alg) {
+               case CRUSH_BUCKET_UNIFORM:
+                       size = sizeof(struct crush_bucket_uniform);
+                       break;
+               case CRUSH_BUCKET_LIST:
+                       size = sizeof(struct crush_bucket_list);
+                       break;
+               case CRUSH_BUCKET_TREE:
+                       size = sizeof(struct crush_bucket_tree);
+                       break;
+               case CRUSH_BUCKET_STRAW:
+                       size = sizeof(struct crush_bucket_straw);
+                       break;
+               default:
+                       goto bad;
+               }
+               BUG_ON(size == 0);
+               b = c->buckets[i] = kzalloc(size, GFP_NOFS);
+               if (b == NULL)
+                       goto badmem;
+
+               ceph_decode_need(p, end, 4*sizeof(u32), bad);
+               b->id = ceph_decode_32(p);
+               b->type = ceph_decode_16(p);
+               /*
+                * NOTE(review): the bucket was sized from the leading
+                * "alg" word, but the type-specific decode below keys
+                * off this byte -- confirm the two always agree.
+                */
+               b->alg = ceph_decode_8(p);
+               b->hash = ceph_decode_8(p);
+               b->weight = ceph_decode_32(p);
+               b->size = ceph_decode_32(p);
+
+               dout("crush_decode bucket size %d off %x %p to %p\n",
+                    b->size, (int)(*p-start), *p, end);
+
+               b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
+               if (b->items == NULL)
+                       goto badmem;
+               b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS);
+               if (b->perm == NULL)
+                       goto badmem;
+               b->perm_n = 0;
+
+               ceph_decode_need(p, end, b->size*sizeof(u32), bad);
+               for (j = 0; j < b->size; j++)
+                       b->items[j] = ceph_decode_32(p);
+
+               /* helper failures keep their own errno: jump to "fail" */
+               switch (b->alg) {
+               case CRUSH_BUCKET_UNIFORM:
+                       err = crush_decode_uniform_bucket(p, end,
+                                 (struct crush_bucket_uniform *)b);
+                       if (err < 0)
+                               goto fail;
+                       break;
+               case CRUSH_BUCKET_LIST:
+                       err = crush_decode_list_bucket(p, end,
+                              (struct crush_bucket_list *)b);
+                       if (err < 0)
+                               goto fail;
+                       break;
+               case CRUSH_BUCKET_TREE:
+                       err = crush_decode_tree_bucket(p, end,
+                               (struct crush_bucket_tree *)b);
+                       if (err < 0)
+                               goto fail;
+                       break;
+               case CRUSH_BUCKET_STRAW:
+                       err = crush_decode_straw_bucket(p, end,
+                               (struct crush_bucket_straw *)b);
+                       if (err < 0)
+                               goto fail;
+                       break;
+               }
+       }
+
+       /* rules */
+       dout("rule vec is %p\n", c->rules);
+       for (i = 0; i < c->max_rules; i++) {
+               u32 yes;
+               struct crush_rule *r;
+
+               ceph_decode_32_safe(p, end, yes, bad);
+               if (!yes) {
+                       dout("crush_decode NO rule %d off %x %p to %p\n",
+                            i, (int)(*p-start), *p, end);
+                       c->rules[i] = NULL;
+                       continue;
+               }
+
+               dout("crush_decode rule %d off %x %p to %p\n",
+                    i, (int)(*p-start), *p, end);
+
+               /* len */
+               ceph_decode_32_safe(p, end, yes, bad);
+#if BITS_PER_LONG == 32
+               if (yes > ULONG_MAX / sizeof(struct crush_rule_step))
+                       goto bad;
+#endif
+               r = c->rules[i] = kmalloc(sizeof(*r) +
+                                         yes*sizeof(struct crush_rule_step),
+                                         GFP_NOFS);
+               if (r == NULL)
+                       goto badmem;
+               dout(" rule %d is at %p\n", i, r);
+               r->len = yes;
+               ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */
+               ceph_decode_need(p, end, r->len*3*sizeof(u32), bad);
+               for (j = 0; j < r->len; j++) {
+                       r->steps[j].op = ceph_decode_32(p);
+                       r->steps[j].arg1 = ceph_decode_32(p);
+                       r->steps[j].arg2 = ceph_decode_32(p);
+               }
+       }
+
+       /* ignore trailing name maps. */
+
+       dout("crush_decode success\n");
+       return c;
+
+badmem:
+       err = -ENOMEM;
+       goto fail;
+bad:
+       err = -EINVAL;
+fail:
+       dout("crush_decode fail %d\n", err);
+       crush_destroy(c);
+       return ERR_PTR(err);
+}
+
+/*
+ * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
+ * to a set of osds)
+ */
+/*
+ * Three-way compare of two pgids, each reinterpreted as one u64.
+ * Returns -1, 0 or 1.
+ */
+static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
+{
+       u64 lhs = *(u64 *)&l;
+       u64 rhs = *(u64 *)&r;
+
+       if (lhs == rhs)
+               return 0;
+       return lhs < rhs ? -1 : 1;
+}
+
+/*
+ * Link @new into the pg_mapping rbtree at @root, ordered by pgid_cmp().
+ * Returns 0, or -EEXIST (tree untouched) if the pgid is already there.
+ */
+static int __insert_pg_mapping(struct ceph_pg_mapping *new,
+                              struct rb_root *root)
+{
+       struct rb_node **link = &root->rb_node;
+       struct rb_node *parent = NULL;
+
+       while (*link) {
+               struct ceph_pg_mapping *cur;
+               int c;
+
+               parent = *link;
+               cur = rb_entry(parent, struct ceph_pg_mapping, node);
+               c = pgid_cmp(new->pgid, cur->pgid);
+               if (c == 0)
+                       return -EEXIST;
+               link = (c < 0) ? &parent->rb_left : &parent->rb_right;
+       }
+
+       rb_link_node(&new->node, parent, link);
+       rb_insert_color(&new->node, root);
+       return 0;
+}
+
+/* Find the pg_mapping for @pgid in the rbtree at @root; NULL if absent. */
+static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
+                                                  struct ceph_pg pgid)
+{
+       struct rb_node *cur = root->rb_node;
+
+       while (cur) {
+               struct ceph_pg_mapping *pg =
+                       rb_entry(cur, struct ceph_pg_mapping, node);
+               int c = pgid_cmp(pgid, pg->pgid);
+
+               if (c == 0)
+                       return pg;
+               cur = (c < 0) ? cur->rb_left : cur->rb_right;
+       }
+       return NULL;
+}
+
+/*
+ * rbtree of pg pool info
+ *
+ * Link @new into the tree at @root, ordered by pool id.  Returns 0, or
+ * -EEXIST (tree untouched) if that id is already present.
+ */
+static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new)
+{
+       struct rb_node **link = &root->rb_node;
+       struct rb_node *parent = NULL;
+
+       while (*link) {
+               struct ceph_pg_pool_info *cur;
+
+               parent = *link;
+               cur = rb_entry(parent, struct ceph_pg_pool_info, node);
+               if (new->id == cur->id)
+                       return -EEXIST;
+               link = (new->id < cur->id) ? &parent->rb_left
+                                          : &parent->rb_right;
+       }
+
+       rb_link_node(&new->node, parent, link);
+       rb_insert_color(&new->node, root);
+       return 0;
+}
+
+/* Find the pool with the given @id in the rbtree at @root; NULL if absent. */
+static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
+{
+       struct rb_node *cur = root->rb_node;
+
+       while (cur) {
+               struct ceph_pg_pool_info *pi =
+                       rb_entry(cur, struct ceph_pg_pool_info, node);
+
+               if (id == pi->id)
+                       return pi;
+               cur = (id < pi->id) ? cur->rb_left : cur->rb_right;
+       }
+       return NULL;
+}
+
+/*
+ * Walk all pools of @map in tree order and return the id of the first
+ * one whose name equals @name, or -ENOENT.  Pools without a name are
+ * skipped.
+ */
+int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name)
+{
+       struct rb_node *rbp = rb_first(&map->pg_pools);
+
+       while (rbp) {
+               struct ceph_pg_pool_info *pi =
+                       rb_entry(rbp, struct ceph_pg_pool_info, node);
+
+               if (pi->name && !strcmp(pi->name, name))
+                       return pi->id;
+               rbp = rb_next(rbp);
+       }
+       return -ENOENT;
+}
+EXPORT_SYMBOL(ceph_pg_poolid_by_name);
+
+/* Unlink @pi from the pool rbtree at @root and free it and its name. */
+static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
+{
+       rb_erase(&pi->node, root);
+       kfree(pi->name);
+       kfree(pi);
+}
+
+/*
+ * Decode one pool into @pi: copy the fixed-size header into pi->v,
+ * derive the pg masks, then skip over the per-snap records and the
+ * removed-snap intervals, which are not kept.
+ *
+ * The caller must already have verified that sizeof(pi->v) bytes are
+ * available.  Every variable-length skip is checked against @end so a
+ * bogus count or name length cannot move *p past the buffer.
+ *
+ * Returns 0, or -EINVAL if the encoding runs past @end.
+ */
+static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
+{
+       unsigned n, m;
+       u64 skip;
+
+       ceph_decode_copy(p, &pi->v, sizeof(pi->v));
+       calc_pg_masks(pi);
+
+       /* num_snaps * snap_info_t */
+       n = le32_to_cpu(pi->v.num_snaps);
+       while (n--) {
+               /* fixed part of the record plus the u32 name length */
+               ceph_decode_need(p, end, sizeof(u64) + 1 + sizeof(u64) +
+                                sizeof(struct ceph_timespec) + sizeof(u32),
+                                bad);
+               *p += sizeof(u64) +       /* key */
+                       1 + sizeof(u64) + /* u8, snapid */
+                       sizeof(struct ceph_timespec);
+               m = ceph_decode_32(p);    /* snap name */
+               if (m > (size_t)(end - *p))
+                       goto bad;
+               *p += m;
+       }
+
+       /* removed snap intervals: pairs of u64; 64-bit math cannot wrap */
+       skip = (u64)le32_to_cpu(pi->v.num_removed_snap_intervals) *
+               sizeof(u64) * 2;
+       if (skip > (size_t)(end - *p))
+               goto bad;
+       *p += skip;
+       return 0;
+
+bad:
+       return -EINVAL;
+}
+
+/*
+ * Decode the pool name table: a u32 count followed by that many
+ * (u32 pool id, u32 length, name bytes) records.  Names of pools found
+ * in @map replace any previous name; records for unknown pools are
+ * skipped.
+ *
+ * A failed name allocation is tolerated: the pool is simply left
+ * without a name.  Returns 0, or -EINVAL if the encoding runs past
+ * @end.
+ */
+static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
+{
+       struct ceph_pg_pool_info *pi;
+       u32 num, len, pool;
+
+       ceph_decode_32_safe(p, end, num, bad);
+       dout(" %d pool names\n", num);
+       while (num--) {
+               ceph_decode_32_safe(p, end, pool, bad);
+               ceph_decode_32_safe(p, end, len, bad);
+               dout("  pool %d len %d\n", pool, len);
+               /*
+                * The name must lie entirely inside the buffer before it
+                * is copied or skipped; this also keeps len + 1 below
+                * from wrapping.
+                */
+               if (len > (size_t)(end - *p))
+                       goto bad;
+               pi = __lookup_pg_pool(&map->pg_pools, pool);
+               if (pi) {
+                       kfree(pi->name);
+                       pi->name = kmalloc(len + 1, GFP_NOFS);
+                       if (pi->name) {
+                               memcpy(pi->name, *p, len);
+                               pi->name[len] = '\0';
+                               dout("  name is %s\n", pi->name);
+                       }
+               }
+               *p += len;
+       }
+       return 0;
+
+bad:
+       return -EINVAL;
+}
+
+/*
+ * osd map
+ */
+/*
+ * Free @map and everything hanging off it: the crush map (if set), all
+ * pg_temp mappings, all pools, and the per-osd state, weight and
+ * address arrays.
+ */
+void ceph_osdmap_destroy(struct ceph_osdmap *map)
+{
+       struct rb_node *n;
+
+       dout("osdmap_destroy %p\n", map);
+       if (map->crush)
+               crush_destroy(map->crush);
+
+       /* drain the pg_temp tree, always taking the leftmost node */
+       while ((n = rb_first(&map->pg_temp)) != NULL) {
+               struct ceph_pg_mapping *pg =
+                       rb_entry(n, struct ceph_pg_mapping, node);
+
+               rb_erase(&pg->node, &map->pg_temp);
+               kfree(pg);
+       }
+
+       /* same for the pools */
+       while ((n = rb_first(&map->pg_pools)) != NULL)
+               __remove_pg_pool(&map->pg_pools,
+                                rb_entry(n, struct ceph_pg_pool_info, node));
+
+       kfree(map->osd_state);
+       kfree(map->osd_weight);
+       kfree(map->osd_addr);
+       kfree(map);
+}
+
+/*
+ * adjust max osd value.  reallocate arrays.
+ *
+ * Fresh zeroed state/addr/weight arrays with @max entries replace the
+ * current ones.  Existing entries are carried over, but never more
+ * than fit: when the map shrinks only the first @max are copied, so
+ * the new arrays cannot be overrun.
+ *
+ * Returns 0, or -ENOMEM with @map left unchanged.
+ */
+static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
+{
+       u8 *state;
+       struct ceph_entity_addr *addr;
+       u32 *weight;
+
+       state = kcalloc(max, sizeof(*state), GFP_NOFS);
+       addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
+       weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
+       if (state == NULL || addr == NULL || weight == NULL) {
+               kfree(state);
+               kfree(addr);
+               kfree(weight);
+               return -ENOMEM;
+       }
+
+       /* copy old? */
+       if (map->osd_state) {
+               /* bounded by the smaller of the old and new sizes */
+               int keep = min_t(int, max, map->max_osd);
+
+               memcpy(state, map->osd_state, keep*sizeof(*state));
+               memcpy(addr, map->osd_addr, keep*sizeof(*addr));
+               memcpy(weight, map->osd_weight, keep*sizeof(*weight));
+               kfree(map->osd_state);
+               kfree(map->osd_addr);
+               kfree(map->osd_weight);
+       }
+
+       map->osd_state = state;
+       map->osd_weight = weight;
+       map->osd_addr = addr;
+       map->max_osd = max;
+       return 0;
+}
+
+/*
+ * decode a full map.
+ *
+ * Allocates a new ceph_osdmap and fills it from the encoded buffer at
+ * *p, bounded by end.  On success the map is returned and *p is set to
+ * end (any trailing data is ignored).  On failure the partially built
+ * map is destroyed and an ERR_PTR() with a negative errno is returned.
+ */
+struct ceph_osdmap *osdmap_decode(void **p, void *end)
+{
+       struct ceph_osdmap *map;
+       u16 version;
+       u32 len, max, i;
+       u8 ev;
+       int err = -EINVAL;
+       void *start = *p;
+       struct ceph_pg_pool_info *pi;
+
+       dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p));
+
+       map = kzalloc(sizeof(*map), GFP_NOFS);
+       if (map == NULL)
+               return ERR_PTR(-ENOMEM);
+       map->pg_temp = RB_ROOT;
+
+       ceph_decode_16_safe(p, end, version, bad);
+       if (version > CEPH_OSDMAP_VERSION) {
+               pr_warning("got unknown v %d > %d of osdmap\n", version,
+                          CEPH_OSDMAP_VERSION);
+               goto bad;
+       }
+
+       ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad);
+       ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
+       map->epoch = ceph_decode_32(p);
+       ceph_decode_copy(p, &map->created, sizeof(map->created));
+       ceph_decode_copy(p, &map->modified, sizeof(map->modified));
+
+       /* pools */
+       ceph_decode_32_safe(p, end, max, bad);
+       while (max--) {
+               ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad);
+               pi = kzalloc(sizeof(*pi), GFP_NOFS);
+               if (!pi) {
+                       err = -ENOMEM;
+                       goto bad;
+               }
+               pi->id = ceph_decode_32(p);
+               ev = ceph_decode_8(p); /* encoding version */
+               if (ev > CEPH_PG_POOL_VERSION) {
+                       pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
+                                  ev, CEPH_PG_POOL_VERSION);
+                       kfree(pi);
+                       goto bad;
+               }
+               err = __decode_pool(p, end, pi);
+               if (err < 0) {
+                       /* pi is not in the tree yet; free it here */
+                       kfree(pi);
+                       goto bad;
+               }
+               __insert_pg_pool(&map->pg_pools, pi);
+
+               /*
+                * err is 0 here; restore the default so that a later
+                * decode failure does not return ERR_PTR(0), i.e. NULL.
+                */
+               err = -EINVAL;
+       }
+
+       if (version >= 5 && __decode_pool_names(p, end, map) < 0)
+               goto bad;
+
+       ceph_decode_32_safe(p, end, map->pool_max, bad);
+
+       ceph_decode_32_safe(p, end, map->flags, bad);
+
+       /* bounds-checked: nothing above guarantees room for max_osd */
+       ceph_decode_32_safe(p, end, max, bad);
+
+       /* (re)alloc osd arrays */
+       err = osdmap_set_max_osd(map, max);
+       if (err < 0)
+               goto bad;
+       dout("osdmap_decode max_osd = %d\n", map->max_osd);
+
+       /* osds */
+       err = -EINVAL;
+       ceph_decode_need(p, end, 3*sizeof(u32) +
+                        map->max_osd*(1 + sizeof(*map->osd_weight) +
+                                      sizeof(*map->osd_addr)), bad);
+       *p += 4; /* skip length field (should match max) */
+       ceph_decode_copy(p, map->osd_state, map->max_osd);
+
+       *p += 4; /* skip length field (should match max) */
+       for (i = 0; i < map->max_osd; i++)
+               map->osd_weight[i] = ceph_decode_32(p);
+
+       *p += 4; /* skip length field (should match max) */
+       ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr));
+       for (i = 0; i < map->max_osd; i++)
+               ceph_decode_addr(&map->osd_addr[i]);
+
+       /* pg_temp */
+       ceph_decode_32_safe(p, end, len, bad);
+       for (i = 0; i < len; i++) {
+               u32 n, j;
+               struct ceph_pg pgid;
+               struct ceph_pg_mapping *pg;
+
+               ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad);
+               ceph_decode_copy(p, &pgid, sizeof(pgid));
+               n = ceph_decode_32(p);
+               /* reject counts whose allocation size would wrap */
+               if (n > (UINT_MAX - sizeof(*pg)) / sizeof(u32))
+                       goto bad;
+               ceph_decode_need(p, end, n * sizeof(u32), bad);
+               pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS);
+               if (!pg) {
+                       err = -ENOMEM;
+                       goto bad;
+               }
+               pg->pgid = pgid;
+               pg->len = n;
+               for (j = 0; j < n; j++)
+                       pg->osds[j] = ceph_decode_32(p);
+
+               err = __insert_pg_mapping(pg, &map->pg_temp);
+               if (err) {
+                       /* not linked into the tree; free it here */
+                       kfree(pg);
+                       goto bad;
+               }
+               err = -EINVAL;  /* default for subsequent decode failures */
+               dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, n);
+       }
+
+       /* crush */
+       ceph_decode_32_safe(p, end, len, bad);
+       dout("osdmap_decode crush len %d from off 0x%x\n", len,
+            (int)(*p - start));
+       ceph_decode_need(p, end, len, bad);
+       map->crush = crush_decode(*p, end);
+       *p += len;
+       if (IS_ERR(map->crush)) {
+               err = PTR_ERR(map->crush);
+               map->crush = NULL;
+               goto bad;
+       }
+
+       /* ignore the rest of the map */
+       *p = end;
+
+       dout("osdmap_decode done %p %p\n", *p, end);
+       return map;
+
+bad:
+       dout("osdmap_decode fail\n");
+       ceph_osdmap_destroy(map);
+       return ERR_PTR(err);
+}
+
+/*
+ * decode and apply an incremental map update.
+ *
+ * The update at *p (bounded by end) is applied to map in place and map
+ * is returned.  If the incremental embeds a full map, that map is
+ * decoded with osdmap_decode() and returned instead.  On a decode or
+ * allocation failure an ERR_PTR() with a negative errno is returned;
+ * changes already applied to map are not rolled back.
+ *
+ * msgr is not referenced by this function.
+ */
+struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
+                                            struct ceph_osdmap *map,
+                                            struct ceph_messenger *msgr)
+{
+       struct crush_map *newcrush = NULL;
+       struct ceph_fsid fsid;
+       u32 epoch = 0;
+       struct ceph_timespec modified;
+       u32 len, pool;
+       __s32 new_pool_max, new_flags, max;
+       void *start = *p;
+       int err = -EINVAL;
+       u16 version;
+       struct rb_node *rbp;
+
+       ceph_decode_16_safe(p, end, version, bad);
+       if (version > CEPH_OSDMAP_INC_VERSION) {
+               pr_warning("got unknown v %d > %d of inc osdmap\n", version,
+                          CEPH_OSDMAP_INC_VERSION);
+               goto bad;
+       }
+
+       ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32),
+                        bad);
+       ceph_decode_copy(p, &fsid, sizeof(fsid));
+       epoch = ceph_decode_32(p);
+       BUG_ON(epoch != map->epoch+1);
+       ceph_decode_copy(p, &modified, sizeof(modified));
+       new_pool_max = ceph_decode_32(p);
+       new_flags = ceph_decode_32(p);
+
+       /* full map? */
+       ceph_decode_32_safe(p, end, len, bad);
+       if (len > 0) {
+               dout("apply_incremental full map len %d, %p to %p\n",
+                    len, *p, end);
+               return osdmap_decode(p, min(*p+len, end));
+       }
+
+       /* new crush? */
+       ceph_decode_32_safe(p, end, len, bad);
+       if (len > 0) {
+               dout("apply_incremental new crush map len %d, %p to %p\n",
+                    len, *p, end);
+               newcrush = crush_decode(*p, min(*p+len, end));
+               if (IS_ERR(newcrush))
+                       return ERR_CAST(newcrush);
+               *p += len;
+       }
+
+       /* new flags? */
+       if (new_flags >= 0)
+               map->flags = new_flags;
+       if (new_pool_max >= 0)
+               map->pool_max = new_pool_max;
+
+       ceph_decode_need(p, end, 5*sizeof(u32), bad);
+
+       /* new max? */
+       max = ceph_decode_32(p);
+       if (max >= 0) {
+               err = osdmap_set_max_osd(map, max);
+               if (err < 0)
+                       goto bad;
+       }
+
+       map->epoch++;
+       /* record the timestamp decoded from this incremental */
+       map->modified = modified;
+       if (newcrush) {
+               if (map->crush)
+                       crush_destroy(map->crush);
+               map->crush = newcrush;
+               newcrush = NULL;
+       }
+
+       /*
+        * err may be 0 after a successful osdmap_set_max_osd(); restore
+        * the default so a decode failure below cannot yield ERR_PTR(0),
+        * i.e. NULL.
+        */
+       err = -EINVAL;
+
+       /* new_pool */
+       ceph_decode_32_safe(p, end, len, bad);
+       while (len--) {
+               __u8 ev;
+               struct ceph_pg_pool_info *pi;
+
+               ceph_decode_32_safe(p, end, pool, bad);
+               ceph_decode_need(p, end, 1 + sizeof(pi->v), bad);
+               ev = ceph_decode_8(p);  /* encoding version */
+               if (ev > CEPH_PG_POOL_VERSION) {
+                       pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
+                                  ev, CEPH_PG_POOL_VERSION);
+                       goto bad;
+               }
+               pi = __lookup_pg_pool(&map->pg_pools, pool);
+               if (!pi) {
+                       pi = kzalloc(sizeof(*pi), GFP_NOFS);
+                       if (!pi) {
+                               err = -ENOMEM;
+                               goto bad;
+                       }
+                       pi->id = pool;
+                       __insert_pg_pool(&map->pg_pools, pi);
+               }
+               err = __decode_pool(p, end, pi);
+               if (err < 0)
+                       goto bad;
+               err = -EINVAL;  /* default for subsequent decode failures */
+       }
+       if (version >= 5 && __decode_pool_names(p, end, map) < 0)
+               goto bad;
+
+       /* old_pool */
+       ceph_decode_32_safe(p, end, len, bad);
+       while (len--) {
+               struct ceph_pg_pool_info *pi;
+
+               ceph_decode_32_safe(p, end, pool, bad);
+               pi = __lookup_pg_pool(&map->pg_pools, pool);
+               if (pi)
+                       __remove_pg_pool(&map->pg_pools, pi);
+       }
+
+       /* new_up */
+       err = -EINVAL;
+       ceph_decode_32_safe(p, end, len, bad);
+       while (len--) {
+               u32 osd;
+               struct ceph_entity_addr addr;
+               ceph_decode_32_safe(p, end, osd, bad);
+               ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad);
+               ceph_decode_addr(&addr);
+               pr_info("osd%d up\n", osd);
+               BUG_ON(osd >= map->max_osd);
+               map->osd_state[osd] |= CEPH_OSD_UP;
+               map->osd_addr[osd] = addr;
+       }
+
+       /* new_down */
+       ceph_decode_32_safe(p, end, len, bad);
+       while (len--) {
+               u32 osd;
+               /* osd id plus the one-byte clean flag that is skipped */
+               ceph_decode_need(p, end, sizeof(u32) + sizeof(u8), bad);
+               osd = ceph_decode_32(p);
+               (*p)++;  /* clean flag */
+               pr_info("osd%d down\n", osd);
+               if (osd < map->max_osd)
+                       map->osd_state[osd] &= ~CEPH_OSD_UP;
+       }
+
+       /* new_weight */
+       ceph_decode_32_safe(p, end, len, bad);
+       while (len--) {
+               u32 osd, off;
+               ceph_decode_need(p, end, sizeof(u32)*2, bad);
+               osd = ceph_decode_32(p);
+               off = ceph_decode_32(p);
+               pr_info("osd%d weight 0x%x %s\n", osd, off,
+                    off == CEPH_OSD_IN ? "(in)" :
+                    (off == CEPH_OSD_OUT ? "(out)" : ""));
+               if (osd < map->max_osd)
+                       map->osd_weight[osd] = off;
+       }
+
+       /* new_pg_temp */
+       rbp = rb_first(&map->pg_temp);
+       ceph_decode_32_safe(p, end, len, bad);
+       while (len--) {
+               struct ceph_pg_mapping *pg;
+               u32 j;
+               struct ceph_pg pgid;
+               u32 pglen;
+               ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
+               ceph_decode_copy(p, &pgid, sizeof(pgid));
+               pglen = ceph_decode_32(p);
+
+               /* remove any? */
+               while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping,
+                                               node)->pgid, pgid) <= 0) {
+                       struct ceph_pg_mapping *cur =
+                               rb_entry(rbp, struct ceph_pg_mapping, node);
+
+                       rbp = rb_next(rbp);
+                       dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
+                       rb_erase(&cur->node, &map->pg_temp);
+                       kfree(cur);
+               }
+
+               if (pglen) {
+                       /* reject counts whose allocation size would wrap */
+                       if (pglen > (UINT_MAX - sizeof(*pg)) / sizeof(u32))
+                               goto bad;
+
+                       /* insert */
+                       ceph_decode_need(p, end, pglen*sizeof(u32), bad);
+                       pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS);
+                       if (!pg) {
+                               err = -ENOMEM;
+                               goto bad;
+                       }
+                       pg->pgid = pgid;
+                       pg->len = pglen;
+                       for (j = 0; j < pglen; j++)
+                               pg->osds[j] = ceph_decode_32(p);
+                       err = __insert_pg_mapping(pg, &map->pg_temp);
+                       if (err) {
+                               kfree(pg);
+                               goto bad;
+                       }
+                       /* default for subsequent decode failures */
+                       err = -EINVAL;
+                       dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid,
+                            pglen);
+               }
+       }
+       while (rbp) {
+               struct ceph_pg_mapping *cur =
+                       rb_entry(rbp, struct ceph_pg_mapping, node);
+
+               rbp = rb_next(rbp);
+               dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
+               rb_erase(&cur->node, &map->pg_temp);
+               kfree(cur);
+       }
+
+       /* ignore the rest */
+       *p = end;
+       return map;
+
+bad:
+       pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n",
+              epoch, (int)(*p - start), *p, start, end);
+       print_hex_dump(KERN_DEBUG, "osdmap: ",
+                      DUMP_PREFIX_OFFSET, 16, 1,
+                      start, end - start, true);
+       if (newcrush)
+               crush_destroy(newcrush);
+       return ERR_PTR(err);
+}
+
+
+
+
+/*
+ * Map a file extent onto a single object extent.
+ *
+ * Given a file offset (off) and requested length (*plen), compute the
+ * object number (*ono) and the offset and length inside that object
+ * (*oxoff, *oxlen) from the layout's object size, stripe unit and
+ * stripe count.  *plen is trimmed to the length actually mapped.
+ *
+ * Only one stripe unit is mapped per call, until a stride can be
+ * passed back to the caller.
+ */
+void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
+                                  u64 off, u64 *plen,
+                                  u64 *ono,
+                                  u64 *oxoff, u64 *oxlen)
+{
+       u32 object_size = le32_to_cpu(layout->fl_object_size);
+       u32 stripe_unit = le32_to_cpu(layout->fl_stripe_unit);
+       u32 stripe_count = le32_to_cpu(layout->fl_stripe_count);
+       u32 units_per_object;
+       u32 blockno, stripeno, stripepos, objsetno;
+       u64 quotient, su_offset;
+
+       dout("mapping %llu~%llu  osize %u fl_su %u\n", off, *plen,
+            object_size, stripe_unit);
+       units_per_object = object_size / stripe_unit;
+       dout("osize %u / su %u = su_per_object %u\n", object_size,
+            stripe_unit, units_per_object);
+
+       BUG_ON((stripe_unit & ~PAGE_MASK) != 0);
+
+       /*
+        * One 64-bit division gives both the stripe unit index within
+        * the file (quotient) and the offset inside that unit
+        * (remainder).
+        */
+       quotient = off;
+       su_offset = do_div(quotient, stripe_unit);
+       blockno = quotient;
+       dout("off %llu / su %u = bl %u\n", off, stripe_unit, blockno);
+
+       stripeno = blockno / stripe_count;
+       stripepos = blockno % stripe_count;
+       objsetno = stripeno / units_per_object;
+
+       *ono = objsetno * stripe_count + stripepos;
+       dout("objset %u * sc %u = ono %u\n", objsetno, stripe_count,
+            (unsigned)*ono);
+
+       /* offset of the stripe unit in the object, plus offset in it */
+       *oxoff = su_offset + (stripeno % units_per_object) * stripe_unit;
+
+       /*
+        * The extent ends at the end of the request or at the end of
+        * the current stripe unit, whichever comes first.
+        */
+       *oxlen = min_t(u64, *plen, stripe_unit - su_offset);
+       *plen = *oxlen;
+
+       dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
+}
+EXPORT_SYMBOL(ceph_calc_file_object_mapping);
+
+/*
+ * calculate an object layout (i.e. pgid) from an oid,
+ * file_layout, and osdmap
+ *
+ * The placement seed is the hash of oid using the pool's object_hash;
+ * when the layout names a preferred osd (fl_pg_preferred >= 0) that
+ * value is added to the seed.  Fills in ol->ol_pgid and
+ * ol->ol_stripe_unit.
+ *
+ * Returns 0 on success, or -EIO if the layout's pool is not present in
+ * osdmap.
+ */
+int ceph_calc_object_layout(struct ceph_object_layout *ol,
+                           const char *oid,
+                           struct ceph_file_layout *fl,
+                           struct ceph_osdmap *osdmap)
+{
+       struct ceph_pg pgid;
+       s32 preferred = (s32)le32_to_cpu(fl->fl_pg_preferred);
+       int poolid = le32_to_cpu(fl->fl_pg_pool);
+       struct ceph_pg_pool_info *pool;
+       unsigned ps;
+
+       BUG_ON(!osdmap);
+
+       pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
+       if (!pool)
+               return -EIO;
+       ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid));
+       if (preferred >= 0)
+               ps += preferred;
+
+       pgid.ps = cpu_to_le16(ps);
+       pgid.preferred = cpu_to_le16(preferred);
+       pgid.pool = fl->fl_pg_pool;
+       if (preferred >= 0)
+               dout("calc_object_layout '%s' pgid %d.%xp%d\n", oid, poolid, ps,
+                    (int)preferred);
+       else
+               dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps);
+
+       ol->ol_pgid = pgid;
+       ol->ol_stripe_unit = fl->fl_object_stripe_unit;
+       return 0;
+}
+EXPORT_SYMBOL(ceph_calc_object_layout);
+
+/*
+ * Compute the raw (unfiltered) osd vector for pgid.
+ *
+ * If a pg_temp mapping exists for pgid, its osd array is returned and
+ * *num is set to its length.  Otherwise crush is run, writing at most
+ * *num entries into osds; *num is set to crush_do_rule()'s return
+ * value and osds is returned.  Returns NULL if the pool or a matching
+ * crush rule cannot be found.
+ */
+static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
+                       int *osds, int *num)
+{
+       struct ceph_pg_mapping *temp;
+       struct ceph_pg_pool_info *pool;
+       unsigned poolid, ps, pps;
+       int preferred;
+       int ruleno;
+       u32 modulus;
+       int mask;
+
+       /* an explicit pg_temp mapping overrides crush */
+       temp = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
+       if (temp) {
+               *num = temp->len;
+               return temp->osds;
+       }
+
+       /* crush */
+       poolid = le32_to_cpu(pgid.pool);
+       ps = le16_to_cpu(pgid.ps);
+       preferred = (s16)le16_to_cpu(pgid.preferred);
+
+       /* don't forcefeed bad device ids to crush */
+       if (preferred >= osdmap->max_osd ||
+           preferred >= osdmap->crush->max_devices)
+               preferred = -1;
+
+       pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
+       if (!pool)
+               return NULL;
+
+       ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset,
+                                pool->v.type, pool->v.size);
+       if (ruleno < 0) {
+               pr_err("no crush rule pool %d ruleset %d type %d size %d\n",
+                      poolid, pool->v.crush_ruleset, pool->v.type,
+                      pool->v.size);
+               return NULL;
+       }
+
+       /* the "l" variants apply when a preferred osd is in effect */
+       if (preferred >= 0) {
+               modulus = le32_to_cpu(pool->v.lpgp_num);
+               mask = pool->lpgp_num_mask;
+       } else {
+               modulus = le32_to_cpu(pool->v.pgp_num);
+               mask = pool->pgp_num_mask;
+       }
+       pps = ceph_stable_mod(ps, modulus, mask);
+       pps += poolid;
+
+       *num = crush_do_rule(osdmap->crush, ruleno, pps, osds,
+                            min_t(int, pool->v.size, *num),
+                            preferred, osdmap->osd_weight);
+       return osds;
+}
+
+/*
+ * Fill acting[] with the osds of pgid's raw mapping that are up,
+ * preserving their order.  Returns the number of entries stored, or
+ * -1 if no raw mapping could be calculated.
+ */
+int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
+                       int *acting)
+{
+       int raw[CEPH_PG_MAX_SIZE];
+       int nraw = CEPH_PG_MAX_SIZE;
+       int nacting = 0;
+       int idx = 0;
+       int *osds;
+
+       osds = calc_pg_raw(osdmap, pgid, raw, &nraw);
+       if (osds == NULL)
+               return -1;
+
+       /* the first up osd (acting[0]) is the primary */
+       while (idx < nraw) {
+               int osd = osds[idx++];
+
+               if (!ceph_osd_is_up(osdmap, osd))
+                       continue;
+               acting[nacting++] = osd;
+       }
+       return nacting;
+}
+
+/*
+ * Return the primary osd for pgid: the first osd in the raw mapping
+ * that is up.  Returns -1 if the mapping cannot be calculated or no
+ * osd in it is up.
+ */
+int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
+{
+       int raw[CEPH_PG_MAX_SIZE];
+       int nraw = CEPH_PG_MAX_SIZE;
+       int idx = 0;
+       int *osds;
+
+       osds = calc_pg_raw(osdmap, pgid, raw, &nraw);
+       if (osds == NULL)
+               return -1;
+
+       while (idx < nraw) {
+               int osd = osds[idx++];
+
+               if (ceph_osd_is_up(osdmap, osd))
+                       return osd;
+       }
+       return -1;
+}
+EXPORT_SYMBOL(ceph_calc_pg_primary);
diff --git a/net/ceph/pagelist.c b/net/ceph/pagelist.c

new file mode 100644 (file)

index 0000000..13cb409
--- /dev/null
+++ b/net/ceph/pagelist.c
@@ -0,0 +1,154 @@
+
+#include <linux/module.h>
+#include <linux/gfp.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/ceph/pagelist.h>
+
+/* kunmap() the last page of the list if it is currently mapped. */
+static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl)
+{
+       struct page *tail;
+
+       if (!pl->mapped_tail)
+               return;
+
+       tail = list_entry(pl->head.prev, struct page, lru);
+       kunmap(tail);
+       pl->mapped_tail = NULL;
+}
+
+/*
+ * Unmap the tail page, free every page on the list and drop the
+ * reserve.  Always returns 0.
+ */
+int ceph_pagelist_release(struct ceph_pagelist *pl)
+{
+       struct page *page, *next;
+
+       ceph_pagelist_unmap_tail(pl);
+       list_for_each_entry_safe(page, next, &pl->head, lru) {
+               list_del(&page->lru);
+               __free_page(page);
+       }
+       ceph_pagelist_free_reserve(pl);
+       return 0;
+}
+EXPORT_SYMBOL(ceph_pagelist_release);
+
+/*
+ * Append one page to the list and map it as the new tail.  The page is
+ * taken from the reserve when one is available, otherwise allocated.
+ * Returns 0 on success or -ENOMEM.
+ */
+static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
+{
+       struct page *page;
+
+       if (pl->num_pages_free) {
+               page = list_first_entry(&pl->free_list, struct page, lru);
+               list_del(&page->lru);
+               --pl->num_pages_free;
+       } else {
+               page = __page_cache_alloc(GFP_NOFS);
+       }
+       if (page == NULL)
+               return -ENOMEM;
+
+       pl->room += PAGE_SIZE;
+       /* the previous tail must be unmapped before it is replaced */
+       ceph_pagelist_unmap_tail(pl);
+       list_add_tail(&page->lru, &pl->head);
+       pl->mapped_tail = kmap(page);
+       return 0;
+}
+
+/*
+ * Copy len bytes from buf onto the end of the pagelist, adding pages
+ * as the tail fills up.  Returns 0 on success, or the error from
+ * ceph_pagelist_addpage(); bytes copied before the failure remain in
+ * the list.
+ */
+int ceph_pagelist_append(struct ceph_pagelist *pl, const void *buf, size_t len)
+{
+       while (pl->room < len) {
+               /* fill what is left of the tail, then grow by a page */
+               size_t chunk = pl->room;
+               int err;
+
+               memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK),
+                      buf, chunk);
+               pl->length += chunk;
+               pl->room -= chunk;
+               buf += chunk;
+               len -= chunk;
+
+               err = ceph_pagelist_addpage(pl);
+               if (err)
+                       return err;
+       }
+
+       /* the remainder fits in the current tail page */
+       memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len);
+       pl->room -= len;
+       pl->length += len;
+       return 0;
+}
+EXPORT_SYMBOL(ceph_pagelist_append);
+
+/**
+ * Preallocate enough reserve pages that the given amount of data can
+ * be appended to the pagelist without allocating.
+ * Returns: 0 on success, -ENOMEM on error.
+ */
+int ceph_pagelist_reserve(struct ceph_pagelist *pl, size_t space)
+{
+       size_t pages_needed;
+
+       /* the current tail page already has enough room */
+       if (space <= pl->room)
+               return 0;
+
+       /* bytes beyond the tail, rounded up to whole pages */
+       pages_needed = (space - pl->room + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+       while (pages_needed > pl->num_pages_free) {
+               struct page *page = __page_cache_alloc(GFP_NOFS);
+
+               if (page == NULL)
+                       return -ENOMEM;
+               list_add_tail(&page->lru, &pl->free_list);
+               ++pl->num_pages_free;
+       }
+       return 0;
+}
+EXPORT_SYMBOL(ceph_pagelist_reserve);
+
+/**
+ * Free every preallocated page on the reserve list.  Always returns 0;
+ * BUGs if the free-page counter is non-zero once the list is empty.
+ */
+int ceph_pagelist_free_reserve(struct ceph_pagelist *pl)
+{
+       struct page *page, *next;
+
+       list_for_each_entry_safe(page, next, &pl->free_list, lru) {
+               list_del(&page->lru);
+               __free_page(page);
+               --pl->num_pages_free;
+       }
+       BUG_ON(pl->num_pages_free);
+       return 0;
+}
+EXPORT_SYMBOL(ceph_pagelist_free_reserve);
+
+/**
+ * Record a truncation point: the pagelist, its current tail entry and
+ * the room left in that tail.
+ */
+void ceph_pagelist_set_cursor(struct ceph_pagelist *pl,
+                             struct ceph_pagelist_cursor *c)
+{
+       c->room = pl->room;
+       c->page_lru = pl->head.prev;
+       c->pl = pl;
+}
+EXPORT_SYMBOL(ceph_pagelist_set_cursor);
+
+/**
+ * Truncate a pagelist back to a cursor.  Pages added after the cursor
+ * was taken are moved to the reserve rather than freed.
+ * Returns: 0 on success,
+ *          -EINVAL if the cursor was taken on a different pagelist
+ *
+ * NOTE(review): pl->length is not adjusted here, although
+ * ceph_pagelist_append() derives the tail offset from it -- confirm
+ * whether callers compensate.
+ */
+int ceph_pagelist_truncate(struct ceph_pagelist *pl,
+                          struct ceph_pagelist_cursor *c)
+{
+       struct page *page;
+
+       if (c->pl != pl)
+               return -EINVAL;
+
+       ceph_pagelist_unmap_tail(pl);
+
+       /* peel pages off the end until the cursor's tail is last again */
+       for (;;) {
+               if (pl->head.prev == c->page_lru)
+                       break;
+               page = list_entry(pl->head.prev, struct page, lru);
+               list_del(&page->lru);                /* remove from pagelist */
+               list_add_tail(&page->lru, &pl->free_list); /* add to reserve */
+               ++pl->num_pages_free;
+       }
+       pl->room = c->room;
+
+       /* remap the (possibly different) tail page, if any remains */
+       if (list_empty(&pl->head))
+               return 0;
+       page = list_entry(pl->head.prev, struct page, lru);
+       pl->mapped_tail = kmap(page);
+       return 0;
+}
+EXPORT_SYMBOL(ceph_pagelist_truncate);
diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c

new file mode 100644 (file)

index 0000000..54caf06
--- /dev/null
+++ b/net/ceph/pagevec.c
@@ -0,0 +1,223 @@
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/namei.h>
+#include <linux/writeback.h>
+
+#include <linux/ceph/libceph.h>
+
+/*
+ * build a vector of user pages
+ *
+ * Pins num_pages pages of the current task's address space starting at
+ * data and returns them in a newly allocated array.  off and len are
+ * not used by this function.
+ *
+ * Returns the array on success.  On failure returns ERR_PTR(-ENOMEM)
+ * if the array cannot be allocated, the get_user_pages() error if it
+ * fails outright, or ERR_PTR(-EFAULT) if fewer than num_pages pages
+ * could be pinned; any pages pinned so far are released.
+ */
+struct page **ceph_get_direct_page_vector(const char __user *data,
+                                                int num_pages,
+                                                loff_t off, size_t len)
+{
+       struct page **pages;
+       int rc;
+       int i;
+
+       pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
+       if (!pages)
+               return ERR_PTR(-ENOMEM);
+
+       down_read(&current->mm->mmap_sem);
+       rc = get_user_pages(current, current->mm, (unsigned long)data,
+                           num_pages, 0, 0, pages, NULL);
+       up_read(&current->mm->mmap_sem);
+       /*
+        * A short count leaves the tail of pages[] uninitialized, so it
+        * must not be handed back as a complete vector.
+        */
+       if (rc < num_pages)
+               goto fail;
+       return pages;
+
+fail:
+       /* drop the references on whatever was pinned (none if rc < 0) */
+       for (i = 0; i < rc; i++)
+               put_page(pages[i]);
+       kfree(pages);
+       return ERR_PTR(rc < 0 ? rc : -EFAULT);
+}
+EXPORT_SYMBOL(ceph_get_direct_page_vector);
+
+/* put_page() each of the num_pages pages, then free the array itself. */
+void ceph_put_page_vector(struct page **pages, int num_pages)
+{
+       int idx = 0;
+
+       while (idx < num_pages)
+               put_page(pages[idx++]);
+       kfree(pages);
+}
+EXPORT_SYMBOL(ceph_put_page_vector);
+
+/* Free each of the num_pages pages (order 0), then the array itself. */
+void ceph_release_page_vector(struct page **pages, int num_pages)
+{
+       int idx = 0;
+
+       while (idx < num_pages)
+               __free_pages(pages[idx++], 0);
+       kfree(pages);
+}
+EXPORT_SYMBOL(ceph_release_page_vector);
+
+/*
+ * allocate a vector of new pages
+ *
+ * Returns an array of num_pages freshly allocated pages, or
+ * ERR_PTR(-ENOMEM); on failure nothing remains allocated.
+ */
+struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags)
+{
+       struct page **vec;
+       int n;
+
+       vec = kmalloc(sizeof(*vec) * num_pages, flags);
+       if (vec == NULL)
+               return ERR_PTR(-ENOMEM);
+
+       for (n = 0; n < num_pages; n++) {
+               vec[n] = __page_cache_alloc(flags);
+               if (vec[n])
+                       continue;
+               /* release the n pages obtained so far plus the array */
+               ceph_release_page_vector(vec, n);
+               return ERR_PTR(-ENOMEM);
+       }
+       return vec;
+}
+EXPORT_SYMBOL(ceph_alloc_page_vector);
+
+/*
+ * copy user data into a page vector
+ *
+ * Copying starts at the in-page part of off within pages[0].  Returns
+ * len on success, or -EFAULT if a copy_from_user() call makes no
+ * progress at all; partial progress is retried from where it stopped.
+ */
+int ceph_copy_user_to_page_vector(struct page **pages,
+                                        const char __user *data,
+                                        loff_t off, size_t len)
+{
+       int pageno = 0;
+       int pgoff = off & ~PAGE_CACHE_MASK;
+       int remaining = len;
+       int chunk, uncopied, done;
+
+       while (remaining > 0) {
+               chunk = min_t(int, PAGE_CACHE_SIZE-pgoff, remaining);
+               uncopied = copy_from_user(page_address(pages[pageno]) + pgoff,
+                                         data, chunk);
+               if (uncopied == chunk)
+                       return -EFAULT;
+
+               done = chunk - uncopied;
+               data += done;
+               remaining -= done;
+               pgoff += done;
+               /* move to the next page only once this one is full */
+               if (pgoff == PAGE_CACHE_SIZE) {
+                       pgoff = 0;
+                       pageno++;
+               }
+       }
+       return len;
+}
+EXPORT_SYMBOL(ceph_copy_user_to_page_vector);
+
+/*
+ * Copy len bytes from a kernel buffer into a page vector, starting at
+ * the in-page part of off within pages[0].  Returns len.
+ */
+int ceph_copy_to_page_vector(struct page **pages,
+                                   const char *data,
+                                   loff_t off, size_t len)
+{
+       int pageno = 0;
+       size_t pgoff = off & ~PAGE_CACHE_MASK;
+       size_t remaining = len;
+
+       while (remaining > 0) {
+               size_t chunk = min_t(size_t, PAGE_CACHE_SIZE-pgoff, remaining);
+
+               memcpy(page_address(pages[pageno]) + pgoff, data, chunk);
+               data += chunk;
+               remaining -= chunk;
+               pgoff += chunk;
+               if (pgoff == PAGE_CACHE_SIZE) {
+                       pgoff = 0;
+                       pageno++;
+               }
+       }
+       return len;
+}
+EXPORT_SYMBOL(ceph_copy_to_page_vector);
+
+/*
+ * Copy len bytes out of a page vector into a kernel buffer, starting
+ * at the in-page part of off within pages[0].  Returns len.
+ */
+int ceph_copy_from_page_vector(struct page **pages,
+                                   char *data,
+                                   loff_t off, size_t len)
+{
+       int pageno = 0;
+       size_t pgoff = off & ~PAGE_CACHE_MASK;
+       size_t remaining = len;
+
+       while (remaining > 0) {
+               size_t chunk = min_t(size_t, PAGE_CACHE_SIZE-pgoff, remaining);
+
+               memcpy(data, page_address(pages[pageno]) + pgoff, chunk);
+               data += chunk;
+               remaining -= chunk;
+               pgoff += chunk;
+               if (pgoff == PAGE_CACHE_SIZE) {
+                       pgoff = 0;
+                       pageno++;
+               }
+       }
+       return len;
+}
+EXPORT_SYMBOL(ceph_copy_from_page_vector);
+
+/*
+ * copy data from a page vector into a user pointer
+ *
+ * Copying starts at the in-page part of off within pages[0].  Returns
+ * len on success, or -EFAULT if a copy_to_user() call makes no
+ * progress at all; partial progress is retried from where it stopped.
+ */
+int ceph_copy_page_vector_to_user(struct page **pages,
+                                        char __user *data,
+                                        loff_t off, size_t len)
+{
+       int i = 0;
+       int po = off & ~PAGE_CACHE_MASK;
+       int left = len;
+       int l, bad;
+
+       while (left > 0) {
+               l = min_t(int, left, PAGE_CACHE_SIZE-po);
+               bad = copy_to_user(data, page_address(pages[i]) + po, l);
+               if (bad == l)
+                       return -EFAULT;
+               data += l - bad;
+               left -= l - bad;
+               /*
+                * Track the in-page offset unconditionally and advance
+                * to the next page only when this one is exhausted, so
+                * that a partial copy resumes at the right byte (same
+                * bookkeeping as ceph_copy_user_to_page_vector()).
+                */
+               po += l - bad;
+               if (po == PAGE_CACHE_SIZE) {
+                       po = 0;
+                       i++;
+               }
+       }
+       return len;
+}
+EXPORT_SYMBOL(ceph_copy_page_vector_to_user);
+
+/*
+ * Zero len bytes of a page vector, starting off bytes from the start
+ * of the first page.  The range is handled as an optional partial
+ * leading page, a run of whole pages, and an optional partial trailing
+ * page.
+ */
+void ceph_zero_page_vector_range(int off, int len, struct page **pages)
+{
+       int i = off >> PAGE_CACHE_SHIFT;
+
+       off &= ~PAGE_CACHE_MASK;
+
+       dout("zero_page_vector_page %u~%u\n", off, len);
+
+       /* leading partial page? */
+       if (off) {
+               int end = min((int)PAGE_CACHE_SIZE, off + len);
+
+               dout("zeroing %d %p head from %d\n", i, pages[i],
+                    (int)off);
+               zero_user_segment(pages[i], off, end);
+               len -= (end - off);
+               i++;
+       }
+
+       /* whole pages */
+       for (; len >= PAGE_CACHE_SIZE; len -= PAGE_CACHE_SIZE, i++) {
+               dout("zeroing %d %p len=%d\n", i, pages[i], len);
+               zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
+       }
+
+       /* trailing partial page? */
+       if (!len)
+               return;
+       dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len);
+       zero_user_segment(pages[i], 0, len);
+}
+EXPORT_SYMBOL(ceph_zero_page_vector_range);
+
diff --git a/net/core/sock.c b/net/core/sock.c

index ef30e9d..7d99e13 100644 (file)
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1078,8 +1078,11 @@ static void sk_prot_free(struct proto *prot, struct sock *sk)
  #ifdef CONFIG_CGROUPS
  void sock_update_classid(struct sock *sk)
  {
-       u32 classid = task_cls_classid(current);
+       u32 classid;
  
+       rcu_read_lock();  /* doing current task, which cannot vanish. */
+       classid = task_cls_classid(current);
+       rcu_read_unlock();
         if (classid && classid != sk->sk_classid)
                 sk->sk_classid = classid;
  }
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c

index 244f7cb..37f8adb 100644 (file)
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -11,6 +11,7 @@
  #include <linux/proc_fs.h>
  #include <linux/seq_file.h>
  #include <linux/percpu.h>
+#include <linux/security.h>
  #include <net/net_namespace.h>
  
  #include <linux/netfilter.h>
@@ -87,6 +88,29 @@ static void ct_seq_stop(struct seq_file *s, void *v)
         rcu_read_unlock();
  }
  
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+/*
+ * Print the security context string for ct's secmark as "secctx=... ".
+ * Returns the error from security_secid_to_secctx() if the lookup
+ * fails, otherwise the seq_printf() result.
+ */
+static int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
+{
+       char *secctx;
+       u32 len;
+       int ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
+
+       if (ret == 0) {
+               ret = seq_printf(s, "secctx=%s ", secctx);
+               security_release_secctx(secctx, len);
+       }
+       return ret;
+}
+#else
+static inline int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
+{
+       return 0;
+}
+#endif
+
  static int ct_seq_show(struct seq_file *s, void *v)
  {
         struct nf_conntrack_tuple_hash *hash = v;
@@ -148,10 +172,8 @@ static int ct_seq_show(struct seq_file *s, void *v)
                 goto release;
  #endif
  
-#ifdef CONFIG_NF_CONNTRACK_SECMARK
-       if (seq_printf(s, "secmark=%u ", ct->secmark))
+       if (ct_show_secctx(s, ct))
                 goto release;
-#endif
  
         if (seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use)))
                 goto release;
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c

index 8c8632d..957c924 100644 (file)
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -38,7 +38,7 @@ static DEFINE_SPINLOCK(nf_nat_lock);
  static struct nf_conntrack_l3proto *l3proto __read_mostly;
  
  #define MAX_IP_NAT_PROTO 256
-static const struct nf_nat_protocol *nf_nat_protos[MAX_IP_NAT_PROTO]
+static const struct nf_nat_protocol __rcu *nf_nat_protos[MAX_IP_NAT_PROTO]
                                                 __read_mostly;
  
  static inline const struct nf_nat_protocol *
diff --git a/net/netfilter/core.c b/net/netfilter/core.c

index 78b505d..fdaec7d 100644 (file)
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -27,7 +27,7 @@
  
  static DEFINE_MUTEX(afinfo_mutex);
  
-const struct nf_afinfo *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly;
+const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly;
  EXPORT_SYMBOL(nf_afinfo);
  
  int nf_register_afinfo(const struct nf_afinfo *afinfo)
diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c

index cdcc764..5702de3 100644 (file)
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -26,10 +26,10 @@
  
  static DEFINE_MUTEX(nf_ct_ecache_mutex);
  
-struct nf_ct_event_notifier *nf_conntrack_event_cb __read_mostly;
+struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb __read_mostly;
  EXPORT_SYMBOL_GPL(nf_conntrack_event_cb);
  
-struct nf_exp_event_notifier *nf_expect_event_cb __read_mostly;
+struct nf_exp_event_notifier __rcu *nf_expect_event_cb __read_mostly;
  EXPORT_SYMBOL_GPL(nf_expect_event_cb);
  
  /* deliver cached events and clear cache entry - must be called with locally
diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c

index 8d9e4c9..bd82450 100644 (file)
--- a/net/netfilter/nf_conntrack_extend.c
+++ b/net/netfilter/nf_conntrack_extend.c
@@ -16,7 +16,7 @@
  #include <linux/skbuff.h>
  #include <net/netfilter/nf_conntrack_extend.h>
  
-static struct nf_ct_ext_type *nf_ct_ext_types[NF_CT_EXT_NUM];
+static struct nf_ct_ext_type __rcu *nf_ct_ext_types[NF_CT_EXT_NUM];
  static DEFINE_MUTEX(nf_ct_ext_type_mutex);
  
  void __nf_ct_ext_destroy(struct nf_conn *ct)
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c

index 5bae1cd..146476c 100644 (file)
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -22,6 +22,7 @@
  #include <linux/rculist_nulls.h>
  #include <linux/types.h>
  #include <linux/timer.h>
+#include <linux/security.h>
  #include <linux/skbuff.h>
  #include <linux/errno.h>
  #include <linux/netlink.h>
@@ -245,16 +246,31 @@ nla_put_failure:
  
  #ifdef CONFIG_NF_CONNTRACK_SECMARK
  static inline int
-ctnetlink_dump_secmark(struct sk_buff *skb, const struct nf_conn *ct)
+ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct)
  {
-       NLA_PUT_BE32(skb, CTA_SECMARK, htonl(ct->secmark));
-       return 0;
+       struct nlattr *nest_secctx;
+       int len, ret;
+       char *secctx;
+
+       ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
+       if (ret)
+               return ret;
+
+       ret = -1;
+       nest_secctx = nla_nest_start(skb, CTA_SECCTX | NLA_F_NESTED);
+       if (!nest_secctx)
+               goto nla_put_failure;
+
+       NLA_PUT_STRING(skb, CTA_SECCTX_NAME, secctx);
+       nla_nest_end(skb, nest_secctx);
  
+       ret = 0;
  nla_put_failure:
-       return -1;
+       security_release_secctx(secctx, len);
+       return ret;
  }
  #else
-#define ctnetlink_dump_secmark(a, b) (0)
+#define ctnetlink_dump_secctx(a, b) (0)
  #endif
  
  #define master_tuple(ct) &(ct->master->tuplehash[IP_CT_DIR_ORIGINAL].tuple)
@@ -391,7 +407,7 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
             ctnetlink_dump_protoinfo(skb, ct) < 0 ||
             ctnetlink_dump_helpinfo(skb, ct) < 0 ||
             ctnetlink_dump_mark(skb, ct) < 0 ||
-           ctnetlink_dump_secmark(skb, ct) < 0 ||
+           ctnetlink_dump_secctx(skb, ct) < 0 ||
             ctnetlink_dump_id(skb, ct) < 0 ||
             ctnetlink_dump_use(skb, ct) < 0 ||
             ctnetlink_dump_master(skb, ct) < 0 ||
@@ -437,6 +453,17 @@ ctnetlink_counters_size(const struct nf_conn *ct)
                ;
  }
  
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+static int ctnetlink_nlmsg_secctx_size(const struct nf_conn *ct)
+{
+       u32 len;
+
+       if (security_secid_to_secctx(ct->secmark, NULL, &len))
+               return 0;       /* lookup failed: len may be unset */
+       return sizeof(char) * len;
+}
+#endif
+
  static inline size_t
  ctnetlink_nlmsg_size(const struct nf_conn *ct)
  {
@@ -453,7 +480,8 @@ ctnetlink_nlmsg_size(const struct nf_conn *ct)
                + nla_total_size(0) /* CTA_HELP */
                + nla_total_size(NF_CT_HELPER_NAME_LEN) /* CTA_HELP_NAME */
  #ifdef CONFIG_NF_CONNTRACK_SECMARK
-              + nla_total_size(sizeof(u_int32_t)) /* CTA_SECMARK */
+              + nla_total_size(0) /* CTA_SECCTX */
+              + nla_total_size(ctnetlink_nlmsg_secctx_size(ct)) /* CTA_SECCTX_NAME */
  #endif
  #ifdef CONFIG_NF_NAT_NEEDED
                + 2 * nla_total_size(0) /* CTA_NAT_SEQ_ADJ_ORIG|REPL */
@@ -556,7 +584,7 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
  
  #ifdef CONFIG_NF_CONNTRACK_SECMARK
                 if ((events & (1 << IPCT_SECMARK) || ct->secmark)
-                   && ctnetlink_dump_secmark(skb, ct) < 0)
+                   && ctnetlink_dump_secctx(skb, ct) < 0)
                         goto nla_put_failure;
  #endif
  
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c

index 5886ba1..ed6d929 100644 (file)
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -28,8 +28,8 @@
  #include <net/netfilter/nf_conntrack_l4proto.h>
  #include <net/netfilter/nf_conntrack_core.h>
  
-static struct nf_conntrack_l4proto **nf_ct_protos[PF_MAX] __read_mostly;
-struct nf_conntrack_l3proto *nf_ct_l3protos[AF_MAX] __read_mostly;
+static struct nf_conntrack_l4proto __rcu **nf_ct_protos[PF_MAX] __read_mostly;
+struct nf_conntrack_l3proto __rcu *nf_ct_l3protos[AF_MAX] __read_mostly;
  EXPORT_SYMBOL_GPL(nf_ct_l3protos);
  
  static DEFINE_MUTEX(nf_ct_proto_mutex);
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c

index eb973fc..0fb6570 100644 (file)
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -15,6 +15,7 @@
  #include <linux/seq_file.h>
  #include <linux/percpu.h>
  #include <linux/netdevice.h>
+#include <linux/security.h>
  #include <net/net_namespace.h>
  #ifdef CONFIG_SYSCTL
  #include <linux/sysctl.h>
@@ -108,6 +109,29 @@ static void ct_seq_stop(struct seq_file *s, void *v)
         rcu_read_unlock();
  }
  
+#ifdef CONFIG_NF_CONNTRACK_SECMARK
+/* Print "secctx=<ctx> " for @ct; non-zero only if seq_printf() fails. */
+static int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
+{
+       int ret;
+       u32 len;
+       char *secctx;
+
+       ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
+       if (ret)
+               return 0;       /* no context: omit the field, keep the entry */
+
+       ret = seq_printf(s, "secctx=%s ", secctx);
+       security_release_secctx(secctx, len);
+       return ret;
+}
+#else
+static inline int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
+{
+       return 0;
+}
+#endif
+
  /* return 0 on success, 1 in case of error */
  static int ct_seq_show(struct seq_file *s, void *v)
  {
@@ -168,10 +192,8 @@ static int ct_seq_show(struct seq_file *s, void *v)
                 goto release;
  #endif
  
-#ifdef CONFIG_NF_CONNTRACK_SECMARK
-       if (seq_printf(s, "secmark=%u ", ct->secmark))
+       if (ct_show_secctx(s, ct))
                 goto release;
-#endif
  
  #ifdef CONFIG_NF_CONNTRACK_ZONES
         if (seq_printf(s, "zone=%u ", nf_ct_zone(ct)))
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c

index 7df37fd..b07393e 100644 (file)
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -16,7 +16,7 @@
  #define NF_LOG_PREFIXLEN               128
  #define NFLOGGER_NAME_LEN              64
  
-static const struct nf_logger *nf_loggers[NFPROTO_NUMPROTO] __read_mostly;
+static const struct nf_logger __rcu *nf_loggers[NFPROTO_NUMPROTO] __read_mostly;
  static struct list_head nf_loggers_l[NFPROTO_NUMPROTO] __read_mostly;
  static DEFINE_MUTEX(nf_log_mutex);
  
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c

index 78b3cf9..74aebed 100644 (file)
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -18,7 +18,7 @@
   * long term mutex.  The handler must provide an an outfn() to accept packets
   * for queueing and must reinject all packets it receives, no matter what.
   */
-static const struct nf_queue_handler *queue_handler[NFPROTO_NUMPROTO] __read_mostly;
+static const struct nf_queue_handler __rcu *queue_handler[NFPROTO_NUMPROTO] __read_mostly;
  
  static DEFINE_MUTEX(queue_handler_mutex);
  
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c

index 0cb6053..782e519 100644 (file)
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -9,7 +9,6 @@
  #include <linux/module.h>
  #include <linux/gfp.h>
  #include <linux/skbuff.h>
-#include <linux/selinux.h>
  #include <linux/netfilter_ipv4/ip_tables.h>
  #include <linux/netfilter_ipv6/ip6_tables.h>
  #include <linux/netfilter/x_tables.h>
diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c

index 23b2d6c..9faf5e0 100644 (file)
--- a/net/netfilter/xt_SECMARK.c
+++ b/net/netfilter/xt_SECMARK.c
@@ -14,8 +14,8 @@
   */
  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  #include <linux/module.h>
+#include <linux/security.h>
  #include <linux/skbuff.h>
-#include <linux/selinux.h>
  #include <linux/netfilter/x_tables.h>
  #include <linux/netfilter/xt_SECMARK.h>
  
@@ -39,9 +39,8 @@ secmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
  
         switch (mode) {
         case SECMARK_MODE_SEL:
-               secmark = info->u.sel.selsid;
+               secmark = info->secid;
                 break;
-
         default:
                 BUG();
         }
@@ -50,33 +49,33 @@ secmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
         return XT_CONTINUE;
  }
  
-static int checkentry_selinux(struct xt_secmark_target_info *info)
+static int checkentry_lsm(struct xt_secmark_target_info *info)
  {
         int err;
-       struct xt_secmark_target_selinux_info *sel = &info->u.sel;
  
-       sel->selctx[SECMARK_SELCTX_MAX - 1] = '\0';
+       info->secctx[SECMARK_SECCTX_MAX - 1] = '\0';
+       info->secid = 0;
  
-       err = selinux_string_to_sid(sel->selctx, &sel->selsid);
+       err = security_secctx_to_secid(info->secctx, strlen(info->secctx),
+                                      &info->secid);
         if (err) {
                 if (err == -EINVAL)
-                       pr_info("invalid SELinux context \'%s\'\n",
-                               sel->selctx);
+                       pr_info("invalid security context \'%s\'\n", info->secctx);
                 return err;
         }
  
-       if (!sel->selsid) {
-               pr_info("unable to map SELinux context \'%s\'\n", sel->selctx);
+       if (!info->secid) {
+               pr_info("unable to map security context \'%s\'\n", info->secctx);
                 return -ENOENT;
         }
  
-       err = selinux_secmark_relabel_packet_permission(sel->selsid);
+       err = security_secmark_relabel_packet(info->secid);
         if (err) {
                 pr_info("unable to obtain relabeling permission\n");
                 return err;
         }
  
-       selinux_secmark_refcount_inc();
+       security_secmark_refcount_inc();
         return 0;
  }
  
@@ -100,16 +99,16 @@ static int secmark_tg_check(const struct xt_tgchk_param *par)
  
         switch (info->mode) {
         case SECMARK_MODE_SEL:
-               err = checkentry_selinux(info);
-               if (err <= 0)
-                       return err;
                 break;
-
         default:
                 pr_info("invalid mode: %hu\n", info->mode);
                 return -EINVAL;
         }
  
+       err = checkentry_lsm(info);
+       if (err)
+               return err;
+
         if (!mode)
                 mode = info->mode;
         return 0;
@@ -119,7 +118,7 @@ static void secmark_tg_destroy(const struct xt_tgdtor_param *par)
  {
         switch (mode) {
         case SECMARK_MODE_SEL:
-               selinux_secmark_refcount_dec();
+               security_secmark_refcount_dec();
         }
  }
  
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c

index 78ef2c5..37dff78 100644 (file)
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -123,7 +123,7 @@ static int cls_cgroup_classify(struct sk_buff *skb, struct tcf_proto *tp,
          * calls by looking at the number of nested bh disable calls because
          * softirqs always disables bh.
          */
-       if (softirq_count() != SOFTIRQ_OFFSET) {
+       if (in_serving_softirq()) {
                 /* If there is an sk_classid we'll use that. */
                 if (!skb->sk)
                         return -1;
diff --git a/security/apparmor/.gitignore b/security/apparmor/.gitignore

index 0a0a99f..4d995ae 100644 (file)
--- a/security/apparmor/.gitignore
+++ b/security/apparmor/.gitignore
@@ -3,3 +3,4 @@
  #
  af_names.h
  capability_names.h
+rlim_names.h
diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c

index 7320331..544ff58 100644 (file)
--- a/security/apparmor/apparmorfs.c
+++ b/security/apparmor/apparmorfs.c
@@ -29,7 +29,7 @@
   * aa_simple_write_to_buffer - common routine for getting policy from user
   * @op: operation doing the user buffer copy
   * @userbuf: user buffer to copy data from  (NOT NULL)
- * @alloc_size: size of user buffer
+ * @alloc_size: size of user buffer (REQUIRES: @alloc_size >= @copy_size)
   * @copy_size: size of data to copy from user buffer
   * @pos: position write is at in the file (NOT NULL)
   *
@@ -42,6 +42,8 @@ static char *aa_simple_write_to_buffer(int op, const char __user *userbuf,
  {
         char *data;
  
+       BUG_ON(copy_size > alloc_size);
+
         if (*pos != 0)
                 /* only writes from pos 0, that is complete writes */
                 return ERR_PTR(-ESPIPE);
diff --git a/security/capability.c b/security/capability.c

index 95a6599..30ae00f 100644 (file)
--- a/security/capability.c
+++ b/security/capability.c
@@ -677,7 +677,18 @@ static void cap_inet_conn_established(struct sock *sk, struct sk_buff *skb)
  {
  }
  
+static int cap_secmark_relabel_packet(u32 secid)
+{
+       return 0;
+}
  
+static void cap_secmark_refcount_inc(void)
+{
+}
+
+static void cap_secmark_refcount_dec(void)
+{
+}
  
  static void cap_req_classify_flow(const struct request_sock *req,
                                   struct flowi *fl)
@@ -777,7 +788,8 @@ static int cap_secid_to_secctx(u32 secid, char **secdata, u32 *seclen)
  
  static int cap_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid)
  {
-       return -EOPNOTSUPP;
+       *secid = 0;
+       return 0;
  }
  
  static void cap_release_secctx(char *secdata, u32 seclen)
@@ -1018,6 +1030,9 @@ void __init security_fixup_ops(struct security_operations *ops)
         set_to_cap_if_null(ops, inet_conn_request);
         set_to_cap_if_null(ops, inet_csk_clone);
         set_to_cap_if_null(ops, inet_conn_established);
+       set_to_cap_if_null(ops, secmark_relabel_packet);
+       set_to_cap_if_null(ops, secmark_refcount_inc);
+       set_to_cap_if_null(ops, secmark_refcount_dec);
         set_to_cap_if_null(ops, req_classify_flow);
         set_to_cap_if_null(ops, tun_dev_create);
         set_to_cap_if_null(ops, tun_dev_post_create);
diff --git a/security/commoncap.c b/security/commoncap.c

index 9d172e6..5e632b4 100644 (file)
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -719,14 +719,11 @@ static int cap_safe_nice(struct task_struct *p)
  /**
   * cap_task_setscheduler - Detemine if scheduler policy change is permitted
   * @p: The task to affect
- * @policy: The policy to effect
- * @lp: The parameters to the scheduling policy
   *
   * Detemine if the requested scheduler policy change is permitted for the
   * specified task, returning 0 if permission is granted, -ve if denied.
   */
-int cap_task_setscheduler(struct task_struct *p, int policy,
-                          struct sched_param *lp)
+int cap_task_setscheduler(struct task_struct *p)
  {
         return cap_safe_nice(p);
  }
diff --git a/security/security.c b/security/security.c

index c53949f..b50f472 100644 (file)
--- a/security/security.c
+++ b/security/security.c
@@ -89,20 +89,12 @@ __setup("security=", choose_lsm);
   * Return true if:
   *     -The passed LSM is the one chosen by user at boot time,
   *     -or the passed LSM is configured as the default and the user did not
- *      choose an alternate LSM at boot time,
- *     -or there is no default LSM set and the user didn't specify a
- *      specific LSM and we're the first to ask for registration permission,
- *     -or the passed LSM is currently loaded.
+ *      choose an alternate LSM at boot time.
   * Otherwise, return false.
   */
  int __init security_module_enable(struct security_operations *ops)
  {
-       if (!*chosen_lsm)
-               strncpy(chosen_lsm, ops->name, SECURITY_NAME_MAX);
-       else if (strncmp(ops->name, chosen_lsm, SECURITY_NAME_MAX))
-               return 0;
-
-       return 1;
+       return !strcmp(ops->name, chosen_lsm);
  }
  
  /**
@@ -786,10 +778,9 @@ int security_task_setrlimit(struct task_struct *p, unsigned int resource,
         return security_ops->task_setrlimit(p, resource, new_rlim);
  }
  
-int security_task_setscheduler(struct task_struct *p,
-                               int policy, struct sched_param *lp)
+int security_task_setscheduler(struct task_struct *p)
  {
-       return security_ops->task_setscheduler(p, policy, lp);
+       return security_ops->task_setscheduler(p);
  }
  
  int security_task_getscheduler(struct task_struct *p)
@@ -1145,6 +1136,24 @@ void security_inet_conn_established(struct sock *sk,
         security_ops->inet_conn_established(sk, skb);
  }
  
+int security_secmark_relabel_packet(u32 secid)
+{
+       return security_ops->secmark_relabel_packet(secid);
+}
+EXPORT_SYMBOL(security_secmark_relabel_packet);
+
+void security_secmark_refcount_inc(void)
+{
+       security_ops->secmark_refcount_inc();
+}
+EXPORT_SYMBOL(security_secmark_refcount_inc);
+
+void security_secmark_refcount_dec(void)
+{
+       security_ops->secmark_refcount_dec();
+}
+EXPORT_SYMBOL(security_secmark_refcount_dec);
+
  int security_tun_dev_create(void)
  {
         return security_ops->tun_dev_create();
diff --git a/security/selinux/Makefile b/security/selinux/Makefile

index 58d80f3..ad5cd76 100644 (file)
--- a/security/selinux/Makefile
+++ b/security/selinux/Makefile
@@ -2,25 +2,20 @@
  # Makefile for building the SELinux module as part of the kernel tree.
  #
  
-obj-$(CONFIG_SECURITY_SELINUX) := selinux.o ss/
-
-selinux-y := avc.o \
-            hooks.o \
-            selinuxfs.o \
-            netlink.o \
-            nlmsgtab.o \
-            netif.o \
-            netnode.o \
-            netport.o \
-            exports.o
+obj-$(CONFIG_SECURITY_SELINUX) := selinux.o
+
+selinux-y := avc.o hooks.o selinuxfs.o netlink.o nlmsgtab.o netif.o \
+            netnode.o netport.o exports.o \
+            ss/ebitmap.o ss/hashtab.o ss/symtab.o ss/sidtab.o ss/avtab.o \
+            ss/policydb.o ss/services.o ss/conditional.o ss/mls.o ss/status.o
  
  selinux-$(CONFIG_SECURITY_NETWORK_XFRM) += xfrm.o
  
  selinux-$(CONFIG_NETLABEL) += netlabel.o
  
-EXTRA_CFLAGS += -Isecurity/selinux -Isecurity/selinux/include
+ccflags-y := -Isecurity/selinux -Isecurity/selinux/include
  
-$(obj)/avc.o: $(obj)/flask.h
+$(addprefix $(obj)/,$(selinux-y)): $(obj)/flask.h
  
  quiet_cmd_flask = GEN     $(obj)/flask.h $(obj)/av_permissions.h
        cmd_flask = scripts/selinux/genheaders/genheaders $(obj)/flask.h $(obj)/av_permissions.h
diff --git a/security/selinux/exports.c b/security/selinux/exports.c

index c0a454a..9066438 100644 (file)
--- a/security/selinux/exports.c
+++ b/security/selinux/exports.c
@@ -11,58 +11,9 @@
   * it under the terms of the GNU General Public License version 2,
   * as published by the Free Software Foundation.
   */
-#include <linux/types.h>
-#include <linux/kernel.h>
  #include <linux/module.h>
-#include <linux/selinux.h>
-#include <linux/fs.h>
-#include <linux/ipc.h>
-#include <asm/atomic.h>
  
  #include "security.h"
-#include "objsec.h"
-
-/* SECMARK reference count */
-extern atomic_t selinux_secmark_refcount;
-
-int selinux_string_to_sid(char *str, u32 *sid)
-{
-       if (selinux_enabled)
-               return security_context_to_sid(str, strlen(str), sid);
-       else {
-               *sid = 0;
-               return 0;
-       }
-}
-EXPORT_SYMBOL_GPL(selinux_string_to_sid);
-
-int selinux_secmark_relabel_packet_permission(u32 sid)
-{
-       if (selinux_enabled) {
-               const struct task_security_struct *__tsec;
-               u32 tsid;
-
-               __tsec = current_security();
-               tsid = __tsec->sid;
-
-               return avc_has_perm(tsid, sid, SECCLASS_PACKET,
-                                   PACKET__RELABELTO, NULL);
-       }
-       return 0;
-}
-EXPORT_SYMBOL_GPL(selinux_secmark_relabel_packet_permission);
-
-void selinux_secmark_refcount_inc(void)
-{
-       atomic_inc(&selinux_secmark_refcount);
-}
-EXPORT_SYMBOL_GPL(selinux_secmark_refcount_inc);
-
-void selinux_secmark_refcount_dec(void)
-{
-       atomic_dec(&selinux_secmark_refcount);
-}
-EXPORT_SYMBOL_GPL(selinux_secmark_refcount_dec);
  
  bool selinux_is_enabled(void)
  {
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c

index 4796ddd..d9154cf 100644 (file)
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -3354,11 +3354,11 @@ static int selinux_task_setrlimit(struct task_struct *p, unsigned int resource,
         return 0;
  }
  
-static int selinux_task_setscheduler(struct task_struct *p, int policy, struct sched_param *lp)
+static int selinux_task_setscheduler(struct task_struct *p)
  {
         int rc;
  
-       rc = cap_task_setscheduler(p, policy, lp);
+       rc = cap_task_setscheduler(p);
         if (rc)
                 return rc;
  
@@ -4279,6 +4279,27 @@ static void selinux_inet_conn_established(struct sock *sk, struct sk_buff *skb)
         selinux_skb_peerlbl_sid(skb, family, &sksec->peer_sid);
  }
  
+static int selinux_secmark_relabel_packet(u32 sid)
+{
+       const struct task_security_struct *__tsec;
+       u32 tsid;
+
+       __tsec = current_security();
+       tsid = __tsec->sid;
+
+       return avc_has_perm(tsid, sid, SECCLASS_PACKET, PACKET__RELABELTO, NULL);
+}
+
+static void selinux_secmark_refcount_inc(void)
+{
+       atomic_inc(&selinux_secmark_refcount);
+}
+
+static void selinux_secmark_refcount_dec(void)
+{
+       atomic_dec(&selinux_secmark_refcount);
+}
+
  static void selinux_req_classify_flow(const struct request_sock *req,
                                       struct flowi *fl)
  {
@@ -5533,6 +5554,9 @@ static struct security_operations selinux_ops = {
         .inet_conn_request =            selinux_inet_conn_request,
         .inet_csk_clone =               selinux_inet_csk_clone,
         .inet_conn_established =        selinux_inet_conn_established,
+       .secmark_relabel_packet =       selinux_secmark_relabel_packet,
+       .secmark_refcount_inc =         selinux_secmark_refcount_inc,
+       .secmark_refcount_dec =         selinux_secmark_refcount_dec,
         .req_classify_flow =            selinux_req_classify_flow,
         .tun_dev_create =               selinux_tun_dev_create,
         .tun_dev_post_create =          selinux_tun_dev_post_create,
diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h

index b4c9eb4..8858d2b 100644 (file)
--- a/security/selinux/include/classmap.h
+++ b/security/selinux/include/classmap.h
@@ -17,7 +17,7 @@ struct security_class_mapping secclass_map[] = {
           { "compute_av", "compute_create", "compute_member",
             "check_context", "load_policy", "compute_relabel",
             "compute_user", "setenforce", "setbool", "setsecparam",
-           "setcheckreqprot", NULL } },
+           "setcheckreqprot", "read_policy", NULL } },
         { "process",
           { "fork", "transition", "sigchld", "sigkill",
             "sigstop", "signull", "signal", "ptrace", "getsched", "setsched",
diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h

index 1f7c249..671273e 100644 (file)
--- a/security/selinux/include/security.h
+++ b/security/selinux/include/security.h
@@ -9,6 +9,7 @@
  #define _SELINUX_SECURITY_H_
  
  #include <linux/magic.h>
+#include <linux/types.h>
  #include "flask.h"
  
  #define SECSID_NULL                    0x00000000 /* unspecified SID */
@@ -82,6 +83,8 @@ extern int selinux_policycap_openperm;
  int security_mls_enabled(void);
  
  int security_load_policy(void *data, size_t len);
+int security_read_policy(void **data, ssize_t *len);
+size_t security_policydb_len(void);
  
  int security_policycap_supported(unsigned int req_cap);
  
@@ -191,5 +194,25 @@ static inline int security_netlbl_sid_to_secattr(u32 sid,
  
  const char *security_get_initial_sid_context(u32 sid);
  
+/*
+ * status notifier using mmap interface
+ */
+extern struct page *selinux_kernel_status_page(void);
+
+#define SELINUX_KERNEL_STATUS_VERSION  1
+struct selinux_kernel_status {
+       u32     version;        /* version number of this structure */
+       u32     sequence;       /* sequence number of seqlock logic */
+       u32     enforcing;      /* current setting of enforcing mode */
+       u32     policyload;     /* number of times the policy was reloaded */
+       u32     deny_unknown;   /* current setting of deny_unknown */
+       /*
+        * Versions > 0 support the members above.
+        */
+} __attribute__((packed));
+
+extern void selinux_status_update_setenforce(int enforcing);
+extern void selinux_status_update_policyload(int seqno);
+
  #endif /* _SELINUX_SECURITY_H_ */
  
diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c

index 79a1bb6..87e0556 100644 (file)
--- a/security/selinux/selinuxfs.c
+++ b/security/selinux/selinuxfs.c
@@ -68,6 +68,8 @@ static int *bool_pending_values;
  static struct dentry *class_dir;
  static unsigned long last_class_ino;
  
+static char policy_opened;
+
  /* global data for policy capabilities */
  static struct dentry *policycap_dir;
  
@@ -110,6 +112,8 @@ enum sel_inos {
         SEL_COMPAT_NET, /* whether to use old compat network packet controls */
         SEL_REJECT_UNKNOWN, /* export unknown reject handling to userspace */
         SEL_DENY_UNKNOWN, /* export unknown deny handling to userspace */
+       SEL_STATUS,     /* export current status using mmap() */
+       SEL_POLICY,     /* allow userspace to read the in kernel policy */
         SEL_INO_NEXT,   /* The next inode number to use */
  };
  
@@ -171,6 +175,7 @@ static ssize_t sel_write_enforce(struct file *file, const char __user *buf,
                 if (selinux_enforcing)
                         avc_ss_reset(0);
                 selnl_notify_setenforce(selinux_enforcing);
+               selinux_status_update_setenforce(selinux_enforcing);
         }
         length = count;
  out:
@@ -205,6 +210,59 @@ static const struct file_operations sel_handle_unknown_ops = {
         .llseek         = generic_file_llseek,
  };
  
+static int sel_open_handle_status(struct inode *inode, struct file *filp)
+{
+       struct page    *status = selinux_kernel_status_page();
+
+       if (!status)
+               return -ENOMEM;
+
+       filp->private_data = status;
+
+       return 0;
+}
+
+static ssize_t sel_read_handle_status(struct file *filp, char __user *buf,
+                                     size_t count, loff_t *ppos)
+{
+       struct page    *status = filp->private_data;
+
+       BUG_ON(!status);
+
+       return simple_read_from_buffer(buf, count, ppos,
+                                      page_address(status),
+                                      sizeof(struct selinux_kernel_status));
+}
+
+static int sel_mmap_handle_status(struct file *filp,
+                                 struct vm_area_struct *vma)
+{
+       struct page    *status = filp->private_data;
+       unsigned long   size = vma->vm_end - vma->vm_start;
+
+       BUG_ON(!status);
+
+       /* only allows one page from the head */
+       if (vma->vm_pgoff > 0 || size != PAGE_SIZE)
+               return -EIO;
+       /* disallow writable mapping */
+       if (vma->vm_flags & VM_WRITE)
+               return -EPERM;
+       /* prevent mprotect() from making the mapping writable */
+       vma->vm_flags &= ~VM_MAYWRITE;
+
+       return remap_pfn_range(vma, vma->vm_start,
+                              page_to_pfn(status),
+                              size, vma->vm_page_prot);
+}
+
+static const struct file_operations sel_handle_status_ops = {
+       .open           = sel_open_handle_status,
+       .read           = sel_read_handle_status,
+       .mmap           = sel_mmap_handle_status,
+       .llseek         = generic_file_llseek,
+};
+
  #ifdef CONFIG_SECURITY_SELINUX_DISABLE
  static ssize_t sel_write_disable(struct file *file, const char __user *buf,
                                  size_t count, loff_t *ppos)
@@ -296,6 +354,141 @@ static const struct file_operations sel_mls_ops = {
         .llseek         = generic_file_llseek,
  };
  
+struct policy_load_memory {
+       size_t len;
+       void *data;
+};
+
+static int sel_open_policy(struct inode *inode, struct file *filp)
+{
+       struct policy_load_memory *plm = NULL;
+       int rc;
+
+       BUG_ON(filp->private_data);
+
+       mutex_lock(&sel_mutex);
+
+       rc = task_has_security(current, SECURITY__READ_POLICY);
+       if (rc)
+               goto err;
+
+       rc = -EBUSY;
+       if (policy_opened)
+               goto err;
+
+       rc = -ENOMEM;
+       plm = kzalloc(sizeof(*plm), GFP_KERNEL);
+       if (!plm)
+               goto err;
+
+       if (i_size_read(inode) != security_policydb_len()) {
+               mutex_lock(&inode->i_mutex);
+               i_size_write(inode, security_policydb_len());
+               mutex_unlock(&inode->i_mutex);
+       }
+
+       rc = security_read_policy(&plm->data, &plm->len);
+       if (rc)
+               goto err;
+
+       policy_opened = 1;
+
+       filp->private_data = plm;
+
+       mutex_unlock(&sel_mutex);
+
+       return 0;
+err:
+       mutex_unlock(&sel_mutex);
+
+       if (plm)
+               vfree(plm->data);
+       kfree(plm);
+       return rc;
+}
+
+static int sel_release_policy(struct inode *inode, struct file *filp)
+{
+       struct policy_load_memory *plm = filp->private_data;
+
+       BUG_ON(!plm);
+
+       policy_opened = 0;
+
+       vfree(plm->data);
+       kfree(plm);
+
+       return 0;
+}
+
+static ssize_t sel_read_policy(struct file *filp, char __user *buf,
+                              size_t count, loff_t *ppos)
+{
+       struct policy_load_memory *plm = filp->private_data;
+       int ret;
+
+       mutex_lock(&sel_mutex);
+
+       ret = task_has_security(current, SECURITY__READ_POLICY);
+       if (ret)
+               goto out;
+
+       ret = simple_read_from_buffer(buf, count, ppos, plm->data, plm->len);
+out:
+       mutex_unlock(&sel_mutex);
+       return ret;
+}
+
+static int sel_mmap_policy_fault(struct vm_area_struct *vma,
+                                struct vm_fault *vmf)
+{
+       struct policy_load_memory *plm = vma->vm_file->private_data;
+       unsigned long offset;
+       struct page *page;
+
+       if (vmf->flags & (FAULT_FLAG_MKWRITE | FAULT_FLAG_WRITE))
+               return VM_FAULT_SIGBUS;
+
+       offset = vmf->pgoff << PAGE_SHIFT;
+       if (offset >= roundup(plm->len, PAGE_SIZE))
+               return VM_FAULT_SIGBUS;
+
+       page = vmalloc_to_page(plm->data + offset);
+       get_page(page);
+
+       vmf->page = page;
+
+       return 0;
+}
+
+static struct vm_operations_struct sel_mmap_policy_ops = {
+       .fault = sel_mmap_policy_fault,
+       .page_mkwrite = sel_mmap_policy_fault,
+};
+
+int sel_mmap_policy(struct file *filp, struct vm_area_struct *vma)
+{
+       if (vma->vm_flags & VM_SHARED) {
+               /* do not allow mprotect to make mapping writable */
+               vma->vm_flags &= ~VM_MAYWRITE;
+
+               if (vma->vm_flags & VM_WRITE)
+                       return -EACCES;
+       }
+
+       vma->vm_flags |= VM_RESERVED;
+       vma->vm_ops = &sel_mmap_policy_ops;
+
+       return 0;
+}
+
+static const struct file_operations sel_policy_ops = {
+       .open           = sel_open_policy,
+       .read           = sel_read_policy,
+       .mmap           = sel_mmap_policy,
+       .release        = sel_release_policy,
+};
+
  static ssize_t sel_write_load(struct file *file, const char __user *buf,
                               size_t count, loff_t *ppos)
  
@@ -1612,6 +1805,8 @@ static int sel_fill_super(struct super_block *sb, void *data, int silent)
                 [SEL_CHECKREQPROT] = {"checkreqprot", &sel_checkreqprot_ops, S_IRUGO|S_IWUSR},
                 [SEL_REJECT_UNKNOWN] = {"reject_unknown", &sel_handle_unknown_ops, S_IRUGO},
                 [SEL_DENY_UNKNOWN] = {"deny_unknown", &sel_handle_unknown_ops, S_IRUGO},
+               [SEL_STATUS] = {"status", &sel_handle_status_ops, S_IRUGO},
+               [SEL_POLICY] = {"policy", &sel_policy_ops, S_IRUSR},
                 /* last one */ {""}
         };
         ret = simple_fill_super(sb, SELINUX_MAGIC, selinux_files);
diff --git a/security/selinux/ss/Makefile b/security/selinux/ss/Makefile

deleted file mode 100644 (file)

index 15d4e62..0000000
--- a/security/selinux/ss/Makefile
+++ /dev/null
@@ -1,9 +0,0 @@
-#
-# Makefile for building the SELinux security server as part of the kernel tree.
-#
-
-EXTRA_CFLAGS += -Isecurity/selinux -Isecurity/selinux/include
-obj-y := ss.o
-
-ss-y := ebitmap.o hashtab.o symtab.o sidtab.o avtab.o policydb.o services.o conditional.o mls.o
-
diff --git a/security/selinux/ss/avtab.c b/security/selinux/ss/avtab.c

index 929480c..a3dd9fa 100644 (file)
--- a/security/selinux/ss/avtab.c
+++ b/security/selinux/ss/avtab.c
@@ -266,8 +266,8 @@ int avtab_alloc(struct avtab *h, u32 nrules)
         if (shift > 2)
                 shift = shift - 2;
         nslot = 1 << shift;
-       if (nslot > MAX_AVTAB_SIZE)
-               nslot = MAX_AVTAB_SIZE;
+       if (nslot > MAX_AVTAB_HASH_BUCKETS)
+               nslot = MAX_AVTAB_HASH_BUCKETS;
         mask = nslot - 1;
  
         h->htable = kcalloc(nslot, sizeof(*(h->htable)), GFP_KERNEL);
@@ -501,6 +501,48 @@ bad:
         goto out;
  }
  
+/*
+ * Write a single avtab node to fp: the four key fields (source type,
+ * target type, target class, specified) as little-endian 16-bit values,
+ * followed by the datum as one little-endian 32-bit value.
+ *
+ * Returns 0 on success or the nonzero result of the failing put_entry().
+ */
+int avtab_write_item(struct policydb *p, struct avtab_node *cur, void *fp)
+{
+       __le16 key16[4];
+       __le32 datum32[1];
+       int rc;
+
+       key16[0] = cpu_to_le16(cur->key.source_type);
+       key16[1] = cpu_to_le16(cur->key.target_type);
+       key16[2] = cpu_to_le16(cur->key.target_class);
+       key16[3] = cpu_to_le16(cur->key.specified);
+       rc = put_entry(key16, sizeof(u16), 4, fp);
+       if (rc)
+               return rc;
+
+       datum32[0] = cpu_to_le32(cur->datum.data);
+       return put_entry(datum32, sizeof(u32), 1, fp);
+}
+
+/*
+ * Write a whole access vector table to fp: the element count (a->nel)
+ * as a little-endian 32-bit value, then every node of every hash slot
+ * via avtab_write_item(), slot by slot and in chain order.
+ *
+ * Returns 0 on success or the first nonzero result encountered.
+ */
+int avtab_write(struct policydb *p, struct avtab *a, void *fp)
+{
+       struct avtab_node *node;
+       __le32 nel[1];
+       unsigned int slot;
+       int rc;
+
+       nel[0] = cpu_to_le32(a->nel);
+       rc = put_entry(nel, sizeof(u32), 1, fp);
+       if (rc)
+               return rc;
+
+       for (slot = 0; slot < a->nslot; slot++) {
+               node = a->htable[slot];
+               while (node) {
+                       rc = avtab_write_item(p, node, fp);
+                       if (rc)
+                               return rc;
+                       node = node->next;
+               }
+       }
+
+       return 0;
+}
  void avtab_cache_init(void)
  {
         avtab_node_cachep = kmem_cache_create("avtab_node",
diff --git a/security/selinux/ss/avtab.h b/security/selinux/ss/avtab.h

index cd4f734..dff0c75 100644 (file)
--- a/security/selinux/ss/avtab.h
+++ b/security/selinux/ss/avtab.h
@@ -71,6 +71,8 @@ int avtab_read_item(struct avtab *a, void *fp, struct policydb *pol,
                     void *p);
  
  int avtab_read(struct avtab *a, void *fp, struct policydb *pol);
+int avtab_write_item(struct policydb *p, struct avtab_node *cur, void *fp);
+int avtab_write(struct policydb *p, struct avtab *a, void *fp);
  
  struct avtab_node *avtab_insert_nonunique(struct avtab *h, struct avtab_key *key,
                                           struct avtab_datum *datum);
@@ -85,7 +87,6 @@ void avtab_cache_destroy(void);
  #define MAX_AVTAB_HASH_BITS 11
  #define MAX_AVTAB_HASH_BUCKETS (1 << MAX_AVTAB_HASH_BITS)
  #define MAX_AVTAB_HASH_MASK (MAX_AVTAB_HASH_BUCKETS-1)
-#define MAX_AVTAB_SIZE MAX_AVTAB_HASH_BUCKETS
  
  #endif /* _SS_AVTAB_H_ */
  
diff --git a/security/selinux/ss/conditional.c b/security/selinux/ss/conditional.c

index c91e150..655fe1c 100644 (file)
--- a/security/selinux/ss/conditional.c
+++ b/security/selinux/ss/conditional.c
@@ -490,6 +490,129 @@ err:
         return rc;
  }
  
+/*
+ * Symbol-table write callback for a conditional boolean.
+ *
+ * Emits three little-endian 32-bit words (value, state, name length)
+ * followed by the name bytes without a terminating NUL.  @ptr is a
+ * struct policy_data whose fp member is the output target.
+ */
+int cond_write_bool(void *vkey, void *datum, void *ptr)
+{
+       struct policy_data *pd = ptr;
+       struct cond_bool_datum *b = datum;
+       char *name = vkey;
+       void *fp = pd->fp;
+       u32 namelen = strlen(name);
+       __le32 hdr[3];
+       int rc;
+
+       hdr[0] = cpu_to_le32(b->value);
+       hdr[1] = cpu_to_le32(b->state);
+       hdr[2] = cpu_to_le32(namelen);
+       rc = put_entry(hdr, sizeof(u32), 3, fp);
+       if (rc)
+               return rc;
+
+       return put_entry(name, 1, namelen, fp);
+}
+
+/*
+ * cond_write_av_list does not write out the av_list nodes themselves.
+ * Instead it writes out the key/value pairs of the avtab nodes they
+ * point to. This is necessary because there is no way to uniquely
+ * identify rules in the avtab, so individual avtab rules cannot be
+ * associated with a conditional unless they are saved as part of
+ * that conditional. This means that the avtab with the conditional
+ * rules will not be saved but will be rebuilt on policy load.
+ *
+ * Output: a little-endian 32-bit count followed by one
+ * avtab_write_item() record per list element.
+ */
+static int cond_write_av_list(struct policydb *p,
+                             struct cond_av_list *list, struct policy_file *fp)
+{
+       struct cond_av_list *item;
+       __le32 cnt[1];
+       u32 n = 0;
+       int rc;
+
+       for (item = list; item; item = item->next)
+               n++;
+
+       cnt[0] = cpu_to_le32(n);
+       rc = put_entry(cnt, sizeof(u32), 1, fp);
+       if (rc)
+               return rc;
+
+       /* an empty list simply skips the loop below */
+       item = list;
+       while (item) {
+               rc = avtab_write_item(p, item->node, fp);
+               if (rc)
+                       return rc;
+               item = item->next;
+       }
+
+       return 0;
+}
+
+/*
+ * Write one conditional node: its current state, the number of
+ * expression elements, an (expr_type, bool) pair per element, and
+ * finally the true and false av lists.  All integers are written as
+ * little-endian 32-bit values.
+ */
+int cond_write_node(struct policydb *p, struct cond_node *node,
+                   struct policy_file *fp)
+{
+       struct cond_expr *e;
+       __le32 buf[2];
+       u32 nexpr = 0;
+       int rc;
+
+       buf[0] = cpu_to_le32(node->cur_state);
+       rc = put_entry(buf, sizeof(u32), 1, fp);
+       if (rc)
+               return rc;
+
+       for (e = node->expr; e; e = e->next)
+               nexpr++;
+
+       buf[0] = cpu_to_le32(nexpr);
+       rc = put_entry(buf, sizeof(u32), 1, fp);
+       if (rc)
+               return rc;
+
+       e = node->expr;
+       while (e) {
+               buf[0] = cpu_to_le32(e->expr_type);
+               buf[1] = cpu_to_le32(e->bool);
+               rc = put_entry(buf, sizeof(u32), 2, fp);
+               if (rc)
+                       return rc;
+               e = e->next;
+       }
+
+       rc = cond_write_av_list(p, node->true_list, fp);
+       if (rc)
+               return rc;
+
+       return cond_write_av_list(p, node->false_list, fp);
+}
+
+/*
+ * Write a list of conditional nodes: a little-endian 32-bit node count
+ * followed by each node via cond_write_node(), in list order.
+ */
+int cond_write_list(struct policydb *p, struct cond_node *list, void *fp)
+{
+       struct cond_node *node;
+       __le32 cnt[1];
+       u32 n = 0;
+       int rc;
+
+       for (node = list; node; node = node->next)
+               n++;
+
+       cnt[0] = cpu_to_le32(n);
+       rc = put_entry(cnt, sizeof(u32), 1, fp);
+       if (rc)
+               return rc;
+
+       node = list;
+       while (node) {
+               rc = cond_write_node(p, node, fp);
+               if (rc)
+                       return rc;
+               node = node->next;
+       }
+
+       return 0;
+}
  /* Determine whether additional permissions are granted by the conditional
   * av table, and if so, add them to the result
   */
diff --git a/security/selinux/ss/conditional.h b/security/selinux/ss/conditional.h

index 53ddb01..3f209c6 100644 (file)
--- a/security/selinux/ss/conditional.h
+++ b/security/selinux/ss/conditional.h
@@ -69,6 +69,8 @@ int cond_index_bool(void *key, void *datum, void *datap);
  
  int cond_read_bool(struct policydb *p, struct hashtab *h, void *fp);
  int cond_read_list(struct policydb *p, void *fp);
+int cond_write_bool(void *key, void *datum, void *ptr);
+int cond_write_list(struct policydb *p, struct cond_node *list, void *fp);
  
  void cond_compute_av(struct avtab *ctab, struct avtab_key *key, struct av_decision *avd);
  
diff --git a/security/selinux/ss/ebitmap.c b/security/selinux/ss/ebitmap.c

index 04b6145..d42951f 100644 (file)
--- a/security/selinux/ss/ebitmap.c
+++ b/security/selinux/ss/ebitmap.c
@@ -22,6 +22,8 @@
  #include "ebitmap.h"
  #include "policydb.h"
  
+#define BITS_PER_U64   (sizeof(u64) * 8)
+
  int ebitmap_cmp(struct ebitmap *e1, struct ebitmap *e2)
  {
         struct ebitmap_node *n1, *n2;
@@ -363,10 +365,10 @@ int ebitmap_read(struct ebitmap *e, void *fp)
         e->highbit = le32_to_cpu(buf[1]);
         count = le32_to_cpu(buf[2]);
  
-       if (mapunit != sizeof(u64) * 8) {
+       if (mapunit != BITS_PER_U64) {
                 printk(KERN_ERR "SELinux: ebitmap: map size %u does not "
                        "match my size %Zd (high bit was %d)\n",
-                      mapunit, sizeof(u64) * 8, e->highbit);
+                      mapunit, BITS_PER_U64, e->highbit);
                 goto bad;
         }
  
@@ -446,3 +448,78 @@ bad:
         ebitmap_destroy(e);
         goto out;
  }
+
+/*
+ * Write an ebitmap to fp.
+ *
+ * Output layout (all integers little-endian):
+ *   u32 map unit size in bits (BITS_PER_U64)
+ *   u32 high bit: one past the last set bit, rounded up to BITS_PER_U64
+ *   u32 number of BITS_PER_U64-wide nodes that follow
+ *   per node: u32 start bit, u64 bitmap
+ *
+ * The set bits are walked twice: once to compute the header values and
+ * once to emit the nodes.  Returns 0 on success or the nonzero result of
+ * the failing put_entry().
+ */
+int ebitmap_write(struct ebitmap *e, void *fp)
+{
+       struct ebitmap_node *n;
+       u32 count;
+       __le32 buf[3];
+       u64 map;
+       int bit, last_bit, last_startbit, rc;
+
+       buf[0] = cpu_to_le32(BITS_PER_U64);
+
+       /* first pass: count the output nodes and find the high bit */
+       count = 0;
+       last_bit = 0;
+       last_startbit = -1;
+       ebitmap_for_each_positive_bit(e, n, bit) {
+               /* a set bit in a not-yet-seen 64-bit window starts a new node */
+               if (rounddown(bit, (int)BITS_PER_U64) > last_startbit) {
+                       count++;
+                       last_startbit = rounddown(bit, BITS_PER_U64);
+               }
+               last_bit = roundup(bit + 1, BITS_PER_U64);
+       }
+       buf[1] = cpu_to_le32(last_bit);
+       buf[2] = cpu_to_le32(count);
+
+       rc = put_entry(buf, sizeof(u32), 3, fp);
+       if (rc)
+               return rc;
+
+       /* second pass: accumulate each 64-bit window in 'map' and emit it */
+       map = 0;
+       last_startbit = INT_MIN;
+       ebitmap_for_each_positive_bit(e, n, bit) {
+               if (rounddown(bit, (int)BITS_PER_U64) > last_startbit) {
+                       __le64 buf64[1];
+
+                       /* this is the very first bit */
+                       if (!map) {
+                               last_startbit = rounddown(bit, BITS_PER_U64);
+                               map = (u64)1 << (bit - last_startbit);
+                               continue;
+                       }
+
+                       /* write the previous (now complete) node */
+                       buf[0] = cpu_to_le32(last_startbit);
+                       rc = put_entry(buf, sizeof(u32), 1, fp);
+                       if (rc)
+                               return rc;
+
+                       buf64[0] = cpu_to_le64(map);
+                       rc = put_entry(buf64, sizeof(u64), 1, fp);
+                       if (rc)
+                               return rc;
+
+                       /* set up for the next node */
+                       map = 0;
+                       last_startbit = rounddown(bit, BITS_PER_U64);
+               }
+               map |= (u64)1 << (bit - last_startbit);
+       }
+       /* flush the node still being accumulated, if any bit was set */
+       if (map) {
+               __le64 buf64[1];
+
+               /* write the last node */
+               buf[0] = cpu_to_le32(last_startbit);
+               rc = put_entry(buf, sizeof(u32), 1, fp);
+               if (rc)
+                       return rc;
+
+               buf64[0] = cpu_to_le64(map);
+               rc = put_entry(buf64, sizeof(u64), 1, fp);
+               if (rc)
+                       return rc;
+       }
+       return 0;
+}
diff --git a/security/selinux/ss/ebitmap.h b/security/selinux/ss/ebitmap.h

index f283b43..1f4e93c 100644 (file)
--- a/security/selinux/ss/ebitmap.h
+++ b/security/selinux/ss/ebitmap.h
@@ -123,6 +123,7 @@ int ebitmap_get_bit(struct ebitmap *e, unsigned long bit);
  int ebitmap_set_bit(struct ebitmap *e, unsigned long bit, int value);
  void ebitmap_destroy(struct ebitmap *e);
  int ebitmap_read(struct ebitmap *e, void *fp);
+int ebitmap_write(struct ebitmap *e, void *fp);
  
  #ifdef CONFIG_NETLABEL
  int ebitmap_netlbl_export(struct ebitmap *ebmap,
diff --git a/security/selinux/ss/policydb.c b/security/selinux/ss/policydb.c

index 3a29704..94f630d 100644 (file)
--- a/security/selinux/ss/policydb.c
+++ b/security/selinux/ss/policydb.c
@@ -37,6 +37,7 @@
  #include "policydb.h"
  #include "conditional.h"
  #include "mls.h"
+#include "services.h"
  
  #define _DEBUG_HASHES
  
@@ -185,9 +186,19 @@ static u32 rangetr_hash(struct hashtab *h, const void *k)
  static int rangetr_cmp(struct hashtab *h, const void *k1, const void *k2)
  {
         const struct range_trans *key1 = k1, *key2 = k2;
-       return (key1->source_type != key2->source_type ||
-               key1->target_type != key2->target_type ||
-               key1->target_class != key2->target_class);
+       int v;
+
+       v = key1->source_type - key2->source_type;
+       if (v)
+               return v;
+
+       v = key1->target_type - key2->target_type;
+       if (v)
+               return v;
+
+       v = key1->target_class - key2->target_class;
+
+       return v;
  }
  
  /*
@@ -1624,11 +1635,11 @@ static int role_bounds_sanity_check(void *key, void *datum, void *datap)
  
  static int type_bounds_sanity_check(void *key, void *datum, void *datap)
  {
-       struct type_datum *upper, *type;
+       struct type_datum *upper;
         struct policydb *p = datap;
         int depth = 0;
  
-       upper = type = datum;
+       upper = datum;
         while (upper->bounds) {
                 if (++depth == POLICYDB_BOUNDS_MAXDEPTH) {
                         printk(KERN_ERR "SELinux: type %s: "
@@ -2306,3 +2317,843 @@ bad:
         policydb_destroy(p);
         goto out;
  }
+
+/*
+ * Write a MLS level structure to a policydb binary
+ * representation file: the sensitivity as a little-endian
+ * 32-bit value, followed by the category ebitmap.
+ */
+static int mls_write_level(struct mls_level *l, void *fp)
+{
+       __le32 sens[1];
+       int rc;
+
+       sens[0] = cpu_to_le32(l->sens);
+       rc = put_entry(sens, sizeof(u32), 1, fp);
+       if (rc)
+               return rc;
+
+       return ebitmap_write(&l->cat, fp);
+}
+
+/*
+ * Write a MLS range structure to a policydb binary
+ * representation file.
+ *
+ * When the low and high levels compare equal only the low level is
+ * written: the header is then (1, low sens) followed by one category
+ * ebitmap.  Otherwise the header is (2, low sens, high sens) followed
+ * by both category ebitmaps.
+ */
+static int mls_write_range_helper(struct mls_range *r, void *fp)
+{
+       const int same = mls_level_eq(&r->level[1], &r->level[0]);
+       size_t items = same ? 2 : 3;
+       __le32 buf[3];
+       int rc;
+
+       buf[0] = cpu_to_le32(items-1);
+       buf[1] = cpu_to_le32(r->level[0].sens);
+       if (!same)
+               buf[2] = cpu_to_le32(r->level[1].sens);
+
+       BUG_ON(items > (sizeof(buf)/sizeof(buf[0])));
+
+       rc = put_entry(buf, sizeof(u32), items, fp);
+       if (rc)
+               return rc;
+
+       rc = ebitmap_write(&r->level[0].cat, fp);
+       if (rc)
+               return rc;
+
+       if (same)
+               return 0;
+
+       return ebitmap_write(&r->level[1].cat, fp);
+}
+
+/*
+ * Symbol-table write callback for a sensitivity: writes the name length
+ * and the isalias flag as little-endian 32-bit values, then the name
+ * bytes, then the associated MLS level.
+ */
+static int sens_write(void *vkey, void *datum, void *ptr)
+{
+       struct policy_data *pd = ptr;
+       struct level_datum *lev = datum;
+       char *name = vkey;
+       void *fp = pd->fp;
+       size_t namelen = strlen(name);
+       __le32 hdr[2];
+       int rc;
+
+       hdr[0] = cpu_to_le32(namelen);
+       hdr[1] = cpu_to_le32(lev->isalias);
+       rc = put_entry(hdr, sizeof(u32), 2, fp);
+       if (rc)
+               return rc;
+
+       rc = put_entry(name, 1, namelen, fp);
+       if (rc)
+               return rc;
+
+       return mls_write_level(lev->level, fp);
+}
+
+/*
+ * Symbol-table write callback for a category: writes the name length,
+ * the category value and the isalias flag as little-endian 32-bit
+ * values, followed by the name bytes.
+ */
+static int cat_write(void *vkey, void *datum, void *ptr)
+{
+       struct policy_data *pd = ptr;
+       struct cat_datum *cat = datum;
+       char *name = vkey;
+       void *fp = pd->fp;
+       size_t namelen = strlen(name);
+       __le32 hdr[3];
+       int rc;
+
+       hdr[0] = cpu_to_le32(namelen);
+       hdr[1] = cpu_to_le32(cat->value);
+       hdr[2] = cpu_to_le32(cat->isalias);
+       rc = put_entry(hdr, sizeof(u32), 3, fp);
+       if (rc)
+               return rc;
+
+       return put_entry(name, 1, namelen, fp);
+}
+
+/*
+ * Write the role transition list: a little-endian 32-bit element count
+ * followed by one (role, type, new_role) triple per rule, in list order.
+ *
+ * Returns 0 on success or the nonzero result of the failing put_entry().
+ */
+static int role_trans_write(struct role_trans *r, void *fp)
+{
+       struct role_trans *tr;
+       /* holds cpu_to_le32() results, so it is annotated __le32, not u32 */
+       __le32 buf[3];
+       size_t nel;
+       int rc;
+
+       nel = 0;
+       for (tr = r; tr; tr = tr->next)
+               nel++;
+       buf[0] = cpu_to_le32(nel);
+       rc = put_entry(buf, sizeof(u32), 1, fp);
+       if (rc)
+               return rc;
+       for (tr = r; tr; tr = tr->next) {
+               buf[0] = cpu_to_le32(tr->role);
+               buf[1] = cpu_to_le32(tr->type);
+               buf[2] = cpu_to_le32(tr->new_role);
+               rc = put_entry(buf, sizeof(u32), 3, fp);
+               if (rc)
+                       return rc;
+       }
+
+       return 0;
+}
+
+/*
+ * Write the role allow list: a little-endian 32-bit element count
+ * followed by one (role, new_role) pair per rule, in list order.
+ *
+ * Returns 0 on success or the nonzero result of the failing put_entry().
+ */
+static int role_allow_write(struct role_allow *r, void *fp)
+{
+       struct role_allow *ra;
+       /* holds cpu_to_le32() results, so it is annotated __le32, not u32 */
+       __le32 buf[2];
+       size_t nel;
+       int rc;
+
+       nel = 0;
+       for (ra = r; ra; ra = ra->next)
+               nel++;
+       buf[0] = cpu_to_le32(nel);
+       rc = put_entry(buf, sizeof(u32), 1, fp);
+       if (rc)
+               return rc;
+       for (ra = r; ra; ra = ra->next) {
+               buf[0] = cpu_to_le32(ra->role);
+               buf[1] = cpu_to_le32(ra->new_role);
+               rc = put_entry(buf, sizeof(u32), 2, fp);
+               if (rc)
+                       return rc;
+       }
+       return 0;
+}
+
+/*
+ * Write a security context structure
+ * to a policydb binary representation file:
+ * user, role and type as little-endian 32-bit
+ * values, followed by the MLS range.
+ */
+static int context_write(struct policydb *p, struct context *c,
+                        void *fp)
+{
+       __le32 ids[3];
+       int rc;
+
+       ids[0] = cpu_to_le32(c->user);
+       ids[1] = cpu_to_le32(c->role);
+       ids[2] = cpu_to_le32(c->type);
+
+       rc = put_entry(ids, sizeof(u32), 3, fp);
+       if (rc)
+               return rc;
+
+       return mls_write_range_helper(&c->range, fp);
+}
+
+/*
+ * The following *_write functions are used to
+ * write the symbol data to a policy database
+ * binary representation file.
+ */
+
+/*
+ * Write one permission: the name length and the permission value as
+ * little-endian 32-bit values, followed by the name bytes.  Unlike the
+ * other symbol callbacks in this file, the third argument is the output
+ * target itself rather than a struct policy_data.
+ */
+static int perm_write(void *vkey, void *datum, void *fp)
+{
+       struct perm_datum *perm = datum;
+       char *name = vkey;
+       size_t namelen = strlen(name);
+       __le32 hdr[2];
+       int rc;
+
+       hdr[0] = cpu_to_le32(namelen);
+       hdr[1] = cpu_to_le32(perm->value);
+       rc = put_entry(hdr, sizeof(u32), 2, fp);
+       if (rc)
+               return rc;
+
+       return put_entry(name, 1, namelen, fp);
+}
+
+/*
+ * Symbol-table write callback for a common: writes the name length, the
+ * common's value, permissions.nprim and the permission table element
+ * count as little-endian 32-bit values, then the name bytes, then every
+ * permission through perm_write().
+ */
+static int common_write(void *vkey, void *datum, void *ptr)
+{
+       struct policy_data *pd = ptr;
+       struct common_datum *com = datum;
+       char *name = vkey;
+       void *fp = pd->fp;
+       size_t namelen = strlen(name);
+       __le32 hdr[4];
+       int rc;
+
+       hdr[0] = cpu_to_le32(namelen);
+       hdr[1] = cpu_to_le32(com->value);
+       hdr[2] = cpu_to_le32(com->permissions.nprim);
+       hdr[3] = cpu_to_le32(com->permissions.table->nel);
+       rc = put_entry(hdr, sizeof(u32), 4, fp);
+       if (rc)
+               return rc;
+
+       rc = put_entry(name, 1, namelen, fp);
+       if (rc)
+               return rc;
+
+       return hashtab_map(com->permissions.table, perm_write, fp);
+}
+
+/*
+ * Write a chain of constraint nodes.  For each node: the permissions
+ * mask and the expression element count, then per expression element
+ * the (expr_type, attr, op) triple, all little-endian 32-bit.  Elements
+ * of type CEXPR_NAMES are additionally followed by their names ebitmap.
+ * The number of nodes in the chain is not written here.
+ */
+static int write_cons_helper(struct policydb *p, struct constraint_node *node,
+                            void *fp)
+{
+       struct constraint_node *cons;
+       struct constraint_expr *ex;
+       __le32 buf[3];
+       u32 nexpr;
+       int rc;
+
+       for (cons = node; cons; cons = cons->next) {
+               nexpr = 0;
+               for (ex = cons->expr; ex; ex = ex->next)
+                       nexpr++;
+
+               buf[0] = cpu_to_le32(cons->permissions);
+               buf[1] = cpu_to_le32(nexpr);
+               rc = put_entry(buf, sizeof(u32), 2, fp);
+               if (rc)
+                       return rc;
+
+               for (ex = cons->expr; ex; ex = ex->next) {
+                       buf[0] = cpu_to_le32(ex->expr_type);
+                       buf[1] = cpu_to_le32(ex->attr);
+                       buf[2] = cpu_to_le32(ex->op);
+                       rc = put_entry(buf, sizeof(u32), 3, fp);
+                       if (rc)
+                               return rc;
+
+                       /* only name expressions carry extra payload */
+                       if (ex->expr_type != CEXPR_NAMES)
+                               continue;
+
+                       rc = ebitmap_write(&ex->names, fp);
+                       if (rc)
+                               return rc;
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * Symbol-table write callback for a class.
+ *
+ * Writes a six-word little-endian header (name length, common key
+ * length, value, permissions.nprim, permission table element count,
+ * constraint count), the class name, the common key if there is one,
+ * the permissions, the constraints, and finally the validatetrans count
+ * followed by the validatetrans rules.
+ */
+static int class_write(void *vkey, void *datum, void *ptr)
+{
+       struct policy_data *pd = ptr;
+       struct class_datum *cls = datum;
+       struct policydb *p = pd->p;
+       struct constraint_node *cons;
+       char *name = vkey;
+       void *fp = pd->fp;
+       size_t namelen, comlen;
+       __le32 buf[6];
+       u32 count;
+       int rc;
+
+       namelen = strlen(name);
+       comlen = cls->comkey ? strlen(cls->comkey) : 0;
+
+       count = 0;
+       for (cons = cls->constraints; cons; cons = cons->next)
+               count++;
+
+       buf[0] = cpu_to_le32(namelen);
+       buf[1] = cpu_to_le32(comlen);
+       buf[2] = cpu_to_le32(cls->value);
+       buf[3] = cpu_to_le32(cls->permissions.nprim);
+       if (cls->permissions.table)
+               buf[4] = cpu_to_le32(cls->permissions.table->nel);
+       else
+               buf[4] = 0;
+       buf[5] = cpu_to_le32(count);
+       rc = put_entry(buf, sizeof(u32), 6, fp);
+       if (rc)
+               return rc;
+
+       rc = put_entry(name, 1, namelen, fp);
+       if (rc)
+               return rc;
+
+       if (cls->comkey) {
+               rc = put_entry(cls->comkey, 1, comlen, fp);
+               if (rc)
+                       return rc;
+       }
+
+       rc = hashtab_map(cls->permissions.table, perm_write, fp);
+       if (rc)
+               return rc;
+
+       rc = write_cons_helper(p, cls->constraints, fp);
+       if (rc)
+               return rc;
+
+       /* write out the validatetrans rule */
+       count = 0;
+       for (cons = cls->validatetrans; cons; cons = cons->next)
+               count++;
+
+       buf[0] = cpu_to_le32(count);
+       rc = put_entry(buf, sizeof(u32), 1, fp);
+       if (rc)
+               return rc;
+
+       return write_cons_helper(p, cls->validatetrans, fp);
+}
+
+/*
+ * Symbol-table write callback for a role: writes the name length and
+ * role value (plus the bounds when the policy version is at least
+ * POLICYDB_VERSION_BOUNDARY) as little-endian 32-bit values, then the
+ * name bytes, the dominates ebitmap and the types ebitmap.
+ */
+static int role_write(void *vkey, void *datum, void *ptr)
+{
+       struct policy_data *pd = ptr;
+       struct role_datum *role = datum;
+       struct policydb *p = pd->p;
+       char *name = vkey;
+       void *fp = pd->fp;
+       size_t namelen = strlen(name);
+       size_t items = 0;
+       __le32 buf[3];
+       int rc;
+
+       buf[items++] = cpu_to_le32(namelen);
+       buf[items++] = cpu_to_le32(role->value);
+       if (p->policyvers >= POLICYDB_VERSION_BOUNDARY)
+               buf[items++] = cpu_to_le32(role->bounds);
+
+       BUG_ON(items > (sizeof(buf)/sizeof(buf[0])));
+
+       rc = put_entry(buf, sizeof(u32), items, fp);
+       if (rc)
+               return rc;
+
+       rc = put_entry(name, 1, namelen, fp);
+       if (rc)
+               return rc;
+
+       rc = ebitmap_write(&role->dominates, fp);
+       if (rc)
+               return rc;
+
+       return ebitmap_write(&role->types, fp);
+}
+
+/*
+ * Symbol-table write callback for a type.
+ *
+ * For policy versions of at least POLICYDB_VERSION_BOUNDARY the header
+ * is (name length, value, properties, bounds), where properties is a
+ * bit mask built from the primary and attribute flags.  For older
+ * versions it is (name length, value, primary).  The name bytes follow
+ * the header.
+ */
+static int type_write(void *vkey, void *datum, void *ptr)
+{
+       struct policy_data *pd = ptr;
+       struct type_datum *typ = datum;
+       struct policydb *p = pd->p;
+       char *name = vkey;
+       void *fp = pd->fp;
+       size_t namelen = strlen(name);
+       size_t items = 0;
+       __le32 buf[4];
+       int rc;
+
+       buf[items++] = cpu_to_le32(namelen);
+       buf[items++] = cpu_to_le32(typ->value);
+       if (p->policyvers >= POLICYDB_VERSION_BOUNDARY) {
+               u32 props = 0;
+
+               if (typ->primary)
+                       props |= TYPEDATUM_PROPERTY_PRIMARY;
+
+               if (typ->attribute)
+                       props |= TYPEDATUM_PROPERTY_ATTRIBUTE;
+
+               buf[items++] = cpu_to_le32(props);
+               buf[items++] = cpu_to_le32(typ->bounds);
+       } else {
+               buf[items++] = cpu_to_le32(typ->primary);
+       }
+       BUG_ON(items > (sizeof(buf) / sizeof(buf[0])));
+
+       rc = put_entry(buf, sizeof(u32), items, fp);
+       if (rc)
+               return rc;
+
+       return put_entry(name, 1, namelen, fp);
+}
+
+/*
+ * Symbol-table write callback for a user: writes the name length and
+ * user value (plus the bounds when the policy version is at least
+ * POLICYDB_VERSION_BOUNDARY) as little-endian 32-bit values, then the
+ * name bytes, the roles ebitmap, the MLS range and the default level.
+ */
+static int user_write(void *vkey, void *datum, void *ptr)
+{
+       struct policy_data *pd = ptr;
+       struct user_datum *usr = datum;
+       struct policydb *p = pd->p;
+       char *name = vkey;
+       void *fp = pd->fp;
+       size_t namelen = strlen(name);
+       size_t items = 0;
+       __le32 buf[3];
+       int rc;
+
+       buf[items++] = cpu_to_le32(namelen);
+       buf[items++] = cpu_to_le32(usr->value);
+       if (p->policyvers >= POLICYDB_VERSION_BOUNDARY)
+               buf[items++] = cpu_to_le32(usr->bounds);
+       BUG_ON(items > (sizeof(buf) / sizeof(buf[0])));
+
+       rc = put_entry(buf, sizeof(u32), items, fp);
+       if (rc)
+               return rc;
+
+       rc = put_entry(name, 1, namelen, fp);
+       if (rc)
+               return rc;
+
+       rc = ebitmap_write(&usr->roles, fp);
+       if (rc)
+               return rc;
+
+       rc = mls_write_range_helper(&usr->range, fp);
+       if (rc)
+               return rc;
+
+       return mls_write_level(&usr->dfltlevel, fp);
+}
+
+/*
+ * Per-symbol-table write callbacks, SYM_NUM entries in the order
+ * commons, classes, roles, types, users, booleans, sensitivities,
+ * categories.
+ * NOTE(review): presumably indexed by the SYM_* symbol table number and
+ * invoked with a struct policy_data as datap -- confirm against the
+ * caller, which is outside this view.
+ */
+static int (*write_f[SYM_NUM]) (void *key, void *datum,
+                               void *datap) =
+{
+       common_write,
+       class_write,
+       role_write,
+       type_write,
+       user_write,
+       cond_write_bool,
+       sens_write,
+       cat_write,
+};
+
+/*
+ * Write the object context lists.
+ *
+ * For each of the first info->ocon_num ocontext kinds a little-endian
+ * 32-bit element count is written, followed by the kind-specific record
+ * for every element.  Node addresses and masks are copied as stored
+ * (network order) rather than converted to little-endian.
+ *
+ * Returns 0 on success or the first nonzero result from put_entry() or
+ * context_write().
+ */
+static int ocontext_write(struct policydb *p, struct policydb_compat_info *info,
+                         void *fp)
+{
+       unsigned int i, j;
+       /* signed: carries the int results that this function returns */
+       int rc;
+       size_t nel, len;
+       __le32 buf[3];
+       u32 nodebuf[8];
+       struct ocontext *c;
+
+       for (i = 0; i < info->ocon_num; i++) {
+               nel = 0;
+               for (c = p->ocontexts[i]; c; c = c->next)
+                       nel++;
+               buf[0] = cpu_to_le32(nel);
+               rc = put_entry(buf, sizeof(u32), 1, fp);
+               if (rc)
+                       return rc;
+               for (c = p->ocontexts[i]; c; c = c->next) {
+                       switch (i) {
+                       case OCON_ISID:
+                               buf[0] = cpu_to_le32(c->sid[0]);
+                               rc = put_entry(buf, sizeof(u32), 1, fp);
+                               if (rc)
+                                       return rc;
+                               rc = context_write(p, &c->context[0], fp);
+                               if (rc)
+                                       return rc;
+                               break;
+                       case OCON_FS:
+                       case OCON_NETIF:
+                               /* name plus two contexts */
+                               len = strlen(c->u.name);
+                               buf[0] = cpu_to_le32(len);
+                               rc = put_entry(buf, sizeof(u32), 1, fp);
+                               if (rc)
+                                       return rc;
+                               rc = put_entry(c->u.name, 1, len, fp);
+                               if (rc)
+                                       return rc;
+                               rc = context_write(p, &c->context[0], fp);
+                               if (rc)
+                                       return rc;
+                               rc = context_write(p, &c->context[1], fp);
+                               if (rc)
+                                       return rc;
+                               break;
+                       case OCON_PORT:
+                               buf[0] = cpu_to_le32(c->u.port.protocol);
+                               buf[1] = cpu_to_le32(c->u.port.low_port);
+                               buf[2] = cpu_to_le32(c->u.port.high_port);
+                               rc = put_entry(buf, sizeof(u32), 3, fp);
+                               if (rc)
+                                       return rc;
+                               rc = context_write(p, &c->context[0], fp);
+                               if (rc)
+                                       return rc;
+                               break;
+                       case OCON_NODE:
+                               nodebuf[0] = c->u.node.addr; /* network order */
+                               nodebuf[1] = c->u.node.mask; /* network order */
+                               rc = put_entry(nodebuf, sizeof(u32), 2, fp);
+                               if (rc)
+                                       return rc;
+                               rc = context_write(p, &c->context[0], fp);
+                               if (rc)
+                                       return rc;
+                               break;
+                       case OCON_FSUSE:
+                               buf[0] = cpu_to_le32(c->v.behavior);
+                               len = strlen(c->u.name);
+                               buf[1] = cpu_to_le32(len);
+                               rc = put_entry(buf, sizeof(u32), 2, fp);
+                               if (rc)
+                                       return rc;
+                               rc = put_entry(c->u.name, 1, len, fp);
+                               if (rc)
+                                       return rc;
+                               rc = context_write(p, &c->context[0], fp);
+                               if (rc)
+                                       return rc;
+                               break;
+                       case OCON_NODE6:
+                               /* four address words, then four mask words */
+                               for (j = 0; j < 4; j++)
+                                       nodebuf[j] = c->u.node6.addr[j]; /* network order */
+                               for (j = 0; j < 4; j++)
+                                       nodebuf[j + 4] = c->u.node6.mask[j]; /* network order */
+                               rc = put_entry(nodebuf, sizeof(u32), 8, fp);
+                               if (rc)
+                                       return rc;
+                               rc = context_write(p, &c->context[0], fp);
+                               if (rc)
+                                       return rc;
+                               break;
+                       }
+               }
+       }
+       return 0;
+}
+
+/*
+ * Emit the genfs section of the policy image to @fp.
+ *
+ * Layout written: the number of genfs entries, then for every entry the
+ * length and bytes of its fstype string followed by the number of
+ * ocontexts on its list; each ocontext contributes the length and bytes
+ * of its name, its security class and its first context.
+ * Returns 0, or the first non-zero code from put_entry()/context_write().
+ */
+static int genfs_write(struct policydb *p, void *fp)
+{
+       struct genfs *fs;
+       struct ocontext *oc;
+       size_t n;
+       __le32 word[1];
+       int rc;
+
+       /* how many filesystem entries follow */
+       for (n = 0, fs = p->genfs; fs; fs = fs->next)
+               n++;
+       word[0] = cpu_to_le32(n);
+       rc = put_entry(word, sizeof(u32), 1, fp);
+       if (rc)
+               return rc;
+
+       fs = p->genfs;
+       while (fs) {
+               /* length-prefixed filesystem type name */
+               n = strlen(fs->fstype);
+               word[0] = cpu_to_le32(n);
+               rc = put_entry(word, sizeof(u32), 1, fp);
+               if (rc)
+                       return rc;
+               rc = put_entry(fs->fstype, 1, n, fp);
+               if (rc)
+                       return rc;
+
+               /* how many ocontexts hang off this entry */
+               for (n = 0, oc = fs->head; oc; oc = oc->next)
+                       n++;
+               word[0] = cpu_to_le32(n);
+               rc = put_entry(word, sizeof(u32), 1, fp);
+               if (rc)
+                       return rc;
+
+               for (oc = fs->head; oc; oc = oc->next) {
+                       /* length-prefixed name */
+                       n = strlen(oc->u.name);
+                       word[0] = cpu_to_le32(n);
+                       rc = put_entry(word, sizeof(u32), 1, fp);
+                       if (rc)
+                               return rc;
+                       rc = put_entry(oc->u.name, 1, n, fp);
+                       if (rc)
+                               return rc;
+
+                       /* security class, then the context itself */
+                       word[0] = cpu_to_le32(oc->v.sclass);
+                       rc = put_entry(word, sizeof(u32), 1, fp);
+                       if (rc)
+                               return rc;
+                       rc = context_write(p, &oc->context[0], fp);
+                       if (rc)
+                               return rc;
+               }
+
+               fs = fs->next;
+       }
+       return 0;
+}
+
+/*
+ * hashtab_map() callback: bump the int counter that @ptr points to.
+ * @key and @data are unused.  Always returns 0.
+ */
+static int range_count(void *key, void *data, void *ptr)
+{
+       int *counter = ptr;
+
+       (*counter)++;
+       return 0;
+}
+
+/*
+ * hashtab_map() callback used by range_write(): write one range
+ * transition.  @key is the struct range_trans, @data its struct
+ * mls_range and @ptr a struct policy_data carrying the policydb and the
+ * output file.  Returns 0 or the first error encountered.
+ */
+static int range_write_helper(void *key, void *data, void *ptr)
+{
+       struct policy_data *pd = ptr;
+       struct range_trans *trans = key;
+       struct mls_range *range = data;
+       __le32 words[2];
+       int rc;
+
+       /* source and target types are always present */
+       words[0] = cpu_to_le32(trans->source_type);
+       words[1] = cpu_to_le32(trans->target_type);
+       rc = put_entry(words, sizeof(u32), 2, pd->fp);
+       if (rc)
+               return rc;
+
+       /* the target class is only stored from POLICYDB_VERSION_RANGETRANS on */
+       if (pd->p->policyvers >= POLICYDB_VERSION_RANGETRANS) {
+               words[0] = cpu_to_le32(trans->target_class);
+               rc = put_entry(words, sizeof(u32), 1, pd->fp);
+               if (rc)
+                       return rc;
+       }
+
+       return mls_write_range_helper(range, pd->fp);
+}
+
+/*
+ * Write the range transition table of @p to @fp: first the number of
+ * entries in p->range_tr, then every entry via range_write_helper().
+ * Returns 0 on success or the first non-zero code from hashtab_map() /
+ * put_entry().
+ */
+static int range_write(struct policydb *p, void *fp)
+{
+       /*
+        * Must be an int: range_count() increments the counter through an
+        * int pointer.  Declaring it size_t made the callback update only
+        * part of the object, which yields a wrong count on 64-bit
+        * big-endian machines.
+        */
+       int nel;
+       __le32 buf[1];
+       int rc;
+       struct policy_data pd;
+
+       pd.p = p;
+       pd.fp = fp;
+
+       /* count the number of entries in the hashtab */
+       nel = 0;
+       rc = hashtab_map(p->range_tr, range_count, &nel);
+       if (rc)
+               return rc;
+
+       buf[0] = cpu_to_le32(nel);
+       rc = put_entry(buf, sizeof(u32), 1, fp);
+       if (rc)
+               return rc;
+
+       /* actually write all of the entries */
+       rc = hashtab_map(p->range_tr, range_write_helper, &pd);
+       if (rc)
+               return rc;
+
+       return 0;
+}
+
+/*
+ * Write the configuration data in a policy database
+ * structure to a policy database binary representation
+ * file.
+ *
+ * @p:  policy database to serialize
+ * @fp: output handle; it is passed unchanged to put_entry() and the
+ *      other *_write() helpers
+ *
+ * Sections are emitted in this order: magic and identifier string,
+ * version/config/table sizes, the policycaps and permissive bitmaps
+ * (when the policy version has them), the symbol tables, the avtab,
+ * the conditional list, role transitions, role allows, ocontexts,
+ * genfs, range transitions and finally the type->attribute bitmaps.
+ *
+ * Returns 0 on success.  Returns -EINVAL when the policy version is
+ * older than POLICYDB_VERSION_AVTAB or has no compatibility entry;
+ * otherwise the first non-zero code returned by a helper.
+ */
+int policydb_write(struct policydb *p, void *fp)
+{
+       unsigned int i, num_syms;
+       int rc;
+       __le32 buf[4];
+       u32 config;
+       size_t len;
+       struct policydb_compat_info *info;
+
+       /*
+        * refuse to write policy older than compressed avtab
+        * to simplify the writer.  There are other tests dropped
+        * since we assume this throughout the writer code.  Be
+        * careful if you ever try to remove this restriction
+        */
+       if (p->policyvers < POLICYDB_VERSION_AVTAB) {
+               printk(KERN_ERR "SELinux: refusing to write policy version %d."
+                      "  Because it is less than version %d\n", p->policyvers,
+                      POLICYDB_VERSION_AVTAB);
+               return -EINVAL;
+       }
+
+       config = 0;
+       if (p->mls_enabled)
+               config |= POLICYDB_CONFIG_MLS;
+
+       if (p->reject_unknown)
+               config |= REJECT_UNKNOWN;
+       if (p->allow_unknown)
+               config |= ALLOW_UNKNOWN;
+
+       /* Write the magic number and string identifiers. */
+       buf[0] = cpu_to_le32(POLICYDB_MAGIC);
+       len = strlen(POLICYDB_STRING);
+       buf[1] = cpu_to_le32(len);
+       rc = put_entry(buf, sizeof(u32), 2, fp);
+       if (rc)
+               return rc;
+       rc = put_entry(POLICYDB_STRING, 1, len, fp);
+       if (rc)
+               return rc;
+
+       /* Write the version, config, and table sizes. */
+       info = policydb_lookup_compat(p->policyvers);
+       if (!info) {
+               printk(KERN_ERR "SELinux: compatibility lookup failed for policy "
+                   "version %d\n", p->policyvers);
+               /*
+                * rc is 0 at this point (the preceding put_entry()
+                * succeeded), so it must not be returned: report the
+                * failure explicitly instead of claiming success.
+                */
+               return -EINVAL;
+       }
+
+       buf[0] = cpu_to_le32(p->policyvers);
+       buf[1] = cpu_to_le32(config);
+       buf[2] = cpu_to_le32(info->sym_num);
+       buf[3] = cpu_to_le32(info->ocon_num);
+
+       rc = put_entry(buf, sizeof(u32), 4, fp);
+       if (rc)
+               return rc;
+
+       if (p->policyvers >= POLICYDB_VERSION_POLCAP) {
+               rc = ebitmap_write(&p->policycaps, fp);
+               if (rc)
+                       return rc;
+       }
+
+       if (p->policyvers >= POLICYDB_VERSION_PERMISSIVE) {
+               rc = ebitmap_write(&p->permissive_map, fp);
+               if (rc)
+                       return rc;
+       }
+
+       /* One header (nprim, nel) plus the entries for every symbol table. */
+       num_syms = info->sym_num;
+       for (i = 0; i < num_syms; i++) {
+               struct policy_data pd;
+
+               pd.fp = fp;
+               pd.p = p;
+
+               buf[0] = cpu_to_le32(p->symtab[i].nprim);
+               buf[1] = cpu_to_le32(p->symtab[i].table->nel);
+
+               rc = put_entry(buf, sizeof(u32), 2, fp);
+               if (rc)
+                       return rc;
+               rc = hashtab_map(p->symtab[i].table, write_f[i], &pd);
+               if (rc)
+                       return rc;
+       }
+
+       rc = avtab_write(p, &p->te_avtab, fp);
+       if (rc)
+               return rc;
+
+       rc = cond_write_list(p, p->cond_list, fp);
+       if (rc)
+               return rc;
+
+       rc = role_trans_write(p->role_tr, fp);
+       if (rc)
+               return rc;
+
+       rc = role_allow_write(p->role_allow, fp);
+       if (rc)
+               return rc;
+
+       rc = ocontext_write(p, info, fp);
+       if (rc)
+               return rc;
+
+       rc = genfs_write(p, fp);
+       if (rc)
+               return rc;
+
+       rc = range_write(p, fp);
+       if (rc)
+               return rc;
+
+       /* One ebitmap per primary type, taken from type_attr_map_array. */
+       for (i = 0; i < p->p_types.nprim; i++) {
+               struct ebitmap *e = flex_array_get(p->type_attr_map_array, i);
+
+               BUG_ON(!e);
+               rc = ebitmap_write(e, fp);
+               if (rc)
+                       return rc;
+       }
+
+       return 0;
+}
diff --git a/security/selinux/ss/policydb.h b/security/selinux/ss/policydb.h

index 310e944..95d3d7d 100644 (file)
--- a/security/selinux/ss/policydb.h
+++ b/security/selinux/ss/policydb.h
@@ -254,6 +254,9 @@ struct policydb {
  
         struct ebitmap permissive_map;
  
+       /* length of this policy when it was loaded */
+       size_t len;
+
         unsigned int policyvers;
  
         unsigned int reject_unknown : 1;
@@ -270,6 +273,7 @@ extern int policydb_class_isvalid(struct policydb *p, unsigned int class);
  extern int policydb_type_isvalid(struct policydb *p, unsigned int type);
  extern int policydb_role_isvalid(struct policydb *p, unsigned int role);
  extern int policydb_read(struct policydb *p, void *fp);
+extern int policydb_write(struct policydb *p, void *fp);
  
  #define PERM_SYMTAB_SIZE 32
  
@@ -290,6 +294,11 @@ struct policy_file {
         size_t len;
  };
  
+/*
+ * Argument bundle handed to the hashtab_map() callbacks of the policy
+ * writer through their opaque pointer parameter.
+ */
+struct policy_data {
+       /* policy database being written */
+       struct policydb *p;
+       /* output handle, passed on to put_entry() by the callbacks */
+       void *fp;
+};
+
  static inline int next_entry(void *buf, struct policy_file *fp, size_t bytes)
  {
         if (bytes > fp->len)
@@ -301,6 +310,17 @@ static inline int next_entry(void *buf, struct policy_file *fp, size_t bytes)
         return 0;
  }
  
+/*
+ * Copy @num items of @bytes bytes each from @buf to the current position
+ * of @fp, then advance fp->data and shrink fp->len by the amount copied.
+ *
+ * Returns 0 on success, or -EINVAL without writing anything when the
+ * data does not fit in the fp->len bytes that remain (mirrors the bound
+ * check done by next_entry() on the read side).
+ */
+static inline int put_entry(void *buf, size_t bytes, int num, struct policy_file *fp)
+{
+       size_t len = bytes * num;
+
+       /* never write past the end of the destination buffer */
+       if (len > fp->len)
+               return -EINVAL;
+
+       memcpy(fp->data, buf, len);
+       fp->data += len;
+       fp->len -= len;
+
+       return 0;
+}
+
  extern u16 string_to_security_class(struct policydb *p, const char *name);
  extern u32 string_to_av_perm(struct policydb *p, u16 tclass, const char *name);
  
diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c

index 9ea2fec..223c1ff 100644 (file)
--- a/security/selinux/ss/services.c
+++ b/security/selinux/ss/services.c
@@ -51,6 +51,7 @@
  #include <linux/mutex.h>
  #include <linux/selinux.h>
  #include <linux/flex_array.h>
+#include <linux/vmalloc.h>
  #include <net/netlabel.h>
  
  #include "flask.h"
@@ -991,7 +992,8 @@ static int context_struct_to_string(struct context *context, char **scontext, u3
  {
         char *scontextp;
  
-       *scontext = NULL;
+       if (scontext)
+               *scontext = NULL;
         *scontext_len = 0;
  
         if (context->len) {
@@ -1008,6 +1010,9 @@ static int context_struct_to_string(struct context *context, char **scontext, u3
         *scontext_len += strlen(policydb.p_type_val_to_name[context->type - 1]) + 1;
         *scontext_len += mls_compute_context_len(context);
  
+       if (!scontext)
+               return 0;
+
         /* Allocate space for the context; caller must free this space. */
         scontextp = kmalloc(*scontext_len, GFP_ATOMIC);
         if (!scontextp)
@@ -1047,7 +1052,8 @@ static int security_sid_to_context_core(u32 sid, char **scontext,
         struct context *context;
         int rc = 0;
  
-       *scontext = NULL;
+       if (scontext)
+               *scontext = NULL;
         *scontext_len  = 0;
  
         if (!ss_initialized) {
@@ -1055,6 +1061,8 @@ static int security_sid_to_context_core(u32 sid, char **scontext,
                         char *scontextp;
  
                         *scontext_len = strlen(initial_sid_to_string[sid]) + 1;
+                       if (!scontext)
+                               goto out;
                         scontextp = kmalloc(*scontext_len, GFP_ATOMIC);
                         if (!scontextp) {
                                 rc = -ENOMEM;
@@ -1769,6 +1777,7 @@ int security_load_policy(void *data, size_t len)
                         return rc;
                 }
  
+               policydb.len = len;
                 rc = selinux_set_mapping(&policydb, secclass_map,
                                          &current_mapping,
                                          &current_mapping_size);
@@ -1791,6 +1800,7 @@ int security_load_policy(void *data, size_t len)
                 selinux_complete_init();
                 avc_ss_reset(seqno);
                 selnl_notify_policyload(seqno);
+               selinux_status_update_policyload(seqno);
                 selinux_netlbl_cache_invalidate();
                 selinux_xfrm_notify_policyload();
                 return 0;
@@ -1804,6 +1814,7 @@ int security_load_policy(void *data, size_t len)
         if (rc)
                 return rc;
  
+       newpolicydb.len = len;
         /* If switching between different policy types, log MLS status */
         if (policydb.mls_enabled && !newpolicydb.mls_enabled)
                 printk(KERN_INFO "SELinux: Disabling MLS support...\n");
@@ -1870,6 +1881,7 @@ int security_load_policy(void *data, size_t len)
  
         avc_ss_reset(seqno);
         selnl_notify_policyload(seqno);
+       selinux_status_update_policyload(seqno);
         selinux_netlbl_cache_invalidate();
         selinux_xfrm_notify_policyload();
  
@@ -1883,6 +1895,17 @@ err:
  
  }
  
+/*
+ * Return the value of policydb.len, sampled while holding
+ * policy_rwlock for reading.
+ */
+size_t security_policydb_len(void)
+{
+       size_t cur_len;
+
+       read_lock(&policy_rwlock);
+       cur_len = policydb.len;
+       read_unlock(&policy_rwlock);
+
+       return cur_len;
+}
+
  /**
   * security_port_sid - Obtain the SID for a port.
   * @protocol: protocol number
@@ -2374,6 +2397,7 @@ out:
         if (!rc) {
                 avc_ss_reset(seqno);
                 selnl_notify_policyload(seqno);
+               selinux_status_update_policyload(seqno);
                 selinux_xfrm_notify_policyload();
         }
         return rc;
@@ -3129,3 +3153,38 @@ netlbl_sid_to_secattr_failure:
         return rc;
  }
  #endif /* CONFIG_NETLABEL */
+
+/**
+ * security_read_policy - serialize the active policy into a new buffer.
+ * @data: out parameter; receives a vmalloc_user() buffer holding the
+ *        binary policy
+ * @len: out parameter; receives the number of bytes written to *@data
+ *
+ * Returns 0 on success, -EINVAL if no policy has been loaded yet,
+ * -ENOMEM if the buffer cannot be allocated, or the error reported by
+ * policydb_write().  On success the caller owns *@data and releases it
+ * with vfree().  If serialization fails the buffer is freed here and
+ * *@data is reset to NULL, so a caller that still calls vfree() on it
+ * remains correct.
+ */
+int security_read_policy(void **data, ssize_t *len)
+{
+       int rc;
+       struct policy_file fp;
+
+       if (!ss_initialized)
+               return -EINVAL;
+
+       /*
+        * The buffer is sized from the length recorded at load time.
+        * NOTE(review): the lock is dropped between this read and the
+        * write below, so the policy could be replaced in between --
+        * confirm that put_entry() bounds-checks against fp.len.
+        */
+       *len = security_policydb_len();
+
+       *data = vmalloc_user(*len);
+       if (!*data)
+               return -ENOMEM;
+
+       fp.data = *data;
+       fp.len = *len;
+
+       read_lock(&policy_rwlock);
+       rc = policydb_write(&policydb, &fp);
+       read_unlock(&policy_rwlock);
+
+       if (rc) {
+               /* do not hand a half-written buffer back to the caller */
+               vfree(*data);
+               *data = NULL;
+               return rc;
+       }
+
+       /* report how many bytes were actually produced */
+       *len = (unsigned long)fp.data - (unsigned long)*data;
+       return 0;
+
+}
diff --git a/security/selinux/ss/status.c b/security/selinux/ss/status.c

new file mode 100644 (file)

index 0000000..d982365
--- /dev/null
+++ b/security/selinux/ss/status.c
@@ -0,0 +1,126 @@
+/*
+ * mmap based event notifications for SELinux
+ *
+ * Author: KaiGai Kohei <kaigai@ak.jp.nec.com>
+ *
+ * Copyright (C) 2010 NEC corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2,
+ * as published by the Free Software Foundation.
+ */
+#include <linux/kernel.h>
+#include <linux/gfp.h>
+#include <linux/mm.h>
+#include <linux/mutex.h>
+#include "avc.h"
+#include "services.h"
+
+/*
+ * The selinux_status_page is exposed to userspace applications through
+ * the mmap interface on /selinux/status.
+ * It lets the kernel notify applications of the few events that require
+ * a reset of the userspace access vector cache, without a context switch.
+ *
+ * The selinux_kernel_status structure at the head of the status page is
+ * protected from concurrent accesses using seqlock logic, so a userspace
+ * application should read the status page according to the seqlock
+ * logic.
+ *
+ * Typically, an application checks status->sequence at the start of its
+ * access control routine. If it is an odd number, the kernel is updating
+ * the status, so the application should wait a moment and retry. If it
+ * differs from the last sequence number seen, something has happened, so
+ * the application resets its userspace avc, if needed.
+ * In most cases, the application can confirm that the kernel status is
+ * unchanged without invoking any system call.
+ */
+/* Status page; NULL until selinux_kernel_status_page() allocates it. */
+static struct page *selinux_status_page;
+/* Held around the allocation and every update of the status page. */
+static DEFINE_MUTEX(selinux_status_lock);
+
+/*
+ * selinux_kernel_status_page
+ *
+ * Returns the selinux_status_page pointer. On the first call (or after
+ * an earlier allocation failure) it tries to allocate the page and fills
+ * in the initial selinux_kernel_status values. Returns NULL when the
+ * page could not be allocated.
+ */
+struct page *selinux_kernel_status_page(void)
+{
+       struct selinux_kernel_status   *status;
+       struct page                    *result = NULL;
+
+       mutex_lock(&selinux_status_lock);
+       if (!selinux_status_page) {
+               /* __GFP_ZERO: the page starts out zero-filled */
+               selinux_status_page = alloc_page(GFP_KERNEL|__GFP_ZERO);
+
+               if (selinux_status_page) {
+                       status = page_address(selinux_status_page);
+
+                       status->version = SELINUX_KERNEL_STATUS_VERSION;
+                       /* even value: no update in progress */
+                       status->sequence = 0;
+                       status->enforcing = selinux_enforcing;
+                       /*
+                        * NOTE: the next policyload event shall set
+                        * a positive value on the status->policyload,
+                        * although it may not be 1, but never zero.
+                        * So, application can know it was updated.
+                        */
+                       status->policyload = 0;
+                       status->deny_unknown = !security_get_allow_unknown();
+               }
+       }
+       /* still NULL here if the allocation above failed */
+       result = selinux_status_page;
+       mutex_unlock(&selinux_status_lock);
+
+       return result;
+}
+
+/*
+ * selinux_status_update_setenforce
+ *
+ * Publishes the current enforcing/permissive mode (@enforcing) in the
+ * status page. Does nothing if the status page has not been allocated.
+ */
+void selinux_status_update_setenforce(int enforcing)
+{
+       struct selinux_kernel_status   *status;
+
+       mutex_lock(&selinux_status_lock);
+       if (selinux_status_page) {
+               status = page_address(selinux_status_page);
+
+               /* first increment: sequence becomes odd while updating */
+               status->sequence++;
+               smp_wmb();
+
+               status->enforcing = enforcing;
+
+               /* second increment: sequence is even again, update done */
+               smp_wmb();
+               status->sequence++;
+       }
+       mutex_unlock(&selinux_status_lock);
+}
+
+/*
+ * selinux_status_update_policyload
+ *
+ * Records @seqno as the policyload value and refreshes deny_unknown
+ * from security_get_allow_unknown() in the status page. Does nothing
+ * if the status page has not been allocated.
+ */
+void selinux_status_update_policyload(int seqno)
+{
+       struct selinux_kernel_status   *status;
+
+       mutex_lock(&selinux_status_lock);
+       if (selinux_status_page) {
+               status = page_address(selinux_status_page);
+
+               /* first increment: sequence becomes odd while updating */
+               status->sequence++;
+               smp_wmb();
+
+               status->policyload = seqno;
+               status->deny_unknown = !security_get_allow_unknown();
+
+               /* second increment: sequence is even again, update done */
+               smp_wmb();
+               status->sequence++;
+       }
+       mutex_unlock(&selinux_status_lock);
+}
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c

index c448d57..bc39f40 100644 (file)
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -1281,12 +1281,11 @@ static int smack_task_getioprio(struct task_struct *p)
   *
   * Return 0 if read access is permitted
   */
-static int smack_task_setscheduler(struct task_struct *p, int policy,
-                                  struct sched_param *lp)
+static int smack_task_setscheduler(struct task_struct *p)
  {
         int rc;
  
-       rc = cap_task_setscheduler(p, policy, lp);
+       rc = cap_task_setscheduler(p);
         if (rc == 0)
                 rc = smk_curacc_on_task(p, MAY_WRITE);
         return rc;
@@ -3005,7 +3004,8 @@ static int smack_secid_to_secctx(u32 secid, char **secdata, u32 *seclen)
  {
         char *sp = smack_from_secid(secid);
  
-       *secdata = sp;
+       if (secdata)
+               *secdata = sp;
         *seclen = strlen(sp);
         return 0;
  }
diff --git a/security/tomoyo/common.c b/security/tomoyo/common.c

index c668b44..7556315 100644 (file)
--- a/security/tomoyo/common.c
+++ b/security/tomoyo/common.c
@@ -768,8 +768,10 @@ static bool tomoyo_select_one(struct tomoyo_io_buffer *head, const char *data)
                 return true; /* Do nothing if open(O_WRONLY). */
         memset(&head->r, 0, sizeof(head->r));
         head->r.print_this_domain_only = true;
-       head->r.eof = !domain;
-       head->r.domain = &domain->list;
+       if (domain)
+               head->r.domain = &domain->list;
+       else
+               head->r.eof = 1;
         tomoyo_io_printf(head, "# select %s\n", data);
         if (domain && domain->is_deleted)
                 tomoyo_io_printf(head, "# This is a deleted domain.\n");
@@ -2051,13 +2053,22 @@ void tomoyo_check_profile(void)
                 const u8 profile = domain->profile;
                 if (tomoyo_profile_ptr[profile])
                         continue;
+               printk(KERN_ERR "You need to define profile %u before using it.\n",
+                      profile);
+               printk(KERN_ERR "Please see http://tomoyo.sourceforge.jp/2.3/ "
+                      "for more information.\n");
                 panic("Profile %u (used by '%s') not defined.\n",
                       profile, domain->domainname->name);
         }
         tomoyo_read_unlock(idx);
-       if (tomoyo_profile_version != 20090903)
+       if (tomoyo_profile_version != 20090903) {
+               printk(KERN_ERR "You need to install userland programs for "
+                      "TOMOYO 2.3 and initialize policy configuration.\n");
+               printk(KERN_ERR "Please see http://tomoyo.sourceforge.jp/2.3/ "
+                      "for more information.\n");
                 panic("Profile version %u is not supported.\n",
                       tomoyo_profile_version);
+       }
         printk(KERN_INFO "TOMOYO: 2.3.0\n");
         printk(KERN_INFO "Mandatory Access Control activated.\n");
  }
author	Robert Richter <robert.richter@amd.com>
	Mon, 25 Oct 2010 14:28:14 +0000 (16:28 +0200)
committer	Robert Richter <robert.richter@amd.com>
	Mon, 25 Oct 2010 14:29:12 +0000 (16:29 +0200)