Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
author     Linus Torvalds <torvalds@linux-foundation.org>
           Sat, 7 Jan 2012 01:22:09 +0000 (17:22 -0800)
committer  Linus Torvalds <torvalds@linux-foundation.org>
           Sat, 7 Jan 2012 01:22:09 +0000 (17:22 -0800)
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1958 commits)
  net: pack skb_shared_info more efficiently
  net_sched: red: split red_parms into parms and vars
  net_sched: sfq: extend limits
  cnic: Improve error recovery on bnx2x devices
  cnic: Re-init dev->stats_addr after chip reset
  net_sched: Bug in netem reordering
  bna: fix sparse warnings/errors
  bna: make ethtool_ops and strings const
  xgmac: cleanups
  net: make ethtool_ops const
  vmxnet3: make ethtool ops const
  xen-netback: make ops structs const
  virtio_net: Pass gfp flags when allocating rx buffers.
  ixgbe: FCoE: Add support for ndo_get_fcoe_hbainfo() call
  netdev: FCoE: Add new ndo_get_fcoe_hbainfo() call
  igb: reset PHY after recovering from PHY power down
  igb: add basic runtime PM support
  igb: Add support for byte queue limits.
  e1000: cleanup CE4100 MDIO registers access
  e1000: unmap ce4100_gbe_mdio_base_virt in e1000_remove
  ...

396 files changed:
Documentation/DocBook/debugobjects.tmpl
Documentation/RCU/checklist.txt
Documentation/RCU/rcu.txt
Documentation/RCU/stallwarn.txt
Documentation/RCU/torture.txt
Documentation/RCU/trace.txt
Documentation/RCU/whatisRCU.txt
Documentation/atomic_ops.txt
Documentation/kernel-parameters.txt
Documentation/lockdep-design.txt
Documentation/trace/events.txt
arch/Kconfig
arch/arm/kernel/process.c
arch/arm/kernel/setup.c
arch/arm/mm/init.c
arch/avr32/kernel/process.c
arch/blackfin/kernel/process.c
arch/cris/arch-v32/kernel/time.c
arch/ia64/Kconfig
arch/ia64/include/asm/cputime.h
arch/ia64/mm/contig.c
arch/ia64/mm/init.c
arch/m68k/platform/68328/timers.c
arch/m68k/platform/coldfire/dma_timer.c
arch/m68k/platform/coldfire/pit.c
arch/m68k/platform/coldfire/sltimers.c
arch/m68k/platform/coldfire/timers.c
arch/microblaze/include/asm/memblock.h [deleted file]
arch/microblaze/kernel/process.c
arch/microblaze/kernel/prom.c
arch/mips/Kconfig
arch/mips/kernel/process.c
arch/mips/kernel/setup.c
arch/mips/sgi-ip27/ip27-memory.c
arch/openrisc/include/asm/memblock.h [deleted file]
arch/openrisc/kernel/idle.c
arch/openrisc/kernel/prom.c
arch/parisc/kernel/time.c
arch/powerpc/Kconfig
arch/powerpc/include/asm/cputime.h
arch/powerpc/include/asm/memblock.h [deleted file]
arch/powerpc/kernel/idle.c
arch/powerpc/kernel/machine_kexec.c
arch/powerpc/kernel/prom.c
arch/powerpc/mm/init_32.c
arch/powerpc/mm/mem.c
arch/powerpc/mm/numa.c
arch/powerpc/mm/tlb_nohash.c
arch/powerpc/platforms/embedded6xx/wii.c
arch/powerpc/platforms/iseries/setup.c
arch/powerpc/platforms/ps3/mm.c
arch/powerpc/platforms/pseries/lpar.c
arch/s390/Kconfig
arch/s390/appldata/appldata_os.c
arch/s390/include/asm/cputime.h
arch/s390/kernel/process.c
arch/s390/kernel/setup.c
arch/s390/oprofile/hwsampler.c
arch/s390/oprofile/init.c
arch/s390/oprofile/op_counter.h [new file with mode: 0644]
arch/score/Kconfig
arch/score/kernel/setup.c
arch/sh/Kconfig
arch/sh/include/asm/memblock.h [deleted file]
arch/sh/kernel/idle.c
arch/sh/kernel/machine_kexec.c
arch/sh/kernel/setup.c
arch/sh/mm/Kconfig
arch/sh/mm/init.c
arch/sparc/Kconfig
arch/sparc/include/asm/memblock.h [deleted file]
arch/sparc/kernel/process_64.c
arch/sparc/kernel/setup_32.c
arch/sparc/mm/init_64.c
arch/tile/kernel/process.c
arch/tile/mm/fault.c
arch/um/kernel/process.c
arch/um/kernel/time.c
arch/unicore32/kernel/process.c
arch/unicore32/kernel/setup.c
arch/unicore32/mm/init.c
arch/unicore32/mm/mmu.c
arch/x86/Kconfig
arch/x86/ia32/ia32entry.S
arch/x86/include/asm/alternative-asm.h
arch/x86/include/asm/apic.h
arch/x86/include/asm/apic_flat_64.h [new file with mode: 0644]
arch/x86/include/asm/apicdef.h
arch/x86/include/asm/bitops.h
arch/x86/include/asm/cmpxchg.h
arch/x86/include/asm/cmpxchg_32.h
arch/x86/include/asm/cmpxchg_64.h
arch/x86/include/asm/div64.h
arch/x86/include/asm/e820.h
arch/x86/include/asm/hardirq.h
arch/x86/include/asm/i387.h
arch/x86/include/asm/insn.h
arch/x86/include/asm/mach_timer.h
arch/x86/include/asm/mc146818rtc.h
arch/x86/include/asm/mce.h
arch/x86/include/asm/memblock.h [deleted file]
arch/x86/include/asm/microcode.h
arch/x86/include/asm/numachip/numachip_csr.h [new file with mode: 0644]
arch/x86/include/asm/percpu.h
arch/x86/include/asm/perf_event.h
arch/x86/include/asm/pgtable.h
arch/x86/include/asm/processor-flags.h
arch/x86/include/asm/processor.h
arch/x86/include/asm/spinlock.h
arch/x86/include/asm/thread_info.h
arch/x86/include/asm/topology.h
arch/x86/include/asm/tsc.h
arch/x86/include/asm/uaccess.h
arch/x86/include/asm/x86_init.h
arch/x86/kernel/acpi/boot.c
arch/x86/kernel/amd_nb.c
arch/x86/kernel/aperture_64.c
arch/x86/kernel/apic/Makefile
arch/x86/kernel/apic/apic.c
arch/x86/kernel/apic/apic_flat_64.c
arch/x86/kernel/apic/apic_numachip.c [new file with mode: 0644]
arch/x86/kernel/apic/io_apic.c
arch/x86/kernel/check.c
arch/x86/kernel/cpu/amd.c
arch/x86/kernel/cpu/centaur.c
arch/x86/kernel/cpu/common.c
arch/x86/kernel/cpu/cpu.h
arch/x86/kernel/cpu/intel.c
arch/x86/kernel/cpu/mcheck/mce-inject.c
arch/x86/kernel/cpu/mcheck/mce.c
arch/x86/kernel/cpu/mcheck/mce_amd.c
arch/x86/kernel/cpu/mcheck/therm_throt.c
arch/x86/kernel/cpu/mcheck/threshold.c
arch/x86/kernel/cpu/perf_event.c
arch/x86/kernel/cpu/perf_event.h
arch/x86/kernel/cpu/perf_event_amd.c
arch/x86/kernel/cpu/perf_event_intel.c
arch/x86/kernel/cpu/powerflags.c
arch/x86/kernel/cpu/proc.c
arch/x86/kernel/e820.c
arch/x86/kernel/entry_32.S
arch/x86/kernel/entry_64.S
arch/x86/kernel/head.c
arch/x86/kernel/head32.c
arch/x86/kernel/head64.c
arch/x86/kernel/hpet.c
arch/x86/kernel/irq.c
arch/x86/kernel/jump_label.c
arch/x86/kernel/microcode_amd.c
arch/x86/kernel/microcode_core.c
arch/x86/kernel/mpparse.c
arch/x86/kernel/process.c
arch/x86/kernel/process_32.c
arch/x86/kernel/process_64.c
arch/x86/kernel/ptrace.c
arch/x86/kernel/setup.c
arch/x86/kernel/smpboot.c
arch/x86/kernel/trampoline.c
arch/x86/kernel/traps.c
arch/x86/kernel/tsc.c
arch/x86/kernel/tsc_sync.c
arch/x86/kernel/vsyscall_64.c
arch/x86/kernel/x86_init.c
arch/x86/lib/inat.c
arch/x86/lib/insn.c
arch/x86/lib/string_32.c
arch/x86/lib/x86-opcode-map.txt
arch/x86/mm/Makefile
arch/x86/mm/extable.c
arch/x86/mm/fault.c
arch/x86/mm/init.c
arch/x86/mm/init_32.c
arch/x86/mm/init_64.c
arch/x86/mm/memblock.c [deleted file]
arch/x86/mm/memtest.c
arch/x86/mm/numa.c
arch/x86/mm/numa_32.c
arch/x86/mm/numa_64.c
arch/x86/mm/numa_emulation.c
arch/x86/mm/pageattr.c
arch/x86/mm/srat.c
arch/x86/oprofile/Makefile
arch/x86/oprofile/init.c
arch/x86/oprofile/nmi_int.c
arch/x86/oprofile/nmi_timer_int.c [deleted file]
arch/x86/platform/efi/efi.c
arch/x86/tools/Makefile
arch/x86/tools/gen-insn-attr-x86.awk
arch/x86/tools/insn_sanity.c [new file with mode: 0644]
arch/x86/xen/enlighten.c
arch/x86/xen/mmu.c
arch/x86/xen/setup.c
arch/xtensa/kernel/time.c
block/ioctl.c
drivers/base/cpu.c
drivers/char/random.c
drivers/clocksource/acpi_pm.c
drivers/clocksource/i8253.c
drivers/clocksource/tcb_clksrc.c
drivers/cpufreq/cpufreq_conservative.c
drivers/cpufreq/cpufreq_ondemand.c
drivers/cpufreq/cpufreq_stats.c
drivers/edac/i7core_edac.c
drivers/edac/mce_amd.c
drivers/edac/sb_edac.c
drivers/hwmon/coretemp.c
drivers/iommu/intel-iommu.c
drivers/lguest/x86/core.c
drivers/macintosh/rack-meter.c
drivers/oprofile/nmi_timer_int.c [new file with mode: 0644]
drivers/oprofile/oprof.c
drivers/oprofile/oprof.h
drivers/oprofile/timer_int.c
drivers/pci/Kconfig
drivers/pci/ioapic.c
fs/compat_ioctl.c
fs/ioctl.c
fs/proc/array.c
fs/proc/stat.c
fs/proc/uptime.c
include/asm-generic/cputime.h
include/linux/bitops.h
include/linux/bootmem.h
include/linux/cpu.h
include/linux/debugobjects.h
include/linux/hardirq.h
include/linux/jump_label.h
include/linux/kernel_stat.h
include/linux/latencytop.h
include/linux/lockdep.h
include/linux/memblock.h
include/linux/mm.h
include/linux/mmzone.h
include/linux/perf_event.h
include/linux/poison.h
include/linux/rcupdate.h
include/linux/sched.h
include/linux/srcu.h
include/linux/tick.h
include/linux/wait.h
include/trace/events/rcu.h
include/trace/events/sched.h
init/Kconfig
init/main.c
kernel/Makefile
kernel/acct.c
kernel/cpu.c
kernel/debug/kdb/kdb_support.c
kernel/events/Makefile
kernel/events/callchain.c [new file with mode: 0644]
kernel/events/core.c
kernel/events/internal.h
kernel/exit.c
kernel/fork.c
kernel/itimer.c
kernel/jump_label.c
kernel/lockdep.c
kernel/panic.c
kernel/posix-cpu-timers.c
kernel/printk.c
kernel/rcu.h
kernel/rcupdate.c
kernel/rcutiny.c
kernel/rcutiny_plugin.h
kernel/rcutorture.c
kernel/rcutree.c
kernel/rcutree.h
kernel/rcutree_plugin.h
kernel/rcutree_trace.c
kernel/rtmutex-debug.c
kernel/rtmutex.c
kernel/sched.c [deleted file]
kernel/sched/Makefile [new file with mode: 0644]
kernel/sched/auto_group.c [new file with mode: 0644]
kernel/sched/auto_group.h [new file with mode: 0644]
kernel/sched/clock.c [new file with mode: 0644]
kernel/sched/core.c [new file with mode: 0644]
kernel/sched/cpupri.c [new file with mode: 0644]
kernel/sched/cpupri.h [new file with mode: 0644]
kernel/sched/debug.c [new file with mode: 0644]
kernel/sched/fair.c [new file with mode: 0644]
kernel/sched/features.h [new file with mode: 0644]
kernel/sched/idle_task.c [new file with mode: 0644]
kernel/sched/rt.c [new file with mode: 0644]
kernel/sched/sched.h [new file with mode: 0644]
kernel/sched/stats.c [new file with mode: 0644]
kernel/sched/stats.h [new file with mode: 0644]
kernel/sched/stop_task.c [new file with mode: 0644]
kernel/sched_autogroup.c [deleted file]
kernel/sched_autogroup.h [deleted file]
kernel/sched_clock.c [deleted file]
kernel/sched_cpupri.c [deleted file]
kernel/sched_cpupri.h [deleted file]
kernel/sched_debug.c [deleted file]
kernel/sched_fair.c [deleted file]
kernel/sched_features.h [deleted file]
kernel/sched_idletask.c [deleted file]
kernel/sched_rt.c [deleted file]
kernel/sched_stats.h [deleted file]
kernel/sched_stoptask.c [deleted file]
kernel/signal.c
kernel/softirq.c
kernel/sys.c
kernel/time/tick-sched.c
kernel/time/timekeeping.c
kernel/timer.c
kernel/trace/trace.c
kernel/trace/trace.h
kernel/trace/trace_events_filter.c
kernel/trace/trace_irqsoff.c
kernel/trace/trace_output.c
kernel/trace/trace_sched_wakeup.c
kernel/tsacct.c
kernel/wait.c
lib/debugobjects.c
mm/Kconfig
mm/memblock.c
mm/nobootmem.c
mm/page_alloc.c
mm/slub.c
net/socket.c
tools/perf/Documentation/perf-annotate.txt
tools/perf/Documentation/perf-buildid-list.txt
tools/perf/Documentation/perf-evlist.txt
tools/perf/Documentation/perf-kmem.txt
tools/perf/Documentation/perf-lock.txt
tools/perf/Documentation/perf-record.txt
tools/perf/Documentation/perf-report.txt
tools/perf/Documentation/perf-sched.txt
tools/perf/Documentation/perf-script.txt
tools/perf/Documentation/perf-test.txt
tools/perf/Documentation/perf-timechart.txt
tools/perf/Makefile
tools/perf/builtin-annotate.c
tools/perf/builtin-buildid-list.c
tools/perf/builtin-diff.c
tools/perf/builtin-evlist.c
tools/perf/builtin-inject.c
tools/perf/builtin-kmem.c
tools/perf/builtin-kvm.c
tools/perf/builtin-lock.c
tools/perf/builtin-probe.c
tools/perf/builtin-record.c
tools/perf/builtin-report.c
tools/perf/builtin-sched.c
tools/perf/builtin-script.c
tools/perf/builtin-stat.c
tools/perf/builtin-test.c
tools/perf/builtin-timechart.c
tools/perf/builtin-top.c
tools/perf/perf.c
tools/perf/perf.h
tools/perf/util/annotate.c
tools/perf/util/annotate.h
tools/perf/util/build-id.c
tools/perf/util/build-id.h
tools/perf/util/callchain.h
tools/perf/util/cgroup.c
tools/perf/util/config.c
tools/perf/util/debugfs.c
tools/perf/util/debugfs.h
tools/perf/util/event.c
tools/perf/util/event.h
tools/perf/util/evlist.c
tools/perf/util/evlist.h
tools/perf/util/evsel.c
tools/perf/util/evsel.h
tools/perf/util/header.c
tools/perf/util/header.h
tools/perf/util/hist.h
tools/perf/util/include/linux/bitops.h
tools/perf/util/map.c
tools/perf/util/map.h
tools/perf/util/parse-events.c
tools/perf/util/parse-events.h
tools/perf/util/probe-finder.h
tools/perf/util/scripting-engines/trace-event-perl.c
tools/perf/util/scripting-engines/trace-event-python.c
tools/perf/util/session.c
tools/perf/util/session.h
tools/perf/util/setup.py
tools/perf/util/symbol.c
tools/perf/util/symbol.h
tools/perf/util/thread.c
tools/perf/util/thread.h
tools/perf/util/tool.h [new file with mode: 0644]
tools/perf/util/top.h
tools/perf/util/trace-event-info.c
tools/perf/util/trace-event-scripting.c
tools/perf/util/trace-event.h
tools/perf/util/ui/browsers/annotate.c
tools/perf/util/ui/browsers/hists.c
tools/perf/util/ui/progress.c
tools/perf/util/usage.c
tools/perf/util/util.h
tools/perf/util/values.c

index 08ff908..24979f6 100644 (file)
@@ -96,6 +96,7 @@
        <listitem><para>debug_object_deactivate</para></listitem>
        <listitem><para>debug_object_destroy</para></listitem>
        <listitem><para>debug_object_free</para></listitem>
+       <listitem><para>debug_object_assert_init</para></listitem>
       </itemizedlist>
       Each of these functions takes the address of the real object and
       a pointer to the object type specific debug description
        debug checks.
       </para>
     </sect1>
+
+    <sect1 id="debug_object_assert_init">
+      <title>debug_object_assert_init</title>
+      <para>
+       This function is called to assert that an object has been
+       initialized.
+      </para>
+      <para>
+       When the real object is not tracked by debugobjects, it calls
+       fixup_assert_init of the object type description structure
+       provided by the caller, with the hardcoded object state
+       ODEBUG_STATE_NOTAVAILABLE. The fixup function can correct the problem
+       by calling debug_object_init and other specific initializing
+       functions.
+      </para>
+      <para>
+       When the real object is already tracked by debugobjects it is
+       ignored.
+      </para>
+    </sect1>
   </chapter>
   <chapter id="fixupfunctions">
     <title>Fixup functions</title>
        statistics.
       </para>
     </sect1>
+    <sect1 id="fixup_assert_init">
+      <title>fixup_assert_init</title>
+      <para>
+       This function is called from the debug code whenever a problem
+       in debug_object_assert_init is detected.
+      </para>
+      <para>
+       Called from debug_object_assert_init() with a hardcoded state
+       ODEBUG_STATE_NOTAVAILABLE when the object is not found in the
+       debug bucket.
+      </para>
+      <para>
+       The function returns 1 when the fixup was successful,
+       otherwise 0. The return value is used to update the
+       statistics.
+      </para>
+      <para>
+       Note, this function should make sure debug_object_init() is
+       called before returning.
+      </para>
+      <para>
+       The handling of statically initialized objects is a special
+       case. The fixup function should check if this is a legitimate
+       case of a statically initialized object or not. In this case only
+       debug_object_init() should be called to make the object known to
+       the tracker. Then the function should return 0 because this is not
+       a real fixup.
+      </para>
+    </sect1>
   </chapter>
   <chapter id="bugs">
     <title>Known Bugs And Assumptions</title>
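
The following is a minimal sketch (not code from this merge) of how a subsystem
might use the two new hooks described above: it supplies a fixup_assert_init
callback in its debug_obj_descr and calls debug_object_assert_init() before
touching an object.  The my_obj type, the descriptor, and the check against a
single known statically initialized instance are illustrative assumptions.

#include <linux/debugobjects.h>
#include <linux/spinlock.h>

struct my_obj {
        spinlock_t lock;
};

/* Hypothetical statically initialized instance. */
static struct my_obj static_obj = {
        .lock = __SPIN_LOCK_UNLOCKED(static_obj.lock),
};

static struct debug_obj_descr my_obj_debug_descr;

/*
 * Called from debug_object_assert_init() with the hardcoded state
 * ODEBUG_STATE_NOTAVAILABLE when the object is not yet tracked.
 * Returns 1 for a successful fixup, 0 otherwise.
 */
static int my_obj_fixup_assert_init(void *addr, enum debug_obj_state state)
{
        if (state == ODEBUG_STATE_NOTAVAILABLE && addr == &static_obj) {
                /* Legitimate static initialization: just register it. */
                debug_object_init(addr, &my_obj_debug_descr);
                return 0;       /* bookkeeping only, not a real fixup */
        }
        return 0;               /* nothing we know how to repair */
}

static struct debug_obj_descr my_obj_debug_descr = {
        .name                   = "my_obj",
        .fixup_assert_init      = my_obj_fixup_assert_init,
};

static void my_obj_use(struct my_obj *obj)
{
        /* Complain (and run the fixup) if obj was never initialized. */
        debug_object_assert_init(obj, &my_obj_debug_descr);
        spin_lock(&obj->lock);
        /* ... */
        spin_unlock(&obj->lock);
}

Returning 0 for the statically initialized case follows the rule above:
registering the object with debug_object_init() is bookkeeping, not a real
fixup.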
index 0c134f8..bff2d8b 100644 (file)
@@ -328,6 +328,12 @@ over a rather long period of time, but improvements are always welcome!
        RCU rather than SRCU, because RCU is almost always faster and
        easier to use than is SRCU.
 
+       If you need to enter your read-side critical section in a
+       hardirq or exception handler, and then exit that same read-side
+       critical section in the task that was interrupted, then you need
+       to use srcu_read_lock_raw() and srcu_read_unlock_raw(), which avoid
+       the lockdep checking that would otherwise make this practice illegal.
+
        Also unlike other forms of RCU, explicit initialization
        and cleanup is required via init_srcu_struct() and
        cleanup_srcu_struct().  These are passed a "struct srcu_struct"
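
As an illustration of the cross-context usage just described, the sketch below
opens an SRCU read-side critical section in a hardirq handler and closes it
from the threaded (task-context) half of the same interrupt, registered with
request_threaded_irq().  It assumes srcu_read_lock_raw()/srcu_read_unlock_raw()
follow the same calling convention as srcu_read_lock()/srcu_read_unlock() minus
the lockdep annotation; my_srcu and the handler names are hypothetical.

#include <linux/interrupt.h>
#include <linux/srcu.h>

static struct srcu_struct my_srcu;      /* init_srcu_struct(&my_srcu) at probe time */
static int my_srcu_idx;                 /* index handed from hardirq to task context */

static irqreturn_t my_irq_handler(int irq, void *dev_id)
{
        /*
         * Begin the read-side critical section in hardirq context.  The
         * _raw variant avoids the lockdep checks that would otherwise
         * object to unlocking from a different context.
         */
        my_srcu_idx = srcu_read_lock_raw(&my_srcu);
        return IRQ_WAKE_THREAD;
}

static irqreturn_t my_irq_thread(int irq, void *dev_id)
{
        /* ... use the SRCU-protected data while possibly sleeping ... */
        srcu_read_unlock_raw(&my_srcu, my_srcu_idx);
        return IRQ_HANDLED;
}

In the exact scenario the checklist describes, the index returned by
srcu_read_lock_raw() would instead be conveyed to the task that was
interrupted, for example through per-task or per-CPU state.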
index 3185270..bf77833 100644 (file)
@@ -38,11 +38,11 @@ o   How can the updater tell when a grace period has completed
 
        Preemptible variants of RCU (CONFIG_TREE_PREEMPT_RCU) get the
        same effect, but require that the readers manipulate CPU-local
-       counters.  These counters allow limited types of blocking
-       within RCU read-side critical sections.  SRCU also uses
-       CPU-local counters, and permits general blocking within
-       RCU read-side critical sections.  These two variants of
-       RCU detect grace periods by sampling these counters.
+       counters.  These counters allow limited types of blocking within
+       RCU read-side critical sections.  SRCU also uses CPU-local
+       counters, and permits general blocking within RCU read-side
+       critical sections.  These variants of RCU detect grace periods
+       by sampling these counters.
 
 o      If I am running on a uniprocessor kernel, which can only do one
        thing at a time, why should I wait for a grace period?
index 4e95920..083d88c 100644 (file)
@@ -101,6 +101,11 @@ o  A CPU-bound real-time task in a CONFIG_PREEMPT_RT kernel that
        CONFIG_TREE_PREEMPT_RCU case, you might see stall-warning
        messages.
 
+o      A hardware or software issue shuts off the scheduler-clock
+       interrupt on a CPU that is not in dyntick-idle mode.  This
+       problem really has happened, and seems to be most likely to
+       result in RCU CPU stall warnings for CONFIG_NO_HZ=n kernels.
+
 o      A bug in the RCU implementation.
 
 o      A hardware failure.  This is quite unlikely, but has occurred
@@ -109,12 +114,11 @@ o A hardware failure.  This is quite unlikely, but has occurred
        This resulted in a series of RCU CPU stall warnings, eventually
        leading the realization that the CPU had failed.
 
-The RCU, RCU-sched, and RCU-bh implementations have CPU stall
-warning.  SRCU does not have its own CPU stall warnings, but its
-calls to synchronize_sched() will result in RCU-sched detecting
-RCU-sched-related CPU stalls.  Please note that RCU only detects
-CPU stalls when there is a grace period in progress.  No grace period,
-no CPU stall warnings.
+The RCU, RCU-sched, and RCU-bh implementations have CPU stall warnings.
+SRCU does not have its own CPU stall warnings, but its calls to
+synchronize_sched() will result in RCU-sched detecting RCU-sched-related
+CPU stalls.  Please note that RCU only detects CPU stalls when there is
+a grace period in progress.  No grace period, no CPU stall warnings.
 
 To diagnose the cause of the stall, inspect the stack traces.
 The offending function will usually be near the top of the stack.
index 783d6c1..d67068d 100644 (file)
@@ -61,11 +61,24 @@ nreaders    This is the number of RCU reading threads supported.
                To properly exercise RCU implementations with preemptible
                read-side critical sections.
 
+onoff_interval
+               The number of seconds between each attempt to execute a
+               randomly selected CPU-hotplug operation.  Defaults to
+               zero, which disables CPU hotplugging.  In HOTPLUG_CPU=n
+               kernels, rcutorture will silently refuse to do any
+               CPU-hotplug operations regardless of what value is
+               specified for onoff_interval.
+
 shuffle_interval
                The number of seconds to keep the test threads affinitied
                to a particular subset of the CPUs, defaults to 3 seconds.
                Used in conjunction with test_no_idle_hz.
 
+shutdown_secs  The number of seconds to run the test before terminating
+               the test and powering off the system.  The default is
+               zero, which disables test termination and system shutdown.
+               This capability is useful for automated testing.
+
 stat_interval  The number of seconds between output of torture
                statistics (via printk()).  Regardless of the interval,
                statistics are printed when the module is unloaded.
index aaf65f6..49587ab 100644 (file)
@@ -105,14 +105,10 @@ o "dt" is the current value of the dyntick counter that is incremented
        or one greater than the interrupt-nesting depth otherwise.
        The number after the second "/" is the NMI nesting depth.
 
-       This field is displayed only for CONFIG_NO_HZ kernels.
-
 o      "df" is the number of times that some other CPU has forced a
        quiescent state on behalf of this CPU due to this CPU being in
        dynticks-idle state.
 
-       This field is displayed only for CONFIG_NO_HZ kernels.
-
 o      "of" is the number of times that some other CPU has forced a
        quiescent state on behalf of this CPU due to this CPU being
        offline.  In a perfect world, this might never happen, but it
index 6ef6926..6bbe8dc 100644 (file)
@@ -4,6 +4,7 @@ to start learning about RCU:
 1.     What is RCU, Fundamentally?  http://lwn.net/Articles/262464/
 2.     What is RCU? Part 2: Usage   http://lwn.net/Articles/263130/
 3.     RCU part 3: the RCU API      http://lwn.net/Articles/264090/
+4.     The RCU API, 2010 Edition    http://lwn.net/Articles/418853/
 
 
 What is RCU?
@@ -834,6 +835,8 @@ SRCU:       Critical sections       Grace period            Barrier
 
        srcu_read_lock          synchronize_srcu        N/A
        srcu_read_unlock        synchronize_srcu_expedited
+       srcu_read_lock_raw
+       srcu_read_unlock_raw
        srcu_dereference
 
 SRCU:  Initialization/cleanup
@@ -855,27 +858,33 @@ list can be helpful:
 
 a.     Will readers need to block?  If so, you need SRCU.
 
-b.     What about the -rt patchset?  If readers would need to block
+b.     Is it necessary to start a read-side critical section in a
+       hardirq handler or exception handler, and then to complete
+       this read-side critical section in the task that was
+       interrupted?  If so, you need SRCU's srcu_read_lock_raw() and
+       srcu_read_unlock_raw() primitives.
+
+c.     What about the -rt patchset?  If readers would need to block
        in an non-rt kernel, you need SRCU.  If readers would block
        in a -rt kernel, but not in a non-rt kernel, SRCU is not
        necessary.
 
-c.     Do you need to treat NMI handlers, hardirq handlers,
+d.     Do you need to treat NMI handlers, hardirq handlers,
        and code segments with preemption disabled (whether
        via preempt_disable(), local_irq_save(), local_bh_disable(),
        or some other mechanism) as if they were explicit RCU readers?
        If so, you need RCU-sched.
 
-d.     Do you need RCU grace periods to complete even in the face
+e.     Do you need RCU grace periods to complete even in the face
        of softirq monopolization of one or more of the CPUs?  For
        example, is your code subject to network-based denial-of-service
        attacks?  If so, you need RCU-bh.
 
-e.     Is your workload too update-intensive for normal use of
+f.     Is your workload too update-intensive for normal use of
        RCU, but inappropriate for other synchronization mechanisms?
        If so, consider SLAB_DESTROY_BY_RCU.  But please be careful!
 
-f.     Otherwise, use RCU.
+g.     Otherwise, use RCU.
 
 Of course, this all assumes that you have determined that RCU is in fact
 the right tool for your job.
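
For contrast with the raw primitives in item (b), here is a minimal sketch of
the plain SRCU pattern from the API table above: a reader that may block inside
its critical section, and an updater that publishes a new version and waits for
a grace period before freeing the old one.  The my_config/cur_config names, and
the assumption that updates are serialized by the caller, are illustrative.

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/srcu.h>

struct my_config {
        int value;
};

static struct srcu_struct my_srcu;              /* init_srcu_struct(&my_srcu) at init */
static struct my_config __rcu *cur_config;

/* Reader: may block inside the SRCU read-side critical section. */
static int read_config_value(void)
{
        struct my_config *c;
        int idx, val = -1;

        idx = srcu_read_lock(&my_srcu);
        c = srcu_dereference(cur_config, &my_srcu);
        if (c)
                val = c->value;
        srcu_read_unlock(&my_srcu, idx);
        return val;
}

/* Updater: publish a new version, then free the old one after a grace period. */
static int update_config(int new_value)
{
        struct my_config *newc, *oldc;

        newc = kmalloc(sizeof(*newc), GFP_KERNEL);
        if (!newc)
                return -ENOMEM;
        newc->value = new_value;

        oldc = rcu_dereference_protected(cur_config, 1); /* updates serialized by caller */
        rcu_assign_pointer(cur_config, newc);
        synchronize_srcu(&my_srcu);     /* wait for pre-existing readers */
        kfree(oldc);
        return 0;
}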
index 3bd585b..27f2b21 100644 (file)
@@ -84,6 +84,93 @@ compiler optimizes the section accessing atomic_t variables.
 
 *** YOU HAVE BEEN WARNED! ***
 
+Properly aligned pointers, longs, ints, and chars (and unsigned
+equivalents) may be atomically loaded from and stored to in the same
+sense as described for atomic_read() and atomic_set().  The ACCESS_ONCE()
+macro should be used to prevent the compiler from using optimizations
+that might otherwise optimize accesses out of existence on the one hand,
+or that might create unsolicited accesses on the other.
+
+For example consider the following code:
+
+       while (a > 0)
+               do_something();
+
+If the compiler can prove that do_something() does not store to the
+variable a, then the compiler is within its rights to transform this to
+the following:
+
+       tmp = a;
+       if (tmp > 0)
+               for (;;)
+                       do_something();
+
+If you don't want the compiler to do this (and you probably don't), then
+you should use something like the following:
+
+       while (ACCESS_ONCE(a) > 0)
+               do_something();
+
+Alternatively, you could place a barrier() call in the loop.
+
+For another example, consider the following code:
+
+       tmp_a = a;
+       do_something_with(tmp_a);
+       do_something_else_with(tmp_a);
+
+If the compiler can prove that do_something_with() does not store to the
+variable a, then the compiler is within its rights to manufacture an
+additional load as follows:
+
+       tmp_a = a;
+       do_something_with(tmp_a);
+       tmp_a = a;
+       do_something_else_with(tmp_a);
+
+This could fatally confuse your code if it expected the same value
+to be passed to do_something_with() and do_something_else_with().
+
+The compiler would be likely to manufacture this additional load if
+do_something_with() was an inline function that made very heavy use
+of registers: reloading from variable a could save a flush to the
+stack and later reload.  To prevent the compiler from attacking your
+code in this manner, write the following:
+
+       tmp_a = ACCESS_ONCE(a);
+       do_something_with(tmp_a);
+       do_something_else_with(tmp_a);
+
+For a final example, consider the following code, assuming that the
+variable a is set at boot time before the second CPU is brought online
+and never changed later, so that memory barriers are not needed:
+
+       if (a)
+               b = 9;
+       else
+               b = 42;
+
+The compiler is within its rights to manufacture an additional store
+by transforming the above code into the following:
+
+       b = 42;
+       if (a)
+               b = 9;
+
+This could come as a fatal surprise to other code running concurrently
+that expected b to never have the value 42 if a was zero.  To prevent
+the compiler from doing this, write something like:
+
+       if (a)
+               ACCESS_ONCE(b) = 9;
+       else
+               ACCESS_ONCE(b) = 42;
+
+Don't even -think- about doing this without proper use of memory barriers,
+locks, or atomic operations if variable a can change at runtime!
+
+*** WARNING: ACCESS_ONCE() DOES NOT IMPLY A BARRIER! ***
+
 Now, we move onto the atomic operation interfaces typically implemented with
 the help of assembly code.
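
To connect the ACCESS_ONCE() examples above to the macro itself: at the time of
this merge it was essentially a volatile cast in include/linux/compiler.h,
which is what obliges the compiler to emit exactly one load or store per
lexical use.  The snippet below restates the loop and conditional-store
examples with that definition; do_something() is a stand-in for arbitrary work.

/* Essentially the kernel's definition (include/linux/compiler.h). */
#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))

static int a, b;

extern void do_something(void);         /* stand-in for real work */

static void reader(void)
{
        /* One real load of 'a' per iteration; the compiler may not cache
         * it in a register and spin forever on a stale value. */
        while (ACCESS_ONCE(a) > 0)
                do_something();
}

static void writer(void)
{
        /* Exactly one store to 'b'; the compiler may not first write 42
         * and then conditionally overwrite it with 9. */
        if (a)
                ACCESS_ONCE(b) = 9;
        else
                ACCESS_ONCE(b) = 42;
}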
 
index 81c287f..e229769 100644 (file)
@@ -1885,6 +1885,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
                        arch_perfmon: [X86] Force use of architectural
                                perfmon on Intel CPUs instead of the
                                CPU specific event set.
+                       timer: [X86] Force use of architectural NMI
+                               timer mode (see also oprofile.timer
+                               for generic hr timer mode)
+                               [s390] Force legacy basic mode sampling
+                                (report cpu_type "timer")
 
        oops=panic      Always panic on oopses. Default is to just kill the
                        process, but there is a small probability of
@@ -2750,11 +2755,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
                        functions are at fixed addresses, they make nice
                        targets for exploits that can control RIP.
 
-                       emulate     Vsyscalls turn into traps and are emulated
-                                   reasonably safely.
+                       emulate     [default] Vsyscalls turn into traps and are
+                                   emulated reasonably safely.
 
-                       native      [default] Vsyscalls are native syscall
-                                   instructions.
+                       native      Vsyscalls are native syscall instructions.
                                    This is a little bit faster than trapping
                                    and makes a few dynamic recompilers work
                                    better than they would in emulation mode.
index abf768c..5dbc99c 100644 (file)
@@ -221,3 +221,66 @@ when the chain is validated for the first time, is then put into a hash
 table, which hash-table can be checked in a lockfree manner. If the
 locking chain occurs again later on, the hash table tells us that we
 dont have to validate the chain again.
+
+Troubleshooting:
+----------------
+
+The validator tracks a maximum of MAX_LOCKDEP_KEYS number of lock classes.
+Exceeding this number will trigger the following lockdep warning:
+
+       (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
+
+By default, MAX_LOCKDEP_KEYS is currently set to 8191, and typical
+desktop systems have less than 1,000 lock classes, so this warning
+normally results from lock-class leakage or failure to properly
+initialize locks.  These two problems are illustrated below:
+
+1.     Repeated module loading and unloading while running the validator
+       will result in lock-class leakage.  The issue here is that each
+       load of the module will create a new set of lock classes for
+       that module's locks, but module unloading does not remove old
+       classes (see below discussion of reuse of lock classes for why).
+       Therefore, if that module is loaded and unloaded repeatedly,
+       the number of lock classes will eventually reach the maximum.
+
+2.     Using structures such as arrays that have large numbers of
+       locks that are not explicitly initialized.  For example,
+       a hash table with 8192 buckets where each bucket has its own
+       spinlock_t will consume 8192 lock classes -unless- each spinlock
+       is explicitly initialized at runtime, for example, using the
+       run-time spin_lock_init() as opposed to compile-time initializers
+       such as __SPIN_LOCK_UNLOCKED().  Failure to properly initialize
+       the per-bucket spinlocks would guarantee lock-class overflow.
+       In contrast, a loop that called spin_lock_init() on each lock
+       would place all 8192 locks into a single lock class.
+
+       The moral of this story is that you should always explicitly
+       initialize your locks.
+
+One might argue that the validator should be modified to allow
+lock classes to be reused.  However, if you are tempted to make this
+argument, first review the code and think through the changes that would
+be required, keeping in mind that the lock classes to be removed are
+likely to be linked into the lock-dependency graph.  This turns out to
+be harder to do than to say.
+
+Of course, if you do run out of lock classes, the next thing to do is
+to find the offending lock classes.  First, the following command gives
+you the number of lock classes currently in use along with the maximum:
+
+       grep "lock-classes" /proc/lockdep_stats
+
+This command produces the following output on a modest system:
+
+        lock-classes:                          748 [max: 8191]
+
+If the number allocated (748 above) increases continually over time,
+then there is likely a leak.  The following command can be used to
+identify the leaking lock classes:
+
+       grep "BD" /proc/lockdep
+
+Run the command and save the output, then compare against the output from
+a later run of this command to identify the leakers.  This same output
+can also help you find situations where runtime lock initialization has
+been omitted.
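
As a concrete version of case 2 above, the sketch below shows the recommended
runtime initialization of a large lock array: one spin_lock_init() call site
gives all 8192 bucket locks a single lock class, whereas a per-bucket
compile-time initializer would burn one class per bucket.  The structure and
names are illustrative, not code from this merge.

#include <linux/init.h>
#include <linux/list.h>
#include <linux/spinlock.h>

#define MY_HASH_BUCKETS 8192    /* the size used in the example above */

struct my_bucket {
        spinlock_t lock;
        struct hlist_head chain;
};

static struct my_bucket my_hash[MY_HASH_BUCKETS];

static int __init my_hash_init(void)
{
        int i;

        /*
         * Runtime initialization: every bucket lock is keyed to this one
         * spin_lock_init() call site, so the whole table consumes a
         * single lockdep class rather than 8192 of them.
         */
        for (i = 0; i < MY_HASH_BUCKETS; i++) {
                spin_lock_init(&my_hash[i].lock);
                INIT_HLIST_HEAD(&my_hash[i].chain);
        }
        return 0;
}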
index b510564..bb24c2a 100644 (file)
@@ -191,8 +191,6 @@ And for string fields they are:
 
 Currently, only exact string matches are supported.
 
-Currently, the maximum number of predicates in a filter is 16.
-
 5.2 Setting filters
 -------------------
 
index 4b0669c..2505740 100644 (file)
@@ -30,6 +30,10 @@ config OPROFILE_EVENT_MULTIPLEX
 config HAVE_OPROFILE
        bool
 
+config OPROFILE_NMI_TIMER
+       def_bool y
+       depends on PERF_EVENTS && HAVE_PERF_EVENTS_NMI
+
 config KPROBES
        bool "Kprobes"
        depends on MODULES
index 3d0c6fb..e8e8fe5 100644 (file)
@@ -183,7 +183,8 @@ void cpu_idle(void)
 
        /* endless idle loop with no priority at all */
        while (1) {
-               tick_nohz_stop_sched_tick(1);
+               tick_nohz_idle_enter();
+               rcu_idle_enter();
                leds_event(led_idle_start);
                while (!need_resched()) {
 #ifdef CONFIG_HOTPLUG_CPU
@@ -213,7 +214,8 @@ void cpu_idle(void)
                        }
                }
                leds_event(led_idle_end);
-               tick_nohz_restart_sched_tick();
+               rcu_idle_exit();
+               tick_nohz_idle_exit();
                preempt_enable_no_resched();
                schedule();
                preempt_disable();
index 8fc2c8f..c0b59bf 100644 (file)
@@ -52,6 +52,7 @@
 #include <asm/mach/time.h>
 #include <asm/traps.h>
 #include <asm/unwind.h>
+#include <asm/memblock.h>
 
 #if defined(CONFIG_DEPRECATED_PARAM_STRUCT)
 #include "compat.h"
index fbdd12e..7c38474 100644 (file)
@@ -32,6 +32,7 @@
 
 #include <asm/mach/arch.h>
 #include <asm/mach/map.h>
+#include <asm/memblock.h>
 
 #include "mm.h"
 
@@ -332,7 +333,6 @@ void __init arm_memblock_init(struct meminfo *mi, struct machine_desc *mdesc)
 
        sort(&meminfo.bank, meminfo.nr_banks, sizeof(meminfo.bank[0]), meminfo_cmp, NULL);
 
-       memblock_init();
        for (i = 0; i < mi->nr_banks; i++)
                memblock_add(mi->bank[i].start, mi->bank[i].size);
 
@@ -371,7 +371,7 @@ void __init arm_memblock_init(struct meminfo *mi, struct machine_desc *mdesc)
        if (mdesc->reserve)
                mdesc->reserve();
 
-       memblock_analyze();
+       memblock_allow_resize();
        memblock_dump_all();
 }
 
index ef5a2a0..ea33957 100644 (file)
@@ -34,10 +34,12 @@ void cpu_idle(void)
 {
        /* endless idle loop with no priority at all */
        while (1) {
-               tick_nohz_stop_sched_tick(1);
+               tick_nohz_idle_enter();
+               rcu_idle_enter();
                while (!need_resched())
                        cpu_idle_sleep();
-               tick_nohz_restart_sched_tick();
+               rcu_idle_exit();
+               tick_nohz_idle_exit();
                preempt_enable_no_resched();
                schedule();
                preempt_disable();
index 6a80a9e..8dd0416 100644 (file)
@@ -88,10 +88,12 @@ void cpu_idle(void)
 #endif
                if (!idle)
                        idle = default_idle;
-               tick_nohz_stop_sched_tick(1);
+               tick_nohz_idle_enter();
+               rcu_idle_enter();
                while (!need_resched())
                        idle();
-               tick_nohz_restart_sched_tick();
+               rcu_idle_exit();
+               tick_nohz_idle_exit();
                preempt_enable_no_resched();
                schedule();
                preempt_disable();
index bb978ed..6773fc8 100644 (file)
@@ -47,14 +47,12 @@ static struct clocksource cont_rotime = {
        .rating = 300,
        .read   = read_cont_rotime,
        .mask   = CLOCKSOURCE_MASK(32),
-       .shift  = 10,
        .flags  = CLOCK_SOURCE_IS_CONTINUOUS,
 };
 
 static int __init etrax_init_cont_rotime(void)
 {
-       cont_rotime.mult = clocksource_khz2mult(100000, cont_rotime.shift);
-       clocksource_register(&cont_rotime);
+       clocksource_register_khz(&cont_rotime, 100000);
        return 0;
 }
 arch_initcall(etrax_init_cont_rotime);
index 27489b6..3b7a7c4 100644 (file)
@@ -23,6 +23,9 @@ config IA64
        select HAVE_ARCH_TRACEHOOK
        select HAVE_DMA_API_DEBUG
        select HAVE_GENERIC_HARDIRQS
+       select HAVE_MEMBLOCK
+       select HAVE_MEMBLOCK_NODE_MAP
+       select ARCH_DISCARD_MEMBLOCK
        select GENERIC_IRQ_PROBE
        select GENERIC_PENDING_IRQ if SMP
        select IRQ_PER_CPU
@@ -474,9 +477,6 @@ config NODES_SHIFT
          MAX_NUMNODES will be 2^(This value).
          If in doubt, use the default.
 
-config ARCH_POPULATES_NODE_MAP
-       def_bool y
-
 # VIRTUAL_MEM_MAP and FLAT_NODE_MEM_MAP are functionally equivalent.
 # VIRTUAL_MEM_MAP has been retained for historical reasons.
 config VIRTUAL_MEM_MAP
index 5a274af..3deac95 100644 (file)
 #include <linux/jiffies.h>
 #include <asm/processor.h>
 
-typedef u64 cputime_t;
-typedef u64 cputime64_t;
+typedef u64 __nocast cputime_t;
+typedef u64 __nocast cputime64_t;
 
-#define cputime_zero                   ((cputime_t)0)
 #define cputime_one_jiffy              jiffies_to_cputime(1)
-#define cputime_max                    ((~((cputime_t)0) >> 1) - 1)
-#define cputime_add(__a, __b)          ((__a) +  (__b))
-#define cputime_sub(__a, __b)          ((__a) -  (__b))
-#define cputime_div(__a, __n)          ((__a) /  (__n))
-#define cputime_halve(__a)             ((__a) >> 1)
-#define cputime_eq(__a, __b)           ((__a) == (__b))
-#define cputime_gt(__a, __b)           ((__a) >  (__b))
-#define cputime_ge(__a, __b)           ((__a) >= (__b))
-#define cputime_lt(__a, __b)           ((__a) <  (__b))
-#define cputime_le(__a, __b)           ((__a) <= (__b))
-
-#define cputime64_zero                 ((cputime64_t)0)
-#define cputime64_add(__a, __b)                ((__a) + (__b))
-#define cputime64_sub(__a, __b)                ((__a) - (__b))
-#define cputime_to_cputime64(__ct)     (__ct)
 
 /*
  * Convert cputime <-> jiffies (HZ)
  */
-#define cputime_to_jiffies(__ct)       ((__ct) / (NSEC_PER_SEC / HZ))
-#define jiffies_to_cputime(__jif)      ((__jif) * (NSEC_PER_SEC / HZ))
-#define cputime64_to_jiffies64(__ct)   ((__ct) / (NSEC_PER_SEC / HZ))
-#define jiffies64_to_cputime64(__jif)  ((__jif) * (NSEC_PER_SEC / HZ))
+#define cputime_to_jiffies(__ct)       \
+       ((__force u64)(__ct) / (NSEC_PER_SEC / HZ))
+#define jiffies_to_cputime(__jif)      \
+       (__force cputime_t)((__jif) * (NSEC_PER_SEC / HZ))
+#define cputime64_to_jiffies64(__ct)   \
+       ((__force u64)(__ct) / (NSEC_PER_SEC / HZ))
+#define jiffies64_to_cputime64(__jif)  \
+       (__force cputime64_t)((__jif) * (NSEC_PER_SEC / HZ))
 
 /*
  * Convert cputime <-> microseconds
  */
-#define cputime_to_usecs(__ct)         ((__ct) / NSEC_PER_USEC)
-#define usecs_to_cputime(__usecs)      ((__usecs) * NSEC_PER_USEC)
-#define usecs_to_cputime64(__usecs)    usecs_to_cputime(__usecs)
+#define cputime_to_usecs(__ct)         \
+       ((__force u64)(__ct) / NSEC_PER_USEC)
+#define usecs_to_cputime(__usecs)      \
+       (__force cputime_t)((__usecs) * NSEC_PER_USEC)
+#define usecs_to_cputime64(__usecs)    \
+       (__force cputime64_t)((__usecs) * NSEC_PER_USEC)
 
 /*
  * Convert cputime <-> seconds
  */
-#define cputime_to_secs(__ct)          ((__ct) / NSEC_PER_SEC)
-#define secs_to_cputime(__secs)                ((__secs) * NSEC_PER_SEC)
+#define cputime_to_secs(__ct)          \
+       ((__force u64)(__ct) / NSEC_PER_SEC)
+#define secs_to_cputime(__secs)                \
+       (__force cputime_t)((__secs) * NSEC_PER_SEC)
 
 /*
  * Convert cputime <-> timespec (nsec)
  */
 static inline cputime_t timespec_to_cputime(const struct timespec *val)
 {
-       cputime_t ret = val->tv_sec * NSEC_PER_SEC;
-       return (ret + val->tv_nsec);
+       u64 ret = val->tv_sec * NSEC_PER_SEC + val->tv_nsec;
+       return (__force cputime_t) ret;
 }
 static inline void cputime_to_timespec(const cputime_t ct, struct timespec *val)
 {
-       val->tv_sec  = ct / NSEC_PER_SEC;
-       val->tv_nsec = ct % NSEC_PER_SEC;
+       val->tv_sec  = (__force u64) ct / NSEC_PER_SEC;
+       val->tv_nsec = (__force u64) ct % NSEC_PER_SEC;
 }
 
 /*
@@ -87,25 +80,28 @@ static inline void cputime_to_timespec(const cputime_t ct, struct timespec *val)
  */
 static inline cputime_t timeval_to_cputime(struct timeval *val)
 {
-       cputime_t ret = val->tv_sec * NSEC_PER_SEC;
-       return (ret + val->tv_usec * NSEC_PER_USEC);
+       u64 ret = val->tv_sec * NSEC_PER_SEC + val->tv_usec * NSEC_PER_USEC;
+       return (__force cputime_t) ret;
 }
 static inline void cputime_to_timeval(const cputime_t ct, struct timeval *val)
 {
-       val->tv_sec = ct / NSEC_PER_SEC;
-       val->tv_usec = (ct % NSEC_PER_SEC) / NSEC_PER_USEC;
+       val->tv_sec = (__force u64) ct / NSEC_PER_SEC;
+       val->tv_usec = ((__force u64) ct % NSEC_PER_SEC) / NSEC_PER_USEC;
 }
 
 /*
  * Convert cputime <-> clock (USER_HZ)
  */
-#define cputime_to_clock_t(__ct)       ((__ct) / (NSEC_PER_SEC / USER_HZ))
-#define clock_t_to_cputime(__x)                ((__x) * (NSEC_PER_SEC / USER_HZ))
+#define cputime_to_clock_t(__ct)       \
+       ((__force u64)(__ct) / (NSEC_PER_SEC / USER_HZ))
+#define clock_t_to_cputime(__x)                \
+       (__force cputime_t)((__x) * (NSEC_PER_SEC / USER_HZ))
 
 /*
  * Convert cputime64 to clock.
  */
-#define cputime64_to_clock_t(__ct)      cputime_to_clock_t((cputime_t)__ct)
+#define cputime64_to_clock_t(__ct)     \
+       cputime_to_clock_t((__force cputime_t)__ct)
 
 #endif /* CONFIG_VIRT_CPU_ACCOUNTING */
 #endif /* __IA64_CPUTIME_H */
index f114a3b..1516d1d 100644 (file)
@@ -16,6 +16,7 @@
  */
 #include <linux/bootmem.h>
 #include <linux/efi.h>
+#include <linux/memblock.h>
 #include <linux/mm.h>
 #include <linux/nmi.h>
 #include <linux/swap.h>
@@ -348,7 +349,7 @@ paging_init (void)
                printk("Virtual mem_map starts at 0x%p\n", mem_map);
        }
 #else /* !CONFIG_VIRTUAL_MEM_MAP */
-       add_active_range(0, 0, max_low_pfn);
+       memblock_add_node(0, PFN_PHYS(max_low_pfn), 0);
        free_area_init_nodes(max_zone_pfns);
 #endif /* !CONFIG_VIRTUAL_MEM_MAP */
        zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page));
index 00cb0e2..13df239 100644 (file)
@@ -10,6 +10,7 @@
 #include <linux/bootmem.h>
 #include <linux/efi.h>
 #include <linux/elf.h>
+#include <linux/memblock.h>
 #include <linux/mm.h>
 #include <linux/mmzone.h>
 #include <linux/module.h>
@@ -557,8 +558,7 @@ int __init register_active_ranges(u64 start, u64 len, int nid)
 #endif
 
        if (start < end)
-               add_active_range(nid, __pa(start) >> PAGE_SHIFT,
-                       __pa(end) >> PAGE_SHIFT);
+               memblock_add_node(__pa(start), end - start, nid);
        return 0;
 }
 
index 309f725..f267886 100644 (file)
@@ -93,7 +93,6 @@ static struct clocksource m68328_clk = {
        .name   = "timer",
        .rating = 250,
        .read   = m68328_read_clk,
-       .shift  = 20,
        .mask   = CLOCKSOURCE_MASK(32),
        .flags  = CLOCK_SOURCE_IS_CONTINUOUS,
 };
@@ -115,8 +114,7 @@ void hw_timer_init(void)
 
        /* Enable timer 1 */
        TCTL |= TCTL_TEN;
-       m68328_clk.mult = clocksource_hz2mult(TICKS_PER_JIFFY*HZ, m68328_clk.shift);
-       clocksource_register(&m68328_clk);
+       clocksource_register_hz(&m68328_clk, TICKS_PER_JIFFY*HZ);
 }
 
 /***************************************************************************/
index a5f5628..235ad57 100644 (file)
@@ -44,7 +44,6 @@ static struct clocksource clocksource_cf_dt = {
        .rating         = 200,
        .read           = cf_dt_get_cycles,
        .mask           = CLOCKSOURCE_MASK(32),
-       .shift          = 20,
        .flags          = CLOCK_SOURCE_IS_CONTINUOUS,
 };
 
@@ -60,9 +59,7 @@ static int __init  init_cf_dt_clocksource(void)
        __raw_writeb(0x00, DTER0);
        __raw_writel(0x00000000, DTRR0);
        __raw_writew(DMA_DTMR_CLK_DIV_16 | DMA_DTMR_ENABLE, DTMR0);
-       clocksource_cf_dt.mult = clocksource_hz2mult(DMA_FREQ,
-                                                    clocksource_cf_dt.shift);
-       return clocksource_register(&clocksource_cf_dt);
+       return clocksource_register_hz(&clocksource_cf_dt, DMA_FREQ);
 }
 
 arch_initcall(init_cf_dt_clocksource);
index c2b9809..02663d2 100644 (file)
@@ -144,7 +144,6 @@ static struct clocksource pit_clk = {
        .name   = "pit",
        .rating = 100,
        .read   = pit_read_clk,
-       .shift  = 20,
        .mask   = CLOCKSOURCE_MASK(32),
 };
 
@@ -162,8 +161,7 @@ void hw_timer_init(void)
 
        setup_irq(MCFINT_VECBASE + MCFINT_PIT1, &pit_irq);
 
-       pit_clk.mult = clocksource_hz2mult(FREQ, pit_clk.shift);
-       clocksource_register(&pit_clk);
+       clocksource_register_hz(&pit_clk, FREQ);
 }
 
 /***************************************************************************/
index 6a85daf..b7f822b 100644 (file)
@@ -114,7 +114,6 @@ static struct clocksource mcfslt_clk = {
        .name   = "slt",
        .rating = 250,
        .read   = mcfslt_read_clk,
-       .shift  = 20,
        .mask   = CLOCKSOURCE_MASK(32),
        .flags  = CLOCK_SOURCE_IS_CONTINUOUS,
 };
@@ -136,8 +135,7 @@ void hw_timer_init(void)
 
        setup_irq(MCF_IRQ_TIMER, &mcfslt_timer_irq);
 
-       mcfslt_clk.mult = clocksource_hz2mult(MCF_BUSCLK, mcfslt_clk.shift);
-       clocksource_register(&mcfslt_clk);
+       clocksource_register_hz(&mcfslt_clk, MCF_BUSCLK);
 
 #ifdef CONFIG_HIGHPROFILE
        mcfslt_profile_init();
index 60242f6..0d90da3 100644 (file)
@@ -88,7 +88,6 @@ static struct clocksource mcftmr_clk = {
        .name   = "tmr",
        .rating = 250,
        .read   = mcftmr_read_clk,
-       .shift  = 20,
        .mask   = CLOCKSOURCE_MASK(32),
        .flags  = CLOCK_SOURCE_IS_CONTINUOUS,
 };
@@ -109,8 +108,7 @@ void hw_timer_init(void)
        __raw_writew(MCFTIMER_TMR_ENORI | MCFTIMER_TMR_CLK16 |
                MCFTIMER_TMR_RESTART | MCFTIMER_TMR_ENABLE, TA(MCFTIMER_TMR));
 
-       mcftmr_clk.mult = clocksource_hz2mult(FREQ, mcftmr_clk.shift);
-       clocksource_register(&mcftmr_clk);
+       clocksource_register_hz(&mcftmr_clk, FREQ);
 
        setup_irq(MCF_IRQ_TIMER, &mcftmr_timer_irq);
 
diff --git a/arch/microblaze/include/asm/memblock.h b/arch/microblaze/include/asm/memblock.h
deleted file mode 100644 (file)
index 20a8e25..0000000
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * Copyright (C) 2008 Michal Simek <monstr@monstr.eu>
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License. See the file "COPYING" in the main directory of this archive
- * for more details.
- */
-
-#ifndef _ASM_MICROBLAZE_MEMBLOCK_H
-#define _ASM_MICROBLAZE_MEMBLOCK_H
-
-#endif /* _ASM_MICROBLAZE_MEMBLOCK_H */
-
-
index 95cc295..7dcb5bf 100644 (file)
@@ -103,10 +103,12 @@ void cpu_idle(void)
                if (!idle)
                        idle = default_idle;
 
-               tick_nohz_stop_sched_tick(1);
+               tick_nohz_idle_enter();
+               rcu_idle_enter();
                while (!need_resched())
                        idle();
-               tick_nohz_restart_sched_tick();
+               rcu_idle_exit();
+               tick_nohz_idle_exit();
 
                preempt_enable_no_resched();
                schedule();
index 977484a..80d314e 100644 (file)
@@ -122,7 +122,6 @@ void __init early_init_devtree(void *params)
        of_scan_flat_dt(early_init_dt_scan_chosen, cmd_line);
 
        /* Scan memory nodes and rebuild MEMBLOCKs */
-       memblock_init();
        of_scan_flat_dt(early_init_dt_scan_root, NULL);
        of_scan_flat_dt(early_init_dt_scan_memory, NULL);
 
@@ -130,7 +129,7 @@ void __init early_init_devtree(void *params)
        strlcpy(boot_command_line, cmd_line, COMMAND_LINE_SIZE);
        parse_early_param();
 
-       memblock_analyze();
+       memblock_allow_resize();
 
        pr_debug("Phys. mem: %lx\n", (unsigned long) memblock_phys_mem_size());
 
index d46f1da..9c652eb 100644 (file)
@@ -25,6 +25,9 @@ config MIPS
        select GENERIC_IRQ_SHOW
        select HAVE_ARCH_JUMP_LABEL
        select IRQ_FORCED_THREADING
+       select HAVE_MEMBLOCK
+       select HAVE_MEMBLOCK_NODE_MAP
+       select ARCH_DISCARD_MEMBLOCK
 
 menu "Machine selection"
 
@@ -2064,9 +2067,6 @@ config ARCH_DISCONTIGMEM_ENABLE
          or have huge holes in the physical address space for other reasons.
          See <file:Documentation/vm/numa> for more.
 
-config ARCH_POPULATES_NODE_MAP
-       def_bool y
-
 config ARCH_SPARSEMEM_ENABLE
        bool
        select SPARSEMEM_STATIC
index c47f96e..7955409 100644 (file)
@@ -56,7 +56,8 @@ void __noreturn cpu_idle(void)
 
        /* endless idle loop with no priority at all */
        while (1) {
-               tick_nohz_stop_sched_tick(1);
+               tick_nohz_idle_enter();
+               rcu_idle_enter();
                while (!need_resched() && cpu_online(cpu)) {
 #ifdef CONFIG_MIPS_MT_SMTC
                        extern void smtc_idle_loop_hook(void);
@@ -77,7 +78,8 @@ void __noreturn cpu_idle(void)
                     system_state == SYSTEM_BOOTING))
                        play_dead();
 #endif
-               tick_nohz_restart_sched_tick();
+               rcu_idle_exit();
+               tick_nohz_idle_exit();
                preempt_enable_no_resched();
                schedule();
                preempt_disable();
index 84af26a..b1cb8f8 100644 (file)
@@ -14,6 +14,7 @@
 #include <linux/ioport.h>
 #include <linux/export.h>
 #include <linux/screen_info.h>
+#include <linux/memblock.h>
 #include <linux/bootmem.h>
 #include <linux/initrd.h>
 #include <linux/root_dev.h>
@@ -352,7 +353,7 @@ static void __init bootmem_init(void)
                        continue;
 #endif
 
-               add_active_range(0, start, end);
+               memblock_add_node(PFN_PHYS(start), PFN_PHYS(end - start), 0);
        }
 
        /*
index bc12971..b105eca 100644 (file)
@@ -12,6 +12,7 @@
  */
 #include <linux/init.h>
 #include <linux/kernel.h>
+#include <linux/memblock.h>
 #include <linux/mm.h>
 #include <linux/mmzone.h>
 #include <linux/module.h>
@@ -381,8 +382,8 @@ static void __init szmem(void)
                                continue;
                        }
                        num_physpages += slot_psize;
-                       add_active_range(node, slot_getbasepfn(node, slot),
-                                        slot_getbasepfn(node, slot) + slot_psize);
+                       memblock_add_node(PFN_PHYS(slot_getbasepfn(node, slot)),
+                                         PFN_PHYS(slot_psize), node);
                }
        }
 }
diff --git a/arch/openrisc/include/asm/memblock.h b/arch/openrisc/include/asm/memblock.h
deleted file mode 100644 (file)
index bbe5a1c..0000000
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * OpenRISC Linux
- *
- * Linux architectural port borrowing liberally from similar works of
- * others.  All original copyrights apply as per the original source
- * declaration.
- *
- * OpenRISC implementation:
- * Copyright (C) 2003 Matjaz Breskvar <phoenix@bsemi.com>
- * Copyright (C) 2010-2011 Jonas Bonn <jonas@southpole.se>
- * et al.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#ifndef __ASM_OPENRISC_MEMBLOCK_H
-#define __ASM_OPENRISC_MEMBLOCK_H
-
-/* empty */
-
-#endif /* __ASM_OPENRISC_MEMBLOCK_H */
index d5bc5f8..e5fc788 100644 (file)
@@ -51,7 +51,8 @@ void cpu_idle(void)
 
        /* endless idle loop with no priority at all */
        while (1) {
-               tick_nohz_stop_sched_tick(1);
+               tick_nohz_idle_enter();
+               rcu_idle_enter();
 
                while (!need_resched()) {
                        check_pgt_cache();
@@ -69,7 +70,8 @@ void cpu_idle(void)
                        set_thread_flag(TIF_POLLING_NRFLAG);
                }
 
-               tick_nohz_restart_sched_tick();
+               rcu_idle_exit();
+               tick_nohz_idle_exit();
                preempt_enable_no_resched();
                schedule();
                preempt_disable();
index 1bb58ba..3d4478f 100644 (file)
@@ -76,14 +76,13 @@ void __init early_init_devtree(void *params)
        of_scan_flat_dt(early_init_dt_scan_chosen, cmd_line);
 
        /* Scan memory nodes and rebuild MEMBLOCKs */
-       memblock_init();
        of_scan_flat_dt(early_init_dt_scan_root, NULL);
        of_scan_flat_dt(early_init_dt_scan_memory, NULL);
 
        /* Save command line for /proc/cmdline and then parse parameters */
        strlcpy(boot_command_line, cmd_line, COMMAND_LINE_SIZE);
 
-       memblock_analyze();
+       memblock_allow_resize();
 
        /* We must copy the flattend device tree from init memory to regular
         * memory because the device tree references the strings in it
index 45b7389..7c07743 100644 (file)
@@ -198,8 +198,6 @@ static struct clocksource clocksource_cr16 = {
        .rating                 = 300,
        .read                   = read_cr16,
        .mask                   = CLOCKSOURCE_MASK(BITS_PER_LONG),
-       .mult                   = 0, /* to be set */
-       .shift                  = 22,
        .flags                  = CLOCK_SOURCE_IS_CONTINUOUS,
 };
 
@@ -270,7 +268,5 @@ void __init time_init(void)
 
        /* register at clocksource framework */
        current_cr16_khz = PAGE0->mem_10msec/10;  /* kHz */
-       clocksource_cr16.mult = clocksource_khz2mult(current_cr16_khz,
-                                               clocksource_cr16.shift);
-       clocksource_register(&clocksource_cr16);
+       clocksource_register_khz(&clocksource_cr16, current_cr16_khz);
 }
index 951e18f..ead0bc6 100644 (file)
@@ -117,6 +117,7 @@ config PPC
        select HAVE_KRETPROBES
        select HAVE_ARCH_TRACEHOOK
        select HAVE_MEMBLOCK
+       select HAVE_MEMBLOCK_NODE_MAP
        select HAVE_DMA_ATTRS
        select HAVE_DMA_API_DEBUG
        select USE_GENERIC_SMP_HELPERS if SMP
@@ -421,9 +422,6 @@ config ARCH_SPARSEMEM_DEFAULT
        def_bool y
        depends on (SMP && PPC_PSERIES) || PPC_PS3
 
-config ARCH_POPULATES_NODE_MAP
-       def_bool y
-
 config SYS_SUPPORTS_HUGETLBFS
        bool
 
index 98b7c4b..6ec1c38 100644 (file)
@@ -29,25 +29,8 @@ static inline void setup_cputime_one_jiffy(void) { }
 #include <asm/time.h>
 #include <asm/param.h>
 
-typedef u64 cputime_t;
-typedef u64 cputime64_t;
-
-#define cputime_zero                   ((cputime_t)0)
-#define cputime_max                    ((~((cputime_t)0) >> 1) - 1)
-#define cputime_add(__a, __b)          ((__a) +  (__b))
-#define cputime_sub(__a, __b)          ((__a) -  (__b))
-#define cputime_div(__a, __n)          ((__a) /  (__n))
-#define cputime_halve(__a)             ((__a) >> 1)
-#define cputime_eq(__a, __b)           ((__a) == (__b))
-#define cputime_gt(__a, __b)           ((__a) >  (__b))
-#define cputime_ge(__a, __b)           ((__a) >= (__b))
-#define cputime_lt(__a, __b)           ((__a) <  (__b))
-#define cputime_le(__a, __b)           ((__a) <= (__b))
-
-#define cputime64_zero                 ((cputime64_t)0)
-#define cputime64_add(__a, __b)                ((__a) + (__b))
-#define cputime64_sub(__a, __b)                ((__a) - (__b))
-#define cputime_to_cputime64(__ct)     (__ct)
+typedef u64 __nocast cputime_t;
+typedef u64 __nocast cputime64_t;
 
 #ifdef __KERNEL__
 
@@ -65,7 +48,7 @@ DECLARE_PER_CPU(unsigned long, cputime_scaled_last_delta);
 
 static inline unsigned long cputime_to_jiffies(const cputime_t ct)
 {
-       return mulhdu(ct, __cputime_jiffies_factor);
+       return mulhdu((__force u64) ct, __cputime_jiffies_factor);
 }
 
 /* Estimate the scaled cputime by scaling the real cputime based on
@@ -74,14 +57,15 @@ static inline cputime_t cputime_to_scaled(const cputime_t ct)
 {
        if (cpu_has_feature(CPU_FTR_SPURR) &&
            __get_cpu_var(cputime_last_delta))
-               return ct * __get_cpu_var(cputime_scaled_last_delta) /
-                           __get_cpu_var(cputime_last_delta);
+               return (__force u64) ct *
+                       __get_cpu_var(cputime_scaled_last_delta) /
+                       __get_cpu_var(cputime_last_delta);
        return ct;
 }
 
 static inline cputime_t jiffies_to_cputime(const unsigned long jif)
 {
-       cputime_t ct;
+       u64 ct;
        unsigned long sec;
 
        /* have to be a little careful about overflow */
@@ -93,7 +77,7 @@ static inline cputime_t jiffies_to_cputime(const unsigned long jif)
        }
        if (sec)
                ct += (cputime_t) sec * tb_ticks_per_sec;
-       return ct;
+       return (__force cputime_t) ct;
 }
 
 static inline void setup_cputime_one_jiffy(void)
@@ -103,7 +87,7 @@ static inline void setup_cputime_one_jiffy(void)
 
 static inline cputime64_t jiffies64_to_cputime64(const u64 jif)
 {
-       cputime_t ct;
+       u64 ct;
        u64 sec;
 
        /* have to be a little careful about overflow */
@@ -114,13 +98,13 @@ static inline cputime64_t jiffies64_to_cputime64(const u64 jif)
                do_div(ct, HZ);
        }
        if (sec)
-               ct += (cputime_t) sec * tb_ticks_per_sec;
-       return ct;
+               ct += (u64) sec * tb_ticks_per_sec;
+       return (__force cputime64_t) ct;
 }
 
 static inline u64 cputime64_to_jiffies64(const cputime_t ct)
 {
-       return mulhdu(ct, __cputime_jiffies_factor);
+       return mulhdu((__force u64) ct, __cputime_jiffies_factor);
 }
 
 /*
@@ -130,12 +114,12 @@ extern u64 __cputime_msec_factor;
 
 static inline unsigned long cputime_to_usecs(const cputime_t ct)
 {
-       return mulhdu(ct, __cputime_msec_factor) * USEC_PER_MSEC;
+       return mulhdu((__force u64) ct, __cputime_msec_factor) * USEC_PER_MSEC;
 }
 
 static inline cputime_t usecs_to_cputime(const unsigned long us)
 {
-       cputime_t ct;
+       u64 ct;
        unsigned long sec;
 
        /* have to be a little careful about overflow */
@@ -147,7 +131,7 @@ static inline cputime_t usecs_to_cputime(const unsigned long us)
        }
        if (sec)
                ct += (cputime_t) sec * tb_ticks_per_sec;
-       return ct;
+       return (__force cputime_t) ct;
 }
 
 #define usecs_to_cputime64(us)         usecs_to_cputime(us)
@@ -159,12 +143,12 @@ extern u64 __cputime_sec_factor;
 
 static inline unsigned long cputime_to_secs(const cputime_t ct)
 {
-       return mulhdu(ct, __cputime_sec_factor);
+       return mulhdu((__force u64) ct, __cputime_sec_factor);
 }
 
 static inline cputime_t secs_to_cputime(const unsigned long sec)
 {
-       return (cputime_t) sec * tb_ticks_per_sec;
+       return (__force cputime_t)((u64) sec * tb_ticks_per_sec);
 }
 
 /*
@@ -172,7 +156,7 @@ static inline cputime_t secs_to_cputime(const unsigned long sec)
  */
 static inline void cputime_to_timespec(const cputime_t ct, struct timespec *p)
 {
-       u64 x = ct;
+       u64 x = (__force u64) ct;
        unsigned int frac;
 
        frac = do_div(x, tb_ticks_per_sec);
@@ -184,11 +168,11 @@ static inline void cputime_to_timespec(const cputime_t ct, struct timespec *p)
 
 static inline cputime_t timespec_to_cputime(const struct timespec *p)
 {
-       cputime_t ct;
+       u64 ct;
 
        ct = (u64) p->tv_nsec * tb_ticks_per_sec;
        do_div(ct, 1000000000);
-       return ct + (u64) p->tv_sec * tb_ticks_per_sec;
+       return (__force cputime_t)(ct + (u64) p->tv_sec * tb_ticks_per_sec);
 }
 
 /*
@@ -196,7 +180,7 @@ static inline cputime_t timespec_to_cputime(const struct timespec *p)
  */
 static inline void cputime_to_timeval(const cputime_t ct, struct timeval *p)
 {
-       u64 x = ct;
+       u64 x = (__force u64) ct;
        unsigned int frac;
 
        frac = do_div(x, tb_ticks_per_sec);
@@ -208,11 +192,11 @@ static inline void cputime_to_timeval(const cputime_t ct, struct timeval *p)
 
 static inline cputime_t timeval_to_cputime(const struct timeval *p)
 {
-       cputime_t ct;
+       u64 ct;
 
        ct = (u64) p->tv_usec * tb_ticks_per_sec;
        do_div(ct, 1000000);
-       return ct + (u64) p->tv_sec * tb_ticks_per_sec;
+       return (__force cputime_t)(ct + (u64) p->tv_sec * tb_ticks_per_sec);
 }
 
 /*
@@ -222,12 +206,12 @@ extern u64 __cputime_clockt_factor;
 
 static inline unsigned long cputime_to_clock_t(const cputime_t ct)
 {
-       return mulhdu(ct, __cputime_clockt_factor);
+       return mulhdu((__force u64) ct, __cputime_clockt_factor);
 }
 
 static inline cputime_t clock_t_to_cputime(const unsigned long clk)
 {
-       cputime_t ct;
+       u64 ct;
        unsigned long sec;
 
        /* have to be a little careful about overflow */
@@ -238,8 +222,8 @@ static inline cputime_t clock_t_to_cputime(const unsigned long clk)
                do_div(ct, USER_HZ);
        }
        if (sec)
-               ct += (cputime_t) sec * tb_ticks_per_sec;
-       return ct;
+               ct += (u64) sec * tb_ticks_per_sec;
+       return (__force cputime_t) ct;
 }
 
 #define cputime64_to_clock_t(ct)       cputime_to_clock_t((cputime_t)(ct))
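
The conversion above hinges on sparse's __nocast annotation: cputime_t stays a u64 at the ABI level, but mixing it with plain integers now needs an explicit __force cast, which is why every helper unwraps the value, does the arithmetic on u64, and wraps the result again. A compressed sketch of that pattern, separate from the powerpc header (the type and helper names are illustrative):

    #include <linux/types.h>

    typedef u64 __nocast example_cputime_t;    /* sparse flags implicit mixing with u64 */

    static inline u64 example_unwrap(example_cputime_t ct)
    {
            return (__force u64) ct;           /* explicit opt-out of the __nocast check */
    }

    static inline example_cputime_t example_wrap(u64 ticks)
    {
            return (__force example_cputime_t) ticks;
    }
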
diff --git a/arch/powerpc/include/asm/memblock.h b/arch/powerpc/include/asm/memblock.h
deleted file mode 100644 (file)
index 43efc34..0000000
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef _ASM_POWERPC_MEMBLOCK_H
-#define _ASM_POWERPC_MEMBLOCK_H
-
-#include <asm/udbg.h>
-
-#define MEMBLOCK_DBG(fmt...) udbg_printf(fmt)
-
-#endif /* _ASM_POWERPC_MEMBLOCK_H */
index 39a2baa..9c3cd49 100644 (file)
@@ -46,6 +46,12 @@ static int __init powersave_off(char *arg)
 }
 __setup("powersave=off", powersave_off);
 
+#if defined(CONFIG_PPC_PSERIES) && defined(CONFIG_TRACEPOINTS)
+static const bool idle_uses_rcu = 1;
+#else
+static const bool idle_uses_rcu;
+#endif
+
 /*
  * The body of the idle task.
  */
@@ -56,7 +62,10 @@ void cpu_idle(void)
 
        set_thread_flag(TIF_POLLING_NRFLAG);
        while (1) {
-               tick_nohz_stop_sched_tick(1);
+               tick_nohz_idle_enter();
+               if (!idle_uses_rcu)
+                       rcu_idle_enter();
+
                while (!need_resched() && !cpu_should_die()) {
                        ppc64_runlatch_off();
 
@@ -93,7 +102,9 @@ void cpu_idle(void)
 
                HMT_medium();
                ppc64_runlatch_on();
-               tick_nohz_restart_sched_tick();
+               if (!idle_uses_rcu)
+                       rcu_idle_exit();
+               tick_nohz_idle_exit();
                preempt_enable_no_resched();
                if (cpu_should_die())
                        cpu_die();
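
This is the tick_nohz_stop_sched_tick(1)/tick_nohz_restart_sched_tick() split into tick_nohz_idle_enter()/tick_nohz_idle_exit() applied throughout this merge, with the RCU idle transition made explicit; pseries guards it with idle_uses_rcu because its cede path fires tracepoints that still need RCU (see the lpar.c hunk further down). A reduced sketch of the generic loop shape without that guard (cpu_relax() stands in for the platform's real low-power wait):

    #include <linux/sched.h>
    #include <linux/tick.h>
    #include <linux/rcupdate.h>
    #include <asm/processor.h>

    static void example_cpu_idle(void)
    {
            while (1) {
                    tick_nohz_idle_enter();
                    rcu_idle_enter();       /* no RCU read-side use past this point */

                    while (!need_resched())
                            cpu_relax();    /* platform low-power wait goes here */

                    rcu_idle_exit();
                    tick_nohz_idle_exit();

                    preempt_enable_no_resched();
                    schedule();
                    preempt_disable();
            }
    }
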
index 9ce1672..a2158a3 100644 (file)
@@ -107,9 +107,6 @@ void __init reserve_crashkernel(void)
        unsigned long long crash_size, crash_base;
        int ret;
 
-       /* this is necessary because of memblock_phys_mem_size() */
-       memblock_analyze();
-
        /* use common parsing */
        ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
                        &crash_size, &crash_base);
index fa1235b..abe405d 100644 (file)
@@ -733,8 +733,6 @@ void __init early_init_devtree(void *params)
        of_scan_flat_dt(early_init_dt_scan_chosen_ppc, cmd_line);
 
        /* Scan memory nodes and rebuild MEMBLOCKs */
-       memblock_init();
-
        of_scan_flat_dt(early_init_dt_scan_root, NULL);
        of_scan_flat_dt(early_init_dt_scan_memory_ppc, NULL);
 
@@ -756,20 +754,14 @@ void __init early_init_devtree(void *params)
        early_reserve_mem();
        phyp_dump_reserve_mem();
 
-       limit = memory_limit;
-       if (! limit) {
-               phys_addr_t memsize;
-
-               /* Ensure that total memory size is page-aligned, because
-                * otherwise mark_bootmem() gets upset. */
-               memblock_analyze();
-               memsize = memblock_phys_mem_size();
-               if ((memsize & PAGE_MASK) != memsize)
-                       limit = memsize & PAGE_MASK;
-       }
+       /*
+        * Ensure that total memory size is page-aligned, because otherwise
+        * mark_bootmem() gets upset.
+        */
+       limit = ALIGN(memory_limit ?: memblock_phys_mem_size(), PAGE_SIZE);
        memblock_enforce_memory_limit(limit);
 
-       memblock_analyze();
+       memblock_allow_resize();
        memblock_dump_all();
 
        DBG("Phys. mem: %llx\n", memblock_phys_mem_size());
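
memblock_init() and memblock_analyze() disappear here because memblock is now statically initialized and keeps its region arrays consistent as entries are added; the only remaining step is memblock_allow_resize(), which merely permits the arrays to grow on demand. A sketch of the resulting early-boot sequence, with placeholder ranges passed in as parameters:

    #include <linux/memblock.h>

    /* all base/size values are placeholders for firmware-provided ranges */
    static void __init example_setup_memory(phys_addr_t ram_base, phys_addr_t ram_size,
                                            phys_addr_t kernel_base, phys_addr_t kernel_size,
                                            phys_addr_t mem_limit)
    {
            /* no memblock_init(): memblock is usable before the first call */
            memblock_add(ram_base, ram_size);
            memblock_reserve(kernel_base, kernel_size);

            memblock_enforce_memory_limit(mem_limit);

            /* replaces memblock_analyze(): only enables on-demand array resizing */
            memblock_allow_resize();
            memblock_dump_all();
    }
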
index 161cefd..58861fa 100644 (file)
@@ -134,8 +134,7 @@ void __init MMU_init(void)
 
        if (memblock.memory.cnt > 1) {
 #ifndef CONFIG_WII
-               memblock.memory.cnt = 1;
-               memblock_analyze();
+               memblock_enforce_memory_limit(memblock.memory.regions[0].size);
                printk(KERN_WARNING "Only using first contiguous memory region");
 #else
                wii_memory_fixups();
@@ -158,7 +157,6 @@ void __init MMU_init(void)
 #ifndef CONFIG_HIGHMEM
                total_memory = total_lowmem;
                memblock_enforce_memory_limit(total_lowmem);
-               memblock_analyze();
 #endif /* CONFIG_HIGHMEM */
        }
 
index 2dd6bdd..8e2eb66 100644 (file)
@@ -199,7 +199,7 @@ void __init do_init_bootmem(void)
                unsigned long start_pfn, end_pfn;
                start_pfn = memblock_region_memory_base_pfn(reg);
                end_pfn = memblock_region_memory_end_pfn(reg);
-               add_active_range(0, start_pfn, end_pfn);
+               memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0);
        }
 
        /* Add all physical memory to the bootmem map, mark each area
index b22a83a..e6eea0a 100644 (file)
@@ -127,45 +127,25 @@ static int __cpuinit fake_numa_create_new_node(unsigned long end_pfn,
 }
 
 /*
- * get_active_region_work_fn - A helper function for get_node_active_region
- *     Returns datax set to the start_pfn and end_pfn if they contain
- *     the initial value of datax->start_pfn between them
- * @start_pfn: start page(inclusive) of region to check
- * @end_pfn: end page(exclusive) of region to check
- * @datax: comes in with ->start_pfn set to value to search for and
- *     goes out with active range if it contains it
- * Returns 1 if search value is in range else 0
- */
-static int __init get_active_region_work_fn(unsigned long start_pfn,
-                                       unsigned long end_pfn, void *datax)
-{
-       struct node_active_region *data;
-       data = (struct node_active_region *)datax;
-
-       if (start_pfn <= data->start_pfn && end_pfn > data->start_pfn) {
-               data->start_pfn = start_pfn;
-               data->end_pfn = end_pfn;
-               return 1;
-       }
-       return 0;
-
-}
-
-/*
- * get_node_active_region - Return active region containing start_pfn
+ * get_node_active_region - Return active region containing pfn
  * Active range returned is empty if none found.
- * @start_pfn: The page to return the region for.
- * @node_ar: Returned set to the active region containing start_pfn
+ * @pfn: The page to return the region for
+ * @node_ar: Returned set to the active region containing @pfn
  */
-static void __init get_node_active_region(unsigned long start_pfn,
-                      struct node_active_region *node_ar)
+static void __init get_node_active_region(unsigned long pfn,
+                                         struct node_active_region *node_ar)
 {
-       int nid = early_pfn_to_nid(start_pfn);
+       unsigned long start_pfn, end_pfn;
+       int i, nid;
 
-       node_ar->nid = nid;
-       node_ar->start_pfn = start_pfn;
-       node_ar->end_pfn = start_pfn;
-       work_with_active_regions(nid, get_active_region_work_fn, node_ar);
+       for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
+               if (pfn >= start_pfn && pfn < end_pfn) {
+                       node_ar->nid = nid;
+                       node_ar->start_pfn = start_pfn;
+                       node_ar->end_pfn = end_pfn;
+                       break;
+               }
+       }
 }
 
 static void map_cpu_to_node(int cpu, int node)
@@ -710,9 +690,7 @@ static void __init parse_drconf_memory(struct device_node *memory)
                        node_set_online(nid);
                        sz = numa_enforce_memory_limit(base, size);
                        if (sz)
-                               add_active_range(nid, base >> PAGE_SHIFT,
-                                                (base >> PAGE_SHIFT)
-                                                + (sz >> PAGE_SHIFT));
+                               memblock_set_node(base, sz, nid);
                } while (--ranges);
        }
 }
@@ -802,8 +780,7 @@ new_range:
                                continue;
                }
 
-               add_active_range(nid, start >> PAGE_SHIFT,
-                               (start >> PAGE_SHIFT) + (size >> PAGE_SHIFT));
+               memblock_set_node(start, size, nid);
 
                if (--ranges)
                        goto new_range;
@@ -839,7 +816,8 @@ static void __init setup_nonnuma(void)
                end_pfn = memblock_region_memory_end_pfn(reg);
 
                fake_numa_create_new_node(end_pfn, &nid);
-               add_active_range(nid, start_pfn, end_pfn);
+               memblock_set_node(PFN_PHYS(start_pfn),
+                                 PFN_PHYS(end_pfn - start_pfn), nid);
                node_set_online(nid);
        }
 }
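
With HAVE_MEMBLOCK_NODE_MAP selected, the node id is stored in the memblock regions themselves, so registering an active range becomes memblock_set_node() and walking active ranges becomes for_each_mem_pfn_range(), as the rewritten get_node_active_region() above shows. A compressed sketch of both sides of that conversion (function names and the base/size arguments are illustrative):

    #include <linux/memblock.h>
    #include <linux/numa.h>

    static void __init example_mark_range(phys_addr_t base, phys_addr_t size, int nid)
    {
            /* was: add_active_range(nid, base >> PAGE_SHIFT, (base + size) >> PAGE_SHIFT) */
            memblock_set_node(base, size, nid);
    }

    static int __init example_pfn_to_nid(unsigned long pfn)
    {
            unsigned long start_pfn, end_pfn;
            int i, nid;

            /* was: work_with_active_regions() plus a callback */
            for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
                    if (pfn >= start_pfn && pfn < end_pfn)
                            return nid;
            return -1;
    }
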
index 4e13d6f..573ba3b 100644 (file)
@@ -615,7 +615,6 @@ static void __early_init_mmu(int boot_cpu)
 
                /* limit memory so we dont have linear faults */
                memblock_enforce_memory_limit(linear_map_top);
-               memblock_analyze();
 
                patch_exception(0x1c0, exc_data_tlb_miss_bolted_book3e);
                patch_exception(0x1e0, exc_instruction_tlb_miss_bolted_book3e);
index 1b5dc1a..6d8dadf 100644 (file)
@@ -79,24 +79,19 @@ void __init wii_memory_fixups(void)
        BUG_ON(memblock.memory.cnt != 2);
        BUG_ON(!page_aligned(p[0].base) || !page_aligned(p[1].base));
 
-       p[0].size = _ALIGN_DOWN(p[0].size, PAGE_SIZE);
-       p[1].size = _ALIGN_DOWN(p[1].size, PAGE_SIZE);
+       /* trim unaligned tail */
+       memblock_remove(ALIGN(p[1].base + p[1].size, PAGE_SIZE),
+                       (phys_addr_t)ULLONG_MAX);
 
-       wii_hole_start = p[0].base + p[0].size;
+       /* determine the hole, then add and reserve it */
+       wii_hole_start = ALIGN(p[0].base + p[0].size, PAGE_SIZE);
        wii_hole_size = p[1].base - wii_hole_start;
-
-       pr_info("MEM1: <%08llx %08llx>\n", p[0].base, p[0].size);
-       pr_info("HOLE: <%08lx %08lx>\n", wii_hole_start, wii_hole_size);
-       pr_info("MEM2: <%08llx %08llx>\n", p[1].base, p[1].size);
-
-       p[0].size += wii_hole_size + p[1].size;
-
-       memblock.memory.cnt = 1;
-       memblock_analyze();
-
-       /* reserve the hole */
+       memblock_add(wii_hole_start, wii_hole_size);
        memblock_reserve(wii_hole_start, wii_hole_size);
 
+       BUG_ON(memblock.memory.cnt != 1);
+       __memblock_dump_all();
+
        /* allow ioremapping the address space in the hole */
        __allow_ioremap_reserved = 1;
 }
index ea0acbd..8fc6258 100644 (file)
@@ -563,7 +563,8 @@ static void yield_shared_processor(void)
 static void iseries_shared_idle(void)
 {
        while (1) {
-               tick_nohz_stop_sched_tick(1);
+               tick_nohz_idle_enter();
+               rcu_idle_enter();
                while (!need_resched() && !hvlpevent_is_pending()) {
                        local_irq_disable();
                        ppc64_runlatch_off();
@@ -577,7 +578,8 @@ static void iseries_shared_idle(void)
                }
 
                ppc64_runlatch_on();
-               tick_nohz_restart_sched_tick();
+               rcu_idle_exit();
+               tick_nohz_idle_exit();
 
                if (hvlpevent_is_pending())
                        process_iSeries_events();
@@ -593,7 +595,8 @@ static void iseries_dedicated_idle(void)
        set_thread_flag(TIF_POLLING_NRFLAG);
 
        while (1) {
-               tick_nohz_stop_sched_tick(1);
+               tick_nohz_idle_enter();
+               rcu_idle_enter();
                if (!need_resched()) {
                        while (!need_resched()) {
                                ppc64_runlatch_off();
@@ -610,7 +613,8 @@ static void iseries_dedicated_idle(void)
                }
 
                ppc64_runlatch_on();
-               tick_nohz_restart_sched_tick();
+               rcu_idle_exit();
+               tick_nohz_idle_exit();
                preempt_enable_no_resched();
                schedule();
                preempt_disable();
index 72714ad..8bd6ba5 100644 (file)
@@ -319,7 +319,6 @@ static int __init ps3_mm_add_memory(void)
        }
 
        memblock_add(start_addr, map.r1.size);
-       memblock_analyze();
 
        result = online_pages(start_pfn, nr_pages);
 
index 27a4950..52d429b 100644 (file)
@@ -555,6 +555,8 @@ void __trace_hcall_entry(unsigned long opcode, unsigned long *args)
 
        (*depth)++;
        trace_hcall_entry(opcode, args);
+       if (opcode == H_CEDE)
+               rcu_idle_enter();
        (*depth)--;
 
 out:
@@ -575,6 +577,8 @@ void __trace_hcall_exit(long opcode, unsigned long retval,
                goto out;
 
        (*depth)++;
+       if (opcode == H_CEDE)
+               rcu_idle_exit();
        trace_hcall_exit(opcode, retval, retbuf);
        (*depth)--;
 
index 373679b..d48ede3 100644 (file)
@@ -92,6 +92,9 @@ config S390
        select HAVE_ARCH_JUMP_LABEL if !MARCH_G5
        select HAVE_RCU_TABLE_FREE if SMP
        select ARCH_SAVE_PAGE_KEYS if HIBERNATION
+       select HAVE_MEMBLOCK
+       select HAVE_MEMBLOCK_NODE_MAP
+       select ARCH_DISCARD_MEMBLOCK
        select ARCH_INLINE_SPIN_TRYLOCK
        select ARCH_INLINE_SPIN_TRYLOCK_BH
        select ARCH_INLINE_SPIN_LOCK
@@ -345,9 +348,6 @@ config WARN_DYNAMIC_STACK
 
          Say N if you are unsure.
 
-config ARCH_POPULATES_NODE_MAP
-       def_bool y
-
 comment "Kernel preemption"
 
 source "kernel/Kconfig.preempt"
index 92f1cb7..4de031d 100644 (file)
@@ -115,21 +115,21 @@ static void appldata_get_os_data(void *data)
        j = 0;
        for_each_online_cpu(i) {
                os_data->os_cpu[j].per_cpu_user =
-                       cputime_to_jiffies(kstat_cpu(i).cpustat.user);
+                       cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_USER]);
                os_data->os_cpu[j].per_cpu_nice =
-                       cputime_to_jiffies(kstat_cpu(i).cpustat.nice);
+                       cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_NICE]);
                os_data->os_cpu[j].per_cpu_system =
-                       cputime_to_jiffies(kstat_cpu(i).cpustat.system);
+                       cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]);
                os_data->os_cpu[j].per_cpu_idle =
-                       cputime_to_jiffies(kstat_cpu(i).cpustat.idle);
+                       cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_IDLE]);
                os_data->os_cpu[j].per_cpu_irq =
-                       cputime_to_jiffies(kstat_cpu(i).cpustat.irq);
+                       cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_IRQ]);
                os_data->os_cpu[j].per_cpu_softirq =
-                       cputime_to_jiffies(kstat_cpu(i).cpustat.softirq);
+                       cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]);
                os_data->os_cpu[j].per_cpu_iowait =
-                       cputime_to_jiffies(kstat_cpu(i).cpustat.iowait);
+                       cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_IOWAIT]);
                os_data->os_cpu[j].per_cpu_steal =
-                       cputime_to_jiffies(kstat_cpu(i).cpustat.steal);
+                       cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_STEAL]);
                os_data->os_cpu[j].cpu_id = i;
                j++;
        }
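
The per-cpu accounting fields move from the named members of kstat_cpu(i).cpustat to the indexed array kcpustat_cpu(i).cpustat[], selected by the CPUTIME_* enum; the jiffies conversion stays the same. A minimal sketch of reading one counter with the new accessors (the summing helper itself is illustrative):

    #include <linux/kernel_stat.h>
    #include <linux/cpumask.h>
    #include <asm/cputime.h>

    static u64 example_total_user_jiffies(void)
    {
            u64 sum = 0;
            int cpu;

            for_each_online_cpu(cpu)
                    /* indexed access replaces the old named struct fields */
                    sum += cputime_to_jiffies(kcpustat_cpu(cpu).cpustat[CPUTIME_USER]);

            return sum;
    }
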
index b9acaaa..c23c390 100644 (file)
 
 /* We want to use full resolution of the CPU timer: 2**-12 micro-seconds. */
 
-typedef unsigned long long cputime_t;
-typedef unsigned long long cputime64_t;
+typedef unsigned long long __nocast cputime_t;
+typedef unsigned long long __nocast cputime64_t;
 
-#ifndef __s390x__
-
-static inline unsigned int
-__div(unsigned long long n, unsigned int base)
+static inline unsigned long __div(unsigned long long n, unsigned long base)
 {
+#ifndef __s390x__
        register_pair rp;
 
        rp.pair = n >> 1;
        asm ("dr %0,%1" : "+d" (rp) : "d" (base >> 1));
        return rp.subreg.odd;
+#else /* __s390x__ */
+       return n / base;
+#endif /* __s390x__ */
 }
 
-#else /* __s390x__ */
+#define cputime_one_jiffy              jiffies_to_cputime(1)
 
-static inline unsigned int
-__div(unsigned long long n, unsigned int base)
+/*
+ * Convert cputime to jiffies and back.
+ */
+static inline unsigned long cputime_to_jiffies(const cputime_t cputime)
 {
-       return n / base;
+       return __div((__force unsigned long long) cputime, 4096000000ULL / HZ);
 }
 
-#endif /* __s390x__ */
+static inline cputime_t jiffies_to_cputime(const unsigned int jif)
+{
+       return (__force cputime_t)(jif * (4096000000ULL / HZ));
+}
 
-#define cputime_zero                   (0ULL)
-#define cputime_one_jiffy              jiffies_to_cputime(1)
-#define cputime_max                    ((~0UL >> 1) - 1)
-#define cputime_add(__a, __b)          ((__a) +  (__b))
-#define cputime_sub(__a, __b)          ((__a) -  (__b))
-#define cputime_div(__a, __n) ({               \
-       unsigned long long __div = (__a);       \
-       do_div(__div,__n);                      \
-       __div;                                  \
-})
-#define cputime_halve(__a)             ((__a) >> 1)
-#define cputime_eq(__a, __b)           ((__a) == (__b))
-#define cputime_gt(__a, __b)           ((__a) >  (__b))
-#define cputime_ge(__a, __b)           ((__a) >= (__b))
-#define cputime_lt(__a, __b)           ((__a) <  (__b))
-#define cputime_le(__a, __b)           ((__a) <= (__b))
-#define cputime_to_jiffies(__ct)       (__div((__ct), 4096000000ULL / HZ))
-#define cputime_to_scaled(__ct)                (__ct)
-#define jiffies_to_cputime(__hz)       ((cputime_t)(__hz) * (4096000000ULL / HZ))
-
-#define cputime64_zero                 (0ULL)
-#define cputime64_add(__a, __b)                ((__a) + (__b))
-#define cputime_to_cputime64(__ct)     (__ct)
-
-static inline u64
-cputime64_to_jiffies64(cputime64_t cputime)
-{
-       do_div(cputime, 4096000000ULL / HZ);
-       return cputime;
+static inline u64 cputime64_to_jiffies64(cputime64_t cputime)
+{
+       unsigned long long jif = (__force unsigned long long) cputime;
+       do_div(jif, 4096000000ULL / HZ);
+       return jif;
+}
+
+static inline cputime64_t jiffies64_to_cputime64(const u64 jif)
+{
+       return (__force cputime64_t)(jif * (4096000000ULL / HZ));
 }
 
 /*
  * Convert cputime to microseconds and back.
  */
-static inline unsigned int
-cputime_to_usecs(const cputime_t cputime)
+static inline unsigned int cputime_to_usecs(const cputime_t cputime)
 {
-       return cputime_div(cputime, 4096);
+       return (__force unsigned long long) cputime >> 12;
 }
 
-static inline cputime_t
-usecs_to_cputime(const unsigned int m)
+static inline cputime_t usecs_to_cputime(const unsigned int m)
 {
-       return (cputime_t) m * 4096;
+       return (__force cputime_t)(m * 4096ULL);
 }
 
 #define usecs_to_cputime64(m)          usecs_to_cputime(m)
@@ -92,40 +77,39 @@ usecs_to_cputime(const unsigned int m)
 /*
  * Convert cputime to milliseconds and back.
  */
-static inline unsigned int
-cputime_to_secs(const cputime_t cputime)
+static inline unsigned int cputime_to_secs(const cputime_t cputime)
 {
-       return __div(cputime, 2048000000) >> 1;
+       return __div((__force unsigned long long) cputime, 2048000000) >> 1;
 }
 
-static inline cputime_t
-secs_to_cputime(const unsigned int s)
+static inline cputime_t secs_to_cputime(const unsigned int s)
 {
-       return (cputime_t) s * 4096000000ULL;
+       return (__force cputime_t)(s * 4096000000ULL);
 }
 
 /*
  * Convert cputime to timespec and back.
  */
-static inline cputime_t
-timespec_to_cputime(const struct timespec *value)
+static inline cputime_t timespec_to_cputime(const struct timespec *value)
 {
-       return value->tv_nsec * 4096 / 1000 + (u64) value->tv_sec * 4096000000ULL;
+       unsigned long long ret = value->tv_sec * 4096000000ULL;
+       return (__force cputime_t)(ret + value->tv_nsec * 4096 / 1000);
 }
 
-static inline void
-cputime_to_timespec(const cputime_t cputime, struct timespec *value)
+static inline void cputime_to_timespec(const cputime_t cputime,
+                                      struct timespec *value)
 {
+       unsigned long long __cputime = (__force unsigned long long) cputime;
 #ifndef __s390x__
        register_pair rp;
 
-       rp.pair = cputime >> 1;
+       rp.pair = __cputime >> 1;
        asm ("dr %0,%1" : "+d" (rp) : "d" (2048000000UL));
        value->tv_nsec = rp.subreg.even * 1000 / 4096;
        value->tv_sec = rp.subreg.odd;
 #else
-       value->tv_nsec = (cputime % 4096000000ULL) * 1000 / 4096;
-       value->tv_sec = cputime / 4096000000ULL;
+       value->tv_nsec = (__cputime % 4096000000ULL) * 1000 / 4096;
+       value->tv_sec = __cputime / 4096000000ULL;
 #endif
 }
 
@@ -134,50 +118,52 @@ cputime_to_timespec(const cputime_t cputime, struct timespec *value)
  * Since cputime and timeval have the same resolution (microseconds)
  * this is easy.
  */
-static inline cputime_t
-timeval_to_cputime(const struct timeval *value)
+static inline cputime_t timeval_to_cputime(const struct timeval *value)
 {
-       return value->tv_usec * 4096 + (u64) value->tv_sec * 4096000000ULL;
+       unsigned long long ret = value->tv_sec * 4096000000ULL;
+       return (__force cputime_t)(ret + value->tv_usec * 4096ULL);
 }
 
-static inline void
-cputime_to_timeval(const cputime_t cputime, struct timeval *value)
+static inline void cputime_to_timeval(const cputime_t cputime,
+                                     struct timeval *value)
 {
+       unsigned long long __cputime = (__force unsigned long long) cputime;
 #ifndef __s390x__
        register_pair rp;
 
-       rp.pair = cputime >> 1;
+       rp.pair = __cputime >> 1;
        asm ("dr %0,%1" : "+d" (rp) : "d" (2048000000UL));
        value->tv_usec = rp.subreg.even / 4096;
        value->tv_sec = rp.subreg.odd;
 #else
-       value->tv_usec = (cputime % 4096000000ULL) / 4096;
-       value->tv_sec = cputime / 4096000000ULL;
+       value->tv_usec = (__cputime % 4096000000ULL) / 4096;
+       value->tv_sec = __cputime / 4096000000ULL;
 #endif
 }
 
 /*
  * Convert cputime to clock and back.
  */
-static inline clock_t
-cputime_to_clock_t(cputime_t cputime)
+static inline clock_t cputime_to_clock_t(cputime_t cputime)
 {
-       return cputime_div(cputime, 4096000000ULL / USER_HZ);
+       unsigned long long clock = (__force unsigned long long) cputime;
+       do_div(clock, 4096000000ULL / USER_HZ);
+       return clock;
 }
 
-static inline cputime_t
-clock_t_to_cputime(unsigned long x)
+static inline cputime_t clock_t_to_cputime(unsigned long x)
 {
-       return (cputime_t) x * (4096000000ULL / USER_HZ);
+       return (__force cputime_t)(x * (4096000000ULL / USER_HZ));
 }
 
 /*
  * Convert cputime64 to clock.
  */
-static inline clock_t
-cputime64_to_clock_t(cputime64_t cputime)
+static inline clock_t cputime64_to_clock_t(cputime64_t cputime)
 {
-       return cputime_div(cputime, 4096000000ULL / USER_HZ);
+       unsigned long long clock = (__force unsigned long long) cputime;
+       do_div(clock, 4096000000ULL / USER_HZ);
+       return clock;
 }
 
 struct s390_idle_data {
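
Several of the converted helpers above copy the annotated value into a local unsigned long long before calling do_div(): do_div() divides its first argument in place (and returns the remainder), so it needs a plain integer lvalue rather than the __nocast type. A small sketch of that idiom, redeclaring the typedef only to keep the example standalone (the divisor is an assumed per-tick constant):

    #include <linux/types.h>
    #include <asm/div64.h>

    typedef u64 __nocast example_cputime_t;

    static inline unsigned long long example_to_clock_t(example_cputime_t ct,
                                                        u32 ticks_per_clock)
    {
            unsigned long long clock = (__force unsigned long long) ct;

            do_div(clock, ticks_per_clock);    /* divides 'clock' in place */
            return clock;
    }
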
index 9451b21..3201ae4 100644 (file)
@@ -91,10 +91,12 @@ static void default_idle(void)
 void cpu_idle(void)
 {
        for (;;) {
-               tick_nohz_stop_sched_tick(1);
+               tick_nohz_idle_enter();
+               rcu_idle_enter();
                while (!need_resched())
                        default_idle();
-               tick_nohz_restart_sched_tick();
+               rcu_idle_exit();
+               tick_nohz_idle_exit();
                preempt_enable_no_resched();
                schedule();
                preempt_disable();
index e54c4ff..f11d1b0 100644 (file)
@@ -21,6 +21,7 @@
 #include <linux/module.h>
 #include <linux/sched.h>
 #include <linux/kernel.h>
+#include <linux/memblock.h>
 #include <linux/mm.h>
 #include <linux/stddef.h>
 #include <linux/unistd.h>
@@ -820,7 +821,8 @@ setup_memory(void)
                end_chunk = min(end_chunk, end_pfn);
                if (start_chunk >= end_chunk)
                        continue;
-               add_active_range(0, start_chunk, end_chunk);
+               memblock_add_node(PFN_PHYS(start_chunk),
+                                 PFN_PHYS(end_chunk - start_chunk), 0);
                pfn = max(start_chunk, start_pfn);
                for (; pfn < end_chunk; pfn++)
                        page_set_storage_key(PFN_PHYS(pfn),
index f43c0e4..9daee91 100644 (file)
@@ -22,6 +22,7 @@
 #include <asm/irq.h>
 
 #include "hwsampler.h"
+#include "op_counter.h"
 
 #define MAX_NUM_SDB 511
 #define MIN_NUM_SDB 1
@@ -896,6 +897,8 @@ static void add_samples_to_oprofile(unsigned int cpu, unsigned long *sdbt,
                if (sample_data_ptr->P == 1) {
                        /* userspace sample */
                        unsigned int pid = sample_data_ptr->prim_asn;
+                       if (!counter_config.user)
+                               goto skip_sample;
                        rcu_read_lock();
                        tsk = pid_task(find_vpid(pid), PIDTYPE_PID);
                        if (tsk)
@@ -903,6 +906,8 @@ static void add_samples_to_oprofile(unsigned int cpu, unsigned long *sdbt,
                        rcu_read_unlock();
                } else {
                        /* kernelspace sample */
+                       if (!counter_config.kernel)
+                               goto skip_sample;
                        regs = task_pt_regs(current);
                }
 
@@ -910,7 +915,7 @@ static void add_samples_to_oprofile(unsigned int cpu, unsigned long *sdbt,
                oprofile_add_ext_hw_sample(sample_data_ptr->ia, regs, 0,
                                !sample_data_ptr->P, tsk);
                mutex_unlock(&hws_sem);
-
+       skip_sample:
                sample_data_ptr++;
        }
 }
index bd58b72..2297be4 100644 (file)
@@ -2,10 +2,11 @@
  * arch/s390/oprofile/init.c
  *
  * S390 Version
- *   Copyright (C) 2003 IBM Deutschland Entwicklung GmbH, IBM Corporation
+ *   Copyright (C) 2002-2011 IBM Deutschland Entwicklung GmbH, IBM Corporation
  *   Author(s): Thomas Spatzier (tspat@de.ibm.com)
  *   Author(s): Mahesh Salgaonkar (mahesh@linux.vnet.ibm.com)
  *   Author(s): Heinz Graalfs (graalfs@linux.vnet.ibm.com)
+ *   Author(s): Andreas Krebbel (krebbel@linux.vnet.ibm.com)
  *
  * @remark Copyright 2002-2011 OProfile authors
  */
@@ -14,6 +15,8 @@
 #include <linux/init.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
+#include <linux/module.h>
+#include <asm/processor.h>
 
 #include "../../../drivers/oprofile/oprof.h"
 
@@ -22,6 +25,7 @@ extern void s390_backtrace(struct pt_regs * const regs, unsigned int depth);
 #ifdef CONFIG_64BIT
 
 #include "hwsampler.h"
+#include "op_counter.h"
 
 #define DEFAULT_INTERVAL       4127518
 
@@ -35,16 +39,41 @@ static unsigned long oprofile_max_interval;
 static unsigned long oprofile_sdbt_blocks = DEFAULT_SDBT_BLOCKS;
 static unsigned long oprofile_sdb_blocks = DEFAULT_SDB_BLOCKS;
 
-static int hwsampler_file;
+static int hwsampler_enabled;
 static int hwsampler_running;  /* start_mutex must be held to change */
+static int hwsampler_available;
 
 static struct oprofile_operations timer_ops;
 
+struct op_counter_config counter_config;
+
+enum __force_cpu_type {
+       reserved = 0,           /* do not force */
+       timer,
+};
+static int force_cpu_type;
+
+static int set_cpu_type(const char *str, struct kernel_param *kp)
+{
+       if (!strcmp(str, "timer")) {
+               force_cpu_type = timer;
+               printk(KERN_INFO "oprofile: forcing timer to be returned "
+                                "as cpu type\n");
+       } else {
+               force_cpu_type = 0;
+       }
+
+       return 0;
+}
+module_param_call(cpu_type, set_cpu_type, NULL, NULL, 0);
+MODULE_PARM_DESC(cpu_type, "Force legacy basic mode sampling "
+                          "(report cpu_type \"timer\")");
+
 static int oprofile_hwsampler_start(void)
 {
        int retval;
 
-       hwsampler_running = hwsampler_file;
+       hwsampler_running = hwsampler_enabled;
 
        if (!hwsampler_running)
                return timer_ops.start();
@@ -72,10 +101,16 @@ static void oprofile_hwsampler_stop(void)
        return;
 }
 
+/*
+ * File ops used for:
+ * /dev/oprofile/0/enabled
+ * /dev/oprofile/hwsampling/hwsampler  (cpu_type = timer)
+ */
+
 static ssize_t hwsampler_read(struct file *file, char __user *buf,
                size_t count, loff_t *offset)
 {
-       return oprofilefs_ulong_to_user(hwsampler_file, buf, count, offset);
+       return oprofilefs_ulong_to_user(hwsampler_enabled, buf, count, offset);
 }
 
 static ssize_t hwsampler_write(struct file *file, char const __user *buf,
@@ -91,6 +126,9 @@ static ssize_t hwsampler_write(struct file *file, char const __user *buf,
        if (retval <= 0)
                return retval;
 
+       if (val != 0 && val != 1)
+               return -EINVAL;
+
        if (oprofile_started)
                /*
                 * save to do without locking as we set
@@ -99,7 +137,7 @@ static ssize_t hwsampler_write(struct file *file, char const __user *buf,
                 */
                return -EBUSY;
 
-       hwsampler_file = val;
+       hwsampler_enabled = val;
 
        return count;
 }
@@ -109,38 +147,311 @@ static const struct file_operations hwsampler_fops = {
        .write          = hwsampler_write,
 };
 
+/*
+ * File ops used for:
+ * /dev/oprofile/0/count
+ * /dev/oprofile/hwsampling/hw_interval  (cpu_type = timer)
+ *
+ * Make sure that the value is within the hardware range.
+ */
+
+static ssize_t hw_interval_read(struct file *file, char __user *buf,
+                               size_t count, loff_t *offset)
+{
+       return oprofilefs_ulong_to_user(oprofile_hw_interval, buf,
+                                       count, offset);
+}
+
+static ssize_t hw_interval_write(struct file *file, char const __user *buf,
+                                size_t count, loff_t *offset)
+{
+       unsigned long val;
+       int retval;
+
+       if (*offset)
+               return -EINVAL;
+       retval = oprofilefs_ulong_from_user(&val, buf, count);
+       if (retval)
+               return retval;
+       if (val < oprofile_min_interval)
+               oprofile_hw_interval = oprofile_min_interval;
+       else if (val > oprofile_max_interval)
+               oprofile_hw_interval = oprofile_max_interval;
+       else
+               oprofile_hw_interval = val;
+
+       return count;
+}
+
+static const struct file_operations hw_interval_fops = {
+       .read           = hw_interval_read,
+       .write          = hw_interval_write,
+};
+
+/*
+ * File ops used for:
+ * /dev/oprofile/0/event
+ * Only a single event with number 0 is supported with this counter.
+ *
+ * /dev/oprofile/0/unit_mask
+ * This is a dummy file needed by the user space tools.
+ * No value other than 0 is accepted or returned.
+ */
+
+static ssize_t hwsampler_zero_read(struct file *file, char __user *buf,
+                                   size_t count, loff_t *offset)
+{
+       return oprofilefs_ulong_to_user(0, buf, count, offset);
+}
+
+static ssize_t hwsampler_zero_write(struct file *file, char const __user *buf,
+                                    size_t count, loff_t *offset)
+{
+       unsigned long val;
+       int retval;
+
+       if (*offset)
+               return -EINVAL;
+
+       retval = oprofilefs_ulong_from_user(&val, buf, count);
+       if (retval)
+               return retval;
+       if (val != 0)
+               return -EINVAL;
+       return count;
+}
+
+static const struct file_operations zero_fops = {
+       .read           = hwsampler_zero_read,
+       .write          = hwsampler_zero_write,
+};
+
+/* /dev/oprofile/0/kernel file ops.  */
+
+static ssize_t hwsampler_kernel_read(struct file *file, char __user *buf,
+                                    size_t count, loff_t *offset)
+{
+       return oprofilefs_ulong_to_user(counter_config.kernel,
+                                       buf, count, offset);
+}
+
+static ssize_t hwsampler_kernel_write(struct file *file, char const __user *buf,
+                                     size_t count, loff_t *offset)
+{
+       unsigned long val;
+       int retval;
+
+       if (*offset)
+               return -EINVAL;
+
+       retval = oprofilefs_ulong_from_user(&val, buf, count);
+       if (retval)
+               return retval;
+
+       if (val != 0 && val != 1)
+               return -EINVAL;
+
+       counter_config.kernel = val;
+
+       return count;
+}
+
+static const struct file_operations kernel_fops = {
+       .read           = hwsampler_kernel_read,
+       .write          = hwsampler_kernel_write,
+};
+
+/* /dev/oprofile/0/user file ops. */
+
+static ssize_t hwsampler_user_read(struct file *file, char __user *buf,
+                                  size_t count, loff_t *offset)
+{
+       return oprofilefs_ulong_to_user(counter_config.user,
+                                       buf, count, offset);
+}
+
+static ssize_t hwsampler_user_write(struct file *file, char const __user *buf,
+                                   size_t count, loff_t *offset)
+{
+       unsigned long val;
+       int retval;
+
+       if (*offset)
+               return -EINVAL;
+
+       retval = oprofilefs_ulong_from_user(&val, buf, count);
+       if (retval)
+               return retval;
+
+       if (val != 0 && val != 1)
+               return -EINVAL;
+
+       counter_config.user = val;
+
+       return count;
+}
+
+static const struct file_operations user_fops = {
+       .read           = hwsampler_user_read,
+       .write          = hwsampler_user_write,
+};
+
+
+/*
+ * File ops used for: /dev/oprofile/timer/enabled
+ * The value is always the inverse of hwsampler_enabled, so no separate
+ * variable is needed and no locking is required.
+ */
+
+static ssize_t timer_enabled_read(struct file *file, char __user *buf,
+                                 size_t count, loff_t *offset)
+{
+       return oprofilefs_ulong_to_user(!hwsampler_enabled, buf, count, offset);
+}
+
+static ssize_t timer_enabled_write(struct file *file, char const __user *buf,
+                                  size_t count, loff_t *offset)
+{
+       unsigned long val;
+       int retval;
+
+       if (*offset)
+               return -EINVAL;
+
+       retval = oprofilefs_ulong_from_user(&val, buf, count);
+       if (retval)
+               return retval;
+
+       if (val != 0 && val != 1)
+               return -EINVAL;
+
+       /* Timer mode cannot be disabled unless hardware sampling is available. */
+       if (val == 0 && !hwsampler_available)
+               return -EINVAL;
+
+       if (oprofile_started)
+               /*
+                * save to do without locking as we set
+                * safe to do without locking as we set
+                * held
+                */
+               return -EBUSY;
+
+       hwsampler_enabled = !val;
+
+       return count;
+}
+
+static const struct file_operations timer_enabled_fops = {
+       .read           = timer_enabled_read,
+       .write          = timer_enabled_write,
+};
+
+
 static int oprofile_create_hwsampling_files(struct super_block *sb,
-                                               struct dentry *root)
+                                           struct dentry *root)
 {
-       struct dentry *hw_dir;
+       struct dentry *dir;
+
+       dir = oprofilefs_mkdir(sb, root, "timer");
+       if (!dir)
+               return -EINVAL;
+
+       oprofilefs_create_file(sb, dir, "enabled", &timer_enabled_fops);
+
+       if (!hwsampler_available)
+               return 0;
 
        /* reinitialize default values */
-       hwsampler_file = 1;
+       hwsampler_enabled = 1;
+       counter_config.kernel = 1;
+       counter_config.user = 1;
 
-       hw_dir = oprofilefs_mkdir(sb, root, "hwsampling");
-       if (!hw_dir)
-               return -EINVAL;
+       if (!force_cpu_type) {
+               /*
+                * Create the counter file system.  A single virtual
+                * counter is created which can be used to
+                * user space.  User space will configure a single
+                * counter with a single event.  The values of 'event'
+                * and 'unit_mask' are not evaluated by the kernel code
+                * and can only be set to 0.
+                * and can only be set to 0.
+                */
+
+               dir = oprofilefs_mkdir(sb, root, "0");
+               if (!dir)
+                       return -EINVAL;
 
-       oprofilefs_create_file(sb, hw_dir, "hwsampler", &hwsampler_fops);
-       oprofilefs_create_ulong(sb, hw_dir, "hw_interval",
-                               &oprofile_hw_interval);
-       oprofilefs_create_ro_ulong(sb, hw_dir, "hw_min_interval",
-                               &oprofile_min_interval);
-       oprofilefs_create_ro_ulong(sb, hw_dir, "hw_max_interval",
-                               &oprofile_max_interval);
-       oprofilefs_create_ulong(sb, hw_dir, "hw_sdbt_blocks",
-                               &oprofile_sdbt_blocks);
+               oprofilefs_create_file(sb, dir, "enabled", &hwsampler_fops);
+               oprofilefs_create_file(sb, dir, "event", &zero_fops);
+               oprofilefs_create_file(sb, dir, "count", &hw_interval_fops);
+               oprofilefs_create_file(sb, dir, "unit_mask", &zero_fops);
+               oprofilefs_create_file(sb, dir, "kernel", &kernel_fops);
+               oprofilefs_create_file(sb, dir, "user", &user_fops);
+               oprofilefs_create_ulong(sb, dir, "hw_sdbt_blocks",
+                                       &oprofile_sdbt_blocks);
 
+       } else {
+               /*
+                * Hardware sampling can be used but the cpu_type is
+                * forced to timer in order to deal with legacy user
+                * space tools.  The /dev/oprofile/hwsampling fs is
+                * provided in that case.
+                */
+               dir = oprofilefs_mkdir(sb, root, "hwsampling");
+               if (!dir)
+                       return -EINVAL;
+
+               oprofilefs_create_file(sb, dir, "hwsampler",
+                                      &hwsampler_fops);
+               oprofilefs_create_file(sb, dir, "hw_interval",
+                                      &hw_interval_fops);
+               oprofilefs_create_ro_ulong(sb, dir, "hw_min_interval",
+                                          &oprofile_min_interval);
+               oprofilefs_create_ro_ulong(sb, dir, "hw_max_interval",
+                                          &oprofile_max_interval);
+               oprofilefs_create_ulong(sb, dir, "hw_sdbt_blocks",
+                                       &oprofile_sdbt_blocks);
+       }
        return 0;
 }
 
 static int oprofile_hwsampler_init(struct oprofile_operations *ops)
 {
+       /*
+        * Initialize the timer mode infrastructure as well in order
+        * to be able to switch back dynamically.  oprofile_timer_init
+        * is not supposed to fail.
+        */
+       if (oprofile_timer_init(ops))
+               BUG();
+
+       memcpy(&timer_ops, ops, sizeof(timer_ops));
+       ops->create_files = oprofile_create_hwsampling_files;
+
+       /*
+        * If the user space tools do not support newer cpu types,
+        * the cpu_type module parameter can be used to always
+        * return "timer" as the cpu type.
+        */
+       if (force_cpu_type != timer) {
+               struct cpuid id;
+
+               get_cpu_id(&id);
+
+               switch (id.machine) {
+               case 0x2097: case 0x2098: ops->cpu_type = "s390/z10"; break;
+               case 0x2817: case 0x2818: ops->cpu_type = "s390/z196"; break;
+               default: return -ENODEV;
+               }
+       }
+
        if (hwsampler_setup())
                return -ENODEV;
 
        /*
-        * create hwsampler files only if hwsampler_setup() succeeds.
+        * Query the range for the sampling interval from the
+        * hardware.
         */
        oprofile_min_interval = hwsampler_query_min_interval();
        if (oprofile_min_interval == 0)
@@ -155,23 +466,17 @@ static int oprofile_hwsampler_init(struct oprofile_operations *ops)
        if (oprofile_hw_interval > oprofile_max_interval)
                oprofile_hw_interval = oprofile_max_interval;
 
-       if (oprofile_timer_init(ops))
-               return -ENODEV;
-
-       printk(KERN_INFO "oprofile: using hardware sampling\n");
-
-       memcpy(&timer_ops, ops, sizeof(timer_ops));
+       printk(KERN_INFO "oprofile: System z hardware sampling "
+              "facility found.\n");
 
        ops->start = oprofile_hwsampler_start;
        ops->stop = oprofile_hwsampler_stop;
-       ops->create_files = oprofile_create_hwsampling_files;
 
        return 0;
 }
 
 static void oprofile_hwsampler_exit(void)
 {
-       oprofile_timer_exit();
        hwsampler_shutdown();
 }
 
@@ -182,7 +487,15 @@ int __init oprofile_arch_init(struct oprofile_operations *ops)
        ops->backtrace = s390_backtrace;
 
 #ifdef CONFIG_64BIT
-       return oprofile_hwsampler_init(ops);
+
+       /*
+        * -ENODEV is not reported to the caller.  The module itself
+        * will use the timer mode sampling as fallback and this is
+        * always available.
+        */
+       hwsampler_available = oprofile_hwsampler_init(ops) == 0;
+
+       return 0;
 #else
        return -ENODEV;
 #endif
diff --git a/arch/s390/oprofile/op_counter.h b/arch/s390/oprofile/op_counter.h
new file mode 100644 (file)
index 0000000..1a8d3ca
--- /dev/null
@@ -0,0 +1,23 @@
+/**
+ * arch/s390/oprofile/op_counter.h
+ *
+ *   Copyright (C) 2011 IBM Deutschland Entwicklung GmbH, IBM Corporation
+ *   Author(s): Andreas Krebbel (krebbel@linux.vnet.ibm.com)
+ *
+ * @remark Copyright 2011 OProfile authors
+ */
+
+#ifndef OP_COUNTER_H
+#define OP_COUNTER_H
+
+struct op_counter_config {
+       /* `enabled' maps to the hwsampler_enabled variable.  */
+       /* `count' maps to the oprofile_hw_interval variable.  */
+       /* `event' and `unit_mask' are unused. */
+       unsigned long kernel;
+       unsigned long user;
+};
+
+extern struct op_counter_config counter_config;
+
+#endif /* OP_COUNTER_H */
index df169e8..8b0c946 100644 (file)
@@ -4,6 +4,9 @@ config SCORE
        def_bool y
        select HAVE_GENERIC_HARDIRQS
        select GENERIC_IRQ_SHOW
+       select HAVE_MEMBLOCK
+       select HAVE_MEMBLOCK_NODE_MAP
+       select ARCH_DISCARD_MEMBLOCK
 
 choice
        prompt "System type"
@@ -60,9 +63,6 @@ config 32BIT
 config ARCH_FLATMEM_ENABLE
        def_bool y
 
-config ARCH_POPULATES_NODE_MAP
-       def_bool y
-
 source "mm/Kconfig"
 
 config MEMORY_START
index 6f898c0..b48459a 100644 (file)
@@ -26,6 +26,7 @@
 #include <linux/bootmem.h>
 #include <linux/initrd.h>
 #include <linux/ioport.h>
+#include <linux/memblock.h>
 #include <linux/mm.h>
 #include <linux/seq_file.h>
 #include <linux/screen_info.h>
@@ -54,7 +55,8 @@ static void __init bootmem_init(void)
        /* Initialize the boot-time allocator with low memory only. */
        bootmap_size = init_bootmem_node(NODE_DATA(0), start_pfn,
                                         min_low_pfn, max_low_pfn);
-       add_active_range(0, min_low_pfn, max_low_pfn);
+       memblock_add_node(PFN_PHYS(min_low_pfn),
+                         PFN_PHYS(max_low_pfn - min_low_pfn), 0);
 
        free_bootmem(PFN_PHYS(start_pfn),
                     (max_low_pfn - start_pfn) << PAGE_SHIFT);
index 5629e20..47a2f1c 100644 (file)
@@ -4,6 +4,7 @@ config SUPERH
        select CLKDEV_LOOKUP
        select HAVE_IDE if HAS_IOPORT
        select HAVE_MEMBLOCK
+       select HAVE_MEMBLOCK_NODE_MAP
        select HAVE_OPROFILE
        select HAVE_GENERIC_DMA_COHERENT
        select HAVE_ARCH_TRACEHOOK
diff --git a/arch/sh/include/asm/memblock.h b/arch/sh/include/asm/memblock.h
deleted file mode 100644 (file)
index e87063f..0000000
+++ /dev/null
@@ -1,4 +0,0 @@
-#ifndef __ASM_SH_MEMBLOCK_H
-#define __ASM_SH_MEMBLOCK_H
-
-#endif /* __ASM_SH_MEMBLOCK_H */
index db4ecd7..406508d 100644 (file)
@@ -89,7 +89,8 @@ void cpu_idle(void)
 
        /* endless idle loop with no priority at all */
        while (1) {
-               tick_nohz_stop_sched_tick(1);
+               tick_nohz_idle_enter();
+               rcu_idle_enter();
 
                while (!need_resched()) {
                        check_pgt_cache();
@@ -111,7 +112,8 @@ void cpu_idle(void)
                        start_critical_timings();
                }
 
-               tick_nohz_restart_sched_tick();
+               rcu_idle_exit();
+               tick_nohz_idle_exit();
                preempt_enable_no_resched();
                schedule();
                preempt_disable();
index c5a33f0..9fea49f 100644 (file)
@@ -157,9 +157,6 @@ void __init reserve_crashkernel(void)
        unsigned long long crash_size, crash_base;
        int ret;
 
-       /* this is necessary because of memblock_phys_mem_size() */
-       memblock_analyze();
-
        ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
                        &crash_size, &crash_base);
        if (ret == 0 && crash_size > 0) {
index 1a0e946..7b57bf1 100644 (file)
@@ -230,7 +230,8 @@ void __init __add_active_range(unsigned int nid, unsigned long start_pfn,
        pmb_bolt_mapping((unsigned long)__va(start), start, end - start,
                         PAGE_KERNEL);
 
-       add_active_range(nid, start_pfn, end_pfn);
+       memblock_set_node(PFN_PHYS(start_pfn),
+                         PFN_PHYS(end_pfn - start_pfn), nid);
 }
 
 void __init __weak plat_early_device_setup(void)
index c3e61b3..cb8f992 100644 (file)
@@ -143,9 +143,6 @@ config MAX_ACTIVE_REGIONS
                       CPU_SUBTYPE_SH7785)
        default "1"
 
-config ARCH_POPULATES_NODE_MAP
-       def_bool y
-
 config ARCH_SELECT_MEMORY_MODEL
        def_bool y
 
index 939ca0f..82cc576 100644 (file)
@@ -324,7 +324,6 @@ void __init paging_init(void)
        unsigned long vaddr, end;
        int nid;
 
-       memblock_init();
        sh_mv.mv_mem_init();
 
        early_reserve_mem();
@@ -337,7 +336,7 @@ void __init paging_init(void)
                sh_mv.mv_mem_reserve();
 
        memblock_enforce_memory_limit(memory_limit);
-       memblock_analyze();
+       memblock_allow_resize();
 
        memblock_dump_all();
 
index f92602e..70ae9d8 100644 (file)
@@ -43,6 +43,7 @@ config SPARC64
        select HAVE_KPROBES
        select HAVE_RCU_TABLE_FREE if SMP
        select HAVE_MEMBLOCK
+       select HAVE_MEMBLOCK_NODE_MAP
        select HAVE_SYSCALL_WRAPPERS
        select HAVE_DYNAMIC_FTRACE
        select HAVE_FTRACE_MCOUNT_RECORD
@@ -352,9 +353,6 @@ config NODES_SPAN_OTHER_NODES
        def_bool y
        depends on NEED_MULTIPLE_NODES
 
-config ARCH_POPULATES_NODE_MAP
-       def_bool y if SPARC64
-
 config ARCH_SELECT_MEMORY_MODEL
        def_bool y if SPARC64
 
diff --git a/arch/sparc/include/asm/memblock.h b/arch/sparc/include/asm/memblock.h
deleted file mode 100644 (file)
index c67b047..0000000
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef _SPARC64_MEMBLOCK_H
-#define _SPARC64_MEMBLOCK_H
-
-#include <asm/oplib.h>
-
-#define MEMBLOCK_DBG(fmt...) prom_printf(fmt)
-
-#endif /* !(_SPARC64_MEMBLOCK_H) */
index 3739a06..39d8b05 100644 (file)
@@ -95,12 +95,14 @@ void cpu_idle(void)
        set_thread_flag(TIF_POLLING_NRFLAG);
 
        while(1) {
-               tick_nohz_stop_sched_tick(1);
+               tick_nohz_idle_enter();
+               rcu_idle_enter();
 
                while (!need_resched() && !cpu_is_offline(cpu))
                        sparc64_yield(cpu);
 
-               tick_nohz_restart_sched_tick();
+               rcu_idle_exit();
+               tick_nohz_idle_exit();
 
                preempt_enable_no_resched();
 
index fe1e3fc..ffb883d 100644 (file)
@@ -84,7 +84,7 @@ static void prom_sync_me(void)
 
        prom_printf("PROM SYNC COMMAND...\n");
        show_free_areas(0);
-       if(current->pid != 0) {
+       if (!is_idle_task(current)) {
                local_irq_enable();
                sys_sync();
                local_irq_disable();
index 8e073d8..b3f5e7d 100644 (file)
@@ -790,7 +790,7 @@ static int find_node(unsigned long addr)
        return -1;
 }
 
-u64 memblock_nid_range(u64 start, u64 end, int *nid)
+static u64 memblock_nid_range(u64 start, u64 end, int *nid)
 {
        *nid = find_node(start);
        start += PAGE_SIZE;
@@ -808,7 +808,7 @@ u64 memblock_nid_range(u64 start, u64 end, int *nid)
        return start;
 }
 #else
-u64 memblock_nid_range(u64 start, u64 end, int *nid)
+static u64 memblock_nid_range(u64 start, u64 end, int *nid)
 {
        *nid = 0;
        return end;
@@ -816,7 +816,7 @@ u64 memblock_nid_range(u64 start, u64 end, int *nid)
 #endif
 
 /* This must be invoked after performing all of the necessary
- * add_active_range() calls for 'nid'.  We need to be able to get
+ * memblock_set_node() calls for 'nid'.  We need to be able to get
  * correct data from get_pfn_range_for_nid().
  */
 static void __init allocate_node_data(int nid)
@@ -987,14 +987,11 @@ static void __init add_node_ranges(void)
 
                        this_end = memblock_nid_range(start, end, &nid);
 
-                       numadbg("Adding active range nid[%d] "
+                       numadbg("Setting memblock NUMA node nid[%d] "
                                "start[%lx] end[%lx]\n",
                                nid, start, this_end);
 
-                       add_active_range(nid,
-                                        start >> PAGE_SHIFT,
-                                        this_end >> PAGE_SHIFT);
-
+                       memblock_set_node(start, this_end - start, nid);
                        start = this_end;
                }
        }
@@ -1282,7 +1279,6 @@ static void __init bootmem_init_nonnuma(void)
 {
        unsigned long top_of_ram = memblock_end_of_DRAM();
        unsigned long total_ram = memblock_phys_mem_size();
-       struct memblock_region *reg;
 
        numadbg("bootmem_init_nonnuma()\n");
 
@@ -1292,20 +1288,8 @@ static void __init bootmem_init_nonnuma(void)
               (top_of_ram - total_ram) >> 20);
 
        init_node_masks_nonnuma();
-
-       for_each_memblock(memory, reg) {
-               unsigned long start_pfn, end_pfn;
-
-               if (!reg->size)
-                       continue;
-
-               start_pfn = memblock_region_memory_base_pfn(reg);
-               end_pfn = memblock_region_memory_end_pfn(reg);
-               add_active_range(0, start_pfn, end_pfn);
-       }
-
+       memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0);
        allocate_node_data(0);
-
        node_set_online(0);
 }
 
@@ -1769,8 +1753,6 @@ void __init paging_init(void)
                sun4v_ktsb_init();
        }
 
-       memblock_init();
-
        /* Find available physical memory...
         *
         * Read it twice in order to work around a bug in openfirmware.
@@ -1796,7 +1778,7 @@ void __init paging_init(void)
 
        memblock_enforce_memory_limit(cmdline_memory_size);
 
-       memblock_analyze();
+       memblock_allow_resize();
        memblock_dump_all();
 
        set_bit(0, mmu_context_bmap);
index 9c45d8b..4c1ac6e 100644 (file)
@@ -85,7 +85,8 @@ void cpu_idle(void)
 
        /* endless idle loop with no priority at all */
        while (1) {
-               tick_nohz_stop_sched_tick(1);
+               tick_nohz_idle_enter();
+               rcu_idle_enter();
                while (!need_resched()) {
                        if (cpu_is_offline(cpu))
                                BUG();  /* no HOTPLUG_CPU */
@@ -105,7 +106,8 @@ void cpu_idle(void)
                                local_irq_enable();
                        current_thread_info()->status |= TS_POLLING;
                }
-               tick_nohz_restart_sched_tick();
+               rcu_idle_exit();
+               tick_nohz_idle_exit();
                preempt_enable_no_resched();
                schedule();
                preempt_disable();
index 25b7b90..c1eaaa1 100644 (file)
@@ -54,7 +54,7 @@ static noinline void force_sig_info_fault(const char *type, int si_signo,
        if (unlikely(tsk->pid < 2)) {
                panic("Signal %d (code %d) at %#lx sent to %s!",
                      si_signo, si_code & 0xffff, address,
-                     tsk->pid ? "init" : "the idle task");
+                     is_idle_task(tsk) ? "the idle task" : "init");
        }
 
        info.si_signo = si_signo;
@@ -515,7 +515,7 @@ no_context:
 
        if (unlikely(tsk->pid < 2)) {
                panic("Kernel page fault running %s!",
-                     tsk->pid ? "init" : "the idle task");
+                     is_idle_task(tsk) ? "the idle task" : "init");
        }
 
        /*
index c533835..69f2490 100644 (file)
@@ -246,10 +246,12 @@ void default_idle(void)
                if (need_resched())
                        schedule();
 
-               tick_nohz_stop_sched_tick(1);
+               tick_nohz_idle_enter();
+               rcu_idle_enter();
                nsecs = disable_timer();
                idle_sleep(nsecs);
-               tick_nohz_restart_sched_tick();
+               rcu_idle_exit();
+               tick_nohz_idle_exit();
        }
 }
 
index a08d9fa..82a6e22 100644 (file)
@@ -75,8 +75,6 @@ static struct clocksource itimer_clocksource = {
        .rating         = 300,
        .read           = itimer_read,
        .mask           = CLOCKSOURCE_MASK(64),
-       .mult           = 1000,
-       .shift          = 0,
        .flags          = CLOCK_SOURCE_IS_CONTINUOUS,
 };
 
@@ -94,9 +92,9 @@ static void __init setup_itimer(void)
                clockevent_delta2ns(60 * HZ, &itimer_clockevent);
        itimer_clockevent.min_delta_ns =
                clockevent_delta2ns(1, &itimer_clockevent);
-       err = clocksource_register(&itimer_clocksource);
+       err = clocksource_register_hz(&itimer_clocksource, USEC_PER_SEC);
        if (err) {
-               printk(KERN_ERR "clocksource_register returned %d\n", err);
+               printk(KERN_ERR "clocksource_register_hz returned %d\n", err);
                return;
        }
        clockevents_register_device(&itimer_clockevent);
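
Registering with clocksource_register_hz() at USEC_PER_SEC tells the core the counter ticks at 1 MHz (one tick per microsecond) and lets it derive the cycle-to-nanosecond scaling itself instead of hard-coding mult/shift. As a minimal user-space sketch of what that pair encodes, using the removed mult=1000/shift=0 values for illustration (cyc2ns() here is my own helper name, not a kernel interface):

#include <stdint.h>
#include <stdio.h>

/* clocksource conversion: ns = (cycles * mult) >> shift */
static uint64_t cyc2ns(uint64_t cycles, uint32_t mult, uint32_t shift)
{
        return (cycles * mult) >> shift;
}

int main(void)
{
        /* A 1 MHz counter ticks every 1000 ns, which is exactly what the
         * removed mult=1000, shift=0 pair expressed by hand. */
        printf("%llu ns\n", (unsigned long long)cyc2ns(250, 1000, 0)); /* 250000 */
        return 0;
}
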
index ba401df..52edc2b 100644 (file)
@@ -55,7 +55,8 @@ void cpu_idle(void)
 {
        /* endless idle loop with no priority at all */
        while (1) {
-               tick_nohz_stop_sched_tick(1);
+               tick_nohz_idle_enter();
+               rcu_idle_enter();
                while (!need_resched()) {
                        local_irq_disable();
                        stop_critical_timings();
@@ -63,7 +64,8 @@ void cpu_idle(void)
                        local_irq_enable();
                        start_critical_timings();
                }
-               tick_nohz_restart_sched_tick();
+               rcu_idle_exit();
+               tick_nohz_idle_exit();
                preempt_enable_no_resched();
                schedule();
                preempt_disable();
index 471b6bc..673d7a8 100644 (file)
@@ -37,6 +37,7 @@
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 #include <asm/traps.h>
+#include <asm/memblock.h>
 
 #include "setup.h"
 
index 3b379cd..de186bd 100644 (file)
@@ -26,6 +26,7 @@
 #include <asm/setup.h>
 #include <asm/sizes.h>
 #include <asm/tlb.h>
+#include <asm/memblock.h>
 #include <mach/map.h>
 
 #include "mm.h"
@@ -245,7 +246,6 @@ void __init uc32_memblock_init(struct meminfo *mi)
        sort(&meminfo.bank, meminfo.nr_banks, sizeof(meminfo.bank[0]),
                meminfo_cmp, NULL);
 
-       memblock_init();
        for (i = 0; i < mi->nr_banks; i++)
                memblock_add(mi->bank[i].start, mi->bank[i].size);
 
@@ -264,7 +264,7 @@ void __init uc32_memblock_init(struct meminfo *mi)
 
        uc32_mm_memblock_reserve();
 
-       memblock_analyze();
+       memblock_allow_resize();
        memblock_dump_all();
 }
 
index 3e5c3e5..43c20b4 100644 (file)
@@ -25,6 +25,7 @@
 #include <asm/setup.h>
 #include <asm/sizes.h>
 #include <asm/tlb.h>
+#include <asm/memblock.h>
 
 #include <mach/map.h>
 
index efb4294..5731eb7 100644 (file)
@@ -26,6 +26,8 @@ config X86
        select HAVE_IOREMAP_PROT
        select HAVE_KPROBES
        select HAVE_MEMBLOCK
+       select HAVE_MEMBLOCK_NODE_MAP
+       select ARCH_DISCARD_MEMBLOCK
        select ARCH_WANT_OPTIONAL_GPIOLIB
        select ARCH_WANT_FRAME_POINTERS
        select HAVE_DMA_ATTRS
@@ -204,9 +206,6 @@ config ZONE_DMA32
        bool
        default X86_64
 
-config ARCH_POPULATES_NODE_MAP
-       def_bool y
-
 config AUDIT_ARCH
        bool
        default X86_64
@@ -343,6 +342,7 @@ config X86_EXTENDED_PLATFORM
 
          If you enable this option then you'll be able to select support
          for the following (non-PC) 64 bit x86 platforms:
+               Numascale NumaChip
                ScaleMP vSMP
                SGI Ultraviolet
 
@@ -351,6 +351,18 @@ config X86_EXTENDED_PLATFORM
 endif
 # This is an alphabetically sorted list of 64 bit extended platforms
 # Please maintain the alphabetic order if and when there are additions
+config X86_NUMACHIP
+       bool "Numascale NumaChip"
+       depends on X86_64
+       depends on X86_EXTENDED_PLATFORM
+       depends on NUMA
+       depends on SMP
+       depends on X86_X2APIC
+       depends on !EDAC_AMD64
+       ---help---
+         Adds support for Numascale NumaChip large-SMP systems. Needed to
+         enable more than ~168 cores.
+         If you don't have one of these, you should say N here.
 
 config X86_VSMP
        bool "ScaleMP vSMP"
index a6253ec..3e27456 100644 (file)
@@ -134,7 +134,7 @@ ENTRY(ia32_sysenter_target)
        CFI_REL_OFFSET rsp,0
        pushfq_cfi
        /*CFI_REL_OFFSET rflags,0*/
-       movl    8*3-THREAD_SIZE+TI_sysenter_return(%rsp), %r10d
+       movl    TI_sysenter_return+THREAD_INFO(%rsp,3*8-KERNEL_STACK_OFFSET),%r10d
        CFI_REGISTER rip,r10
        pushq_cfi $__USER32_CS
        /*CFI_REL_OFFSET cs,0*/
@@ -150,9 +150,8 @@ ENTRY(ia32_sysenter_target)
        .section __ex_table,"a"
        .quad 1b,ia32_badarg
        .previous       
-       GET_THREAD_INFO(%r10)
-       orl    $TS_COMPAT,TI_status(%r10)
-       testl  $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
+       orl     $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+       testl   $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
        CFI_REMEMBER_STATE
        jnz  sysenter_tracesys
        cmpq    $(IA32_NR_syscalls-1),%rax
@@ -162,13 +161,12 @@ sysenter_do_call:
 sysenter_dispatch:
        call    *ia32_sys_call_table(,%rax,8)
        movq    %rax,RAX-ARGOFFSET(%rsp)
-       GET_THREAD_INFO(%r10)
        DISABLE_INTERRUPTS(CLBR_NONE)
        TRACE_IRQS_OFF
-       testl   $_TIF_ALLWORK_MASK,TI_flags(%r10)
+       testl   $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
        jnz     sysexit_audit
 sysexit_from_sys_call:
-       andl    $~TS_COMPAT,TI_status(%r10)
+       andl    $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
        /* clear IF, that popfq doesn't enable interrupts early */
        andl  $~0x200,EFLAGS-R11(%rsp) 
        movl    RIP-R11(%rsp),%edx              /* User %eip */
@@ -205,7 +203,7 @@ sysexit_from_sys_call:
        .endm
 
        .macro auditsys_exit exit
-       testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
+       testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
        jnz ia32_ret_from_sys_call
        TRACE_IRQS_ON
        sti
@@ -215,12 +213,11 @@ sysexit_from_sys_call:
        movzbl %al,%edi         /* zero-extend that into %edi */
        inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
        call audit_syscall_exit
-       GET_THREAD_INFO(%r10)
        movl RAX-ARGOFFSET(%rsp),%eax   /* reload syscall return value */
        movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
        cli
        TRACE_IRQS_OFF
-       testl %edi,TI_flags(%r10)
+       testl %edi,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
        jz \exit
        CLEAR_RREGS -ARGOFFSET
        jmp int_with_check
@@ -238,7 +235,7 @@ sysexit_audit:
 
 sysenter_tracesys:
 #ifdef CONFIG_AUDITSYSCALL
-       testl   $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
+       testl   $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
        jz      sysenter_auditsys
 #endif
        SAVE_REST
@@ -309,9 +306,8 @@ ENTRY(ia32_cstar_target)
        .section __ex_table,"a"
        .quad 1b,ia32_badarg
        .previous       
-       GET_THREAD_INFO(%r10)
-       orl   $TS_COMPAT,TI_status(%r10)
-       testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
+       orl     $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+       testl   $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
        CFI_REMEMBER_STATE
        jnz   cstar_tracesys
        cmpq $IA32_NR_syscalls-1,%rax
@@ -321,13 +317,12 @@ cstar_do_call:
 cstar_dispatch:
        call *ia32_sys_call_table(,%rax,8)
        movq %rax,RAX-ARGOFFSET(%rsp)
-       GET_THREAD_INFO(%r10)
        DISABLE_INTERRUPTS(CLBR_NONE)
        TRACE_IRQS_OFF
-       testl $_TIF_ALLWORK_MASK,TI_flags(%r10)
+       testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
        jnz sysretl_audit
 sysretl_from_sys_call:
-       andl $~TS_COMPAT,TI_status(%r10)
+       andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
        RESTORE_ARGS 0,-ARG_SKIP,0,0,0
        movl RIP-ARGOFFSET(%rsp),%ecx
        CFI_REGISTER rip,rcx
@@ -355,7 +350,7 @@ sysretl_audit:
 
 cstar_tracesys:
 #ifdef CONFIG_AUDITSYSCALL
-       testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
+       testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
        jz cstar_auditsys
 #endif
        xchgl %r9d,%ebp
@@ -420,9 +415,8 @@ ENTRY(ia32_syscall)
        /* note the registers are not zero extended to the sf.
           this could be a problem. */
        SAVE_ARGS 0,1,0
-       GET_THREAD_INFO(%r10)
-       orl   $TS_COMPAT,TI_status(%r10)
-       testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
+       orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+       testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
        jnz ia32_tracesys
        cmpq $(IA32_NR_syscalls-1),%rax
        ja ia32_badsys
@@ -459,8 +453,8 @@ quiet_ni_syscall:
        CFI_ENDPROC
        
        .macro PTREGSCALL label, func, arg
-       .globl \label
-\label:
+       ALIGN
+GLOBAL(\label)
        leaq \func(%rip),%rax
        leaq -ARGOFFSET+8(%rsp),\arg    /* 8 for return address */
        jmp  ia32_ptregs_common 
@@ -477,7 +471,8 @@ quiet_ni_syscall:
        PTREGSCALL stub32_vfork, sys_vfork, %rdi
        PTREGSCALL stub32_iopl, sys_iopl, %rsi
 
-ENTRY(ia32_ptregs_common)
+       ALIGN
+ia32_ptregs_common:
        popq %r11
        CFI_ENDPROC
        CFI_STARTPROC32 simple
index 091508b..952bd01 100644 (file)
@@ -4,10 +4,10 @@
 
 #ifdef CONFIG_SMP
        .macro LOCK_PREFIX
-1:     lock
+672:   lock
        .section .smp_locks,"a"
        .balign 4
-       .long 1b - .
+       .long 672b - .
        .previous
        .endm
 #else
index 1a6c09a..3ab9bdd 100644 (file)
@@ -176,6 +176,7 @@ static inline u64 native_x2apic_icr_read(void)
 }
 
 extern int x2apic_phys;
+extern int x2apic_preenabled;
 extern void check_x2apic(void);
 extern void enable_x2apic(void);
 extern void x2apic_icr_write(u32 low, u32 id);
@@ -198,6 +199,9 @@ static inline void x2apic_force_phys(void)
        x2apic_phys = 1;
 }
 #else
+static inline void disable_x2apic(void)
+{
+}
 static inline void check_x2apic(void)
 {
 }
@@ -212,6 +216,7 @@ static inline void x2apic_force_phys(void)
 {
 }
 
+#define        nox2apic        0
 #define        x2apic_preenabled 0
 #define        x2apic_supported()      0
 #endif
@@ -410,6 +415,7 @@ extern int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip);
 #endif
 
 #ifdef CONFIG_X86_LOCAL_APIC
+
 static inline u32 apic_read(u32 reg)
 {
        return apic->read(reg);
diff --git a/arch/x86/include/asm/apic_flat_64.h b/arch/x86/include/asm/apic_flat_64.h
new file mode 100644 (file)
index 0000000..a2d3127
--- /dev/null
@@ -0,0 +1,7 @@
+#ifndef _ASM_X86_APIC_FLAT_64_H
+#define _ASM_X86_APIC_FLAT_64_H
+
+extern void flat_init_apic_ldr(void);
+
+#endif
+
index 3925d80..134bba0 100644 (file)
 
 #define APIC_BASE (fix_to_virt(FIX_APIC_BASE))
 #define APIC_BASE_MSR  0x800
+#define XAPIC_ENABLE   (1UL << 11)
 #define X2APIC_ENABLE  (1UL << 10)
 
 #ifdef CONFIG_X86_32
index 1775d6e..b97596e 100644 (file)
@@ -380,6 +380,8 @@ static inline unsigned long __fls(unsigned long word)
        return word;
 }
 
+#undef ADDR
+
 #ifdef __KERNEL__
 /**
  * ffs - find first set bit in word
@@ -395,10 +397,25 @@ static inline unsigned long __fls(unsigned long word)
 static inline int ffs(int x)
 {
        int r;
-#ifdef CONFIG_X86_CMOV
+
+#ifdef CONFIG_X86_64
+       /*
+        * AMD64 says BSFL won't clobber the dest reg if x==0; Intel64 says the
+        * dest reg is undefined if x==0, but their CPU architect says its
+        * value is written to set it to the same as before, except that the
+        * top 32 bits will be cleared.
+        *
+        * We cannot do this on 32 bits because at the very least some
+        * 486 CPUs did not behave this way.
+        */
+       long tmp = -1;
+       asm("bsfl %1,%0"
+           : "=r" (r)
+           : "rm" (x), "0" (tmp));
+#elif defined(CONFIG_X86_CMOV)
        asm("bsfl %1,%0\n\t"
            "cmovzl %2,%0"
-           : "=r" (r) : "rm" (x), "r" (-1));
+           : "=&r" (r) : "rm" (x), "r" (-1));
 #else
        asm("bsfl %1,%0\n\t"
            "jnz 1f\n\t"
@@ -422,7 +439,22 @@ static inline int ffs(int x)
 static inline int fls(int x)
 {
        int r;
-#ifdef CONFIG_X86_CMOV
+
+#ifdef CONFIG_X86_64
+       /*
+        * AMD64 says BSRL won't clobber the dest reg if x==0; Intel64 says the
+        * dest reg is undefined if x==0, but their CPU architect says its
+        * value is written to set it to the same as before, except that the
+        * top 32 bits will be cleared.
+        *
+        * We cannot do this on 32 bits because at the very least some
+        * 486 CPUs did not behave this way.
+        */
+       long tmp = -1;
+       asm("bsrl %1,%0"
+           : "=r" (r)
+           : "rm" (x), "0" (tmp));
+#elif defined(CONFIG_X86_CMOV)
        asm("bsrl %1,%0\n\t"
            "cmovzl %2,%0"
            : "=&r" (r) : "rm" (x), "rm" (-1));
@@ -434,11 +466,35 @@ static inline int fls(int x)
 #endif
        return r + 1;
 }
-#endif /* __KERNEL__ */
-
-#undef ADDR
 
-#ifdef __KERNEL__
+/**
+ * fls64 - find last set bit in a 64-bit word
+ * @x: the word to search
+ *
+ * This is defined in a similar way as the libc and compiler builtin
+ * ffsll, but returns the position of the most significant set bit.
+ *
+ * fls64(value) returns 0 if value is 0 or the position of the last
+ * set bit if value is nonzero. The last (most significant) bit is
+ * at position 64.
+ */
+#ifdef CONFIG_X86_64
+static __always_inline int fls64(__u64 x)
+{
+       long bitpos = -1;
+       /*
+        * AMD64 says BSRQ won't clobber the dest reg if x==0; Intel64 says the
+        * dest reg is undefined if x==0, but their CPU architect says its
+        * value is written to set it to the same as before.
+        */
+       asm("bsrq %1,%0"
+           : "+r" (bitpos)
+           : "rm" (x));
+       return bitpos + 1;
+}
+#else
+#include <asm-generic/bitops/fls64.h>
+#endif
 
 #include <asm-generic/bitops/find.h>
 
@@ -450,12 +506,6 @@ static inline int fls(int x)
 
 #include <asm-generic/bitops/const_hweight.h>
 
-#endif /* __KERNEL__ */
-
-#include <asm-generic/bitops/fls64.h>
-
-#ifdef __KERNEL__
-
 #include <asm-generic/bitops/le.h>
 
 #include <asm-generic/bitops/ext2-atomic-setbit.h>
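
The new 64-bit ffs()/fls()/fls64() variants rely on BSF/BSR leaving the destination register intact for a zero input (guaranteed by AMD64, and de facto on Intel per the comment above), so the -1 "not found" value can be preloaded rather than fixed up with CMOV. Their documented return convention (ffs(0) == fls(0) == fls64(0) == 0, bit positions counted from 1) can be checked against portable equivalents; a small user-space sketch using GCC builtins rather than the kernel's asm, with my_* as illustrative helper names:

#include <assert.h>
#include <stdint.h>

/* Portable equivalents of the kernel's ffs()/fls()/fls64() return rules. */
static int my_ffs(int x)        { return x ? __builtin_ctz((unsigned)x) + 1 : 0; }
static int my_fls(int x)        { return x ? 32 - __builtin_clz((unsigned)x) : 0; }
static int my_fls64(uint64_t x) { return x ? 64 - __builtin_clzll(x) : 0; }

int main(void)
{
        assert(my_ffs(0) == 0 && my_fls(0) == 0 && my_fls64(0) == 0);
        assert(my_ffs(0x10) == 5);          /* lowest set bit is bit 4 -> position 5 */
        assert(my_fls(0x10) == 5);          /* highest set bit is bit 4 -> position 5 */
        assert(my_fls64(1ULL << 63) == 64); /* most significant bit -> position 64 */
        return 0;
}
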
index 5d3acdf..0c9fa27 100644 (file)
@@ -14,6 +14,8 @@ extern void __cmpxchg_wrong_size(void)
        __compiletime_error("Bad argument size for cmpxchg");
 extern void __xadd_wrong_size(void)
        __compiletime_error("Bad argument size for xadd");
+extern void __add_wrong_size(void)
+       __compiletime_error("Bad argument size for add");
 
 /*
  * Constants for operation sizes. On 32-bit, the 64-bit size it set to
@@ -31,60 +33,47 @@ extern void __xadd_wrong_size(void)
 #define        __X86_CASE_Q    -1              /* sizeof will never return -1 */
 #endif
 
+/* 
+ * An exchange-type operation, which takes a value and a pointer, and
+ * returns the old value.
+ */
+#define __xchg_op(ptr, arg, op, lock)                                  \
+       ({                                                              \
+               __typeof__ (*(ptr)) __ret = (arg);                      \
+               switch (sizeof(*(ptr))) {                               \
+               case __X86_CASE_B:                                      \
+                       asm volatile (lock #op "b %b0, %1\n"            \
+                                     : "+r" (__ret), "+m" (*(ptr))     \
+                                     : : "memory", "cc");              \
+                       break;                                          \
+               case __X86_CASE_W:                                      \
+                       asm volatile (lock #op "w %w0, %1\n"            \
+                                     : "+r" (__ret), "+m" (*(ptr))     \
+                                     : : "memory", "cc");              \
+                       break;                                          \
+               case __X86_CASE_L:                                      \
+                       asm volatile (lock #op "l %0, %1\n"             \
+                                     : "+r" (__ret), "+m" (*(ptr))     \
+                                     : : "memory", "cc");              \
+                       break;                                          \
+               case __X86_CASE_Q:                                      \
+                       asm volatile (lock #op "q %q0, %1\n"            \
+                                     : "+r" (__ret), "+m" (*(ptr))     \
+                                     : : "memory", "cc");              \
+                       break;                                          \
+               default:                                                \
+                       __ ## op ## _wrong_size();                      \
+               }                                                       \
+               __ret;                                                  \
+       })
+
 /*
  * Note: no "lock" prefix even on SMP: xchg always implies lock anyway.
  * Since this is generally used to protect other memory information, we
  * use "asm volatile" and "memory" clobbers to prevent gcc from moving
  * information around.
  */
-#define __xchg(x, ptr, size)                                           \
-({                                                                     \
-       __typeof(*(ptr)) __x = (x);                                     \
-       switch (size) {                                                 \
-       case __X86_CASE_B:                                              \
-       {                                                               \
-               volatile u8 *__ptr = (volatile u8 *)(ptr);              \
-               asm volatile("xchgb %0,%1"                              \
-                            : "=q" (__x), "+m" (*__ptr)                \
-                            : "0" (__x)                                \
-                            : "memory");                               \
-               break;                                                  \
-       }                                                               \
-       case __X86_CASE_W:                                              \
-       {                                                               \
-               volatile u16 *__ptr = (volatile u16 *)(ptr);            \
-               asm volatile("xchgw %0,%1"                              \
-                            : "=r" (__x), "+m" (*__ptr)                \
-                            : "0" (__x)                                \
-                            : "memory");                               \
-               break;                                                  \
-       }                                                               \
-       case __X86_CASE_L:                                              \
-       {                                                               \
-               volatile u32 *__ptr = (volatile u32 *)(ptr);            \
-               asm volatile("xchgl %0,%1"                              \
-                            : "=r" (__x), "+m" (*__ptr)                \
-                            : "0" (__x)                                \
-                            : "memory");                               \
-               break;                                                  \
-       }                                                               \
-       case __X86_CASE_Q:                                              \
-       {                                                               \
-               volatile u64 *__ptr = (volatile u64 *)(ptr);            \
-               asm volatile("xchgq %0,%1"                              \
-                            : "=r" (__x), "+m" (*__ptr)                \
-                            : "0" (__x)                                \
-                            : "memory");                               \
-               break;                                                  \
-       }                                                               \
-       default:                                                        \
-               __xchg_wrong_size();                                    \
-       }                                                               \
-       __x;                                                            \
-})
-
-#define xchg(ptr, v)                                                   \
-       __xchg((v), (ptr), sizeof(*ptr))
+#define xchg(ptr, v)   __xchg_op((ptr), (v), xchg, "")
 
 /*
  * Atomic compare and exchange.  Compare OLD with MEM, if identical,
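
__xchg_op() folds the old per-size xchg cases into a single sizeof()-dispatched macro, and the following hunk reuses it for xadd(), so both share the "hand in a value, get the previous one back" contract described above. A rough user-space analog of that contract, with GCC __atomic builtins standing in for the kernel macros (illustrative only):

#include <assert.h>

int main(void)
{
        long v = 5;

        /* xchg(): store a new value, atomically receive the old one. */
        long old = __atomic_exchange_n(&v, 9, __ATOMIC_SEQ_CST);
        assert(old == 5 && v == 9);

        /* xadd(): add to the value, atomically receive the pre-add value. */
        old = __atomic_fetch_add(&v, 3, __ATOMIC_SEQ_CST);
        assert(old == 9 && v == 12);

        return 0;
}
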
@@ -165,46 +154,80 @@ extern void __xadd_wrong_size(void)
        __cmpxchg_local((ptr), (old), (new), sizeof(*ptr))
 #endif
 
-#define __xadd(ptr, inc, lock)                                         \
+/*
+ * xadd() adds "inc" to "*ptr" and atomically returns the previous
+ * value of "*ptr".
+ *
+ * xadd() is locked when multiple CPUs are online
+ * xadd_sync() is always locked
+ * xadd_local() is never locked
+ */
+#define __xadd(ptr, inc, lock) __xchg_op((ptr), (inc), xadd, lock)
+#define xadd(ptr, inc)         __xadd((ptr), (inc), LOCK_PREFIX)
+#define xadd_sync(ptr, inc)    __xadd((ptr), (inc), "lock; ")
+#define xadd_local(ptr, inc)   __xadd((ptr), (inc), "")
+
+#define __add(ptr, inc, lock)                                          \
        ({                                                              \
                __typeof__ (*(ptr)) __ret = (inc);                      \
                switch (sizeof(*(ptr))) {                               \
                case __X86_CASE_B:                                      \
-                       asm volatile (lock "xaddb %b0, %1\n"            \
-                                     : "+r" (__ret), "+m" (*(ptr))     \
-                                     : : "memory", "cc");              \
+                       asm volatile (lock "addb %b1, %0\n"             \
+                                     : "+m" (*(ptr)) : "ri" (inc)      \
+                                     : "memory", "cc");                \
                        break;                                          \
                case __X86_CASE_W:                                      \
-                       asm volatile (lock "xaddw %w0, %1\n"            \
-                                     : "+r" (__ret), "+m" (*(ptr))     \
-                                     : : "memory", "cc");              \
+                       asm volatile (lock "addw %w1, %0\n"             \
+                                     : "+m" (*(ptr)) : "ri" (inc)      \
+                                     : "memory", "cc");                \
                        break;                                          \
                case __X86_CASE_L:                                      \
-                       asm volatile (lock "xaddl %0, %1\n"             \
-                                     : "+r" (__ret), "+m" (*(ptr))     \
-                                     : : "memory", "cc");              \
+                       asm volatile (lock "addl %1, %0\n"              \
+                                     : "+m" (*(ptr)) : "ri" (inc)      \
+                                     : "memory", "cc");                \
                        break;                                          \
                case __X86_CASE_Q:                                      \
-                       asm volatile (lock "xaddq %q0, %1\n"            \
-                                     : "+r" (__ret), "+m" (*(ptr))     \
-                                     : : "memory", "cc");              \
+                       asm volatile (lock "addq %1, %0\n"              \
+                                     : "+m" (*(ptr)) : "ri" (inc)      \
+                                     : "memory", "cc");                \
                        break;                                          \
                default:                                                \
-                       __xadd_wrong_size();                            \
+                       __add_wrong_size();                             \
                }                                                       \
                __ret;                                                  \
        })
 
 /*
- * xadd() adds "inc" to "*ptr" and atomically returns the previous
- * value of "*ptr".
+ * add_*() adds "inc" to "*ptr"
  *
- * xadd() is locked when multiple CPUs are online
- * xadd_sync() is always locked
- * xadd_local() is never locked
+ * __add() takes a lock prefix
+ * add_smp() is locked when multiple CPUs are online
+ * add_sync() is always locked
  */
-#define xadd(ptr, inc)         __xadd((ptr), (inc), LOCK_PREFIX)
-#define xadd_sync(ptr, inc)    __xadd((ptr), (inc), "lock; ")
-#define xadd_local(ptr, inc)   __xadd((ptr), (inc), "")
+#define add_smp(ptr, inc)      __add((ptr), (inc), LOCK_PREFIX)
+#define add_sync(ptr, inc)     __add((ptr), (inc), "lock; ")
+
+#define __cmpxchg_double(pfx, p1, p2, o1, o2, n1, n2)                  \
+({                                                                     \
+       bool __ret;                                                     \
+       __typeof__(*(p1)) __old1 = (o1), __new1 = (n1);                 \
+       __typeof__(*(p2)) __old2 = (o2), __new2 = (n2);                 \
+       BUILD_BUG_ON(sizeof(*(p1)) != sizeof(long));                    \
+       BUILD_BUG_ON(sizeof(*(p2)) != sizeof(long));                    \
+       VM_BUG_ON((unsigned long)(p1) % (2 * sizeof(long)));            \
+       VM_BUG_ON((unsigned long)((p1) + 1) != (unsigned long)(p2));    \
+       asm volatile(pfx "cmpxchg%c4b %2; sete %0"                      \
+                    : "=a" (__ret), "+d" (__old2),                     \
+                      "+m" (*(p1)), "+m" (*(p2))                       \
+                    : "i" (2 * sizeof(long)), "a" (__old1),            \
+                      "b" (__new1), "c" (__new2));                     \
+       __ret;                                                          \
+})
+
+#define cmpxchg_double(p1, p2, o1, o2, n1, n2) \
+       __cmpxchg_double(LOCK_PREFIX, p1, p2, o1, o2, n1, n2)
+
+#define cmpxchg_double_local(p1, p2, o1, o2, n1, n2) \
+       __cmpxchg_double(, p1, p2, o1, o2, n1, n2)
 
 #endif /* ASM_X86_CMPXCHG_H */
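
The new __cmpxchg_double() compares and swaps two adjacent machine words as one unit (CMPXCHG8B/CMPXCHG16B), with the BUILD_BUG_ON/VM_BUG_ON lines enforcing the word-size and 2*sizeof(long) alignment requirements. A user-space sketch of the same semantics on x86-64, treating the pair as a single 128-bit value; this is illustrative only (the type-pun is for the example, and depending on the compiler it may need -mcx16 or -latomic):

#include <assert.h>

int main(void)
{
        /* Two adjacent longs, 16-byte aligned, viewed as one 128-bit unit. */
        _Alignas(16) unsigned long pair[2] = { 1, 2 };
        unsigned __int128 expected = ((unsigned __int128)2 << 64) | 1;   /* {1, 2} */
        unsigned __int128 desired  = ((unsigned __int128)20 << 64) | 10; /* {10, 20} */

        /* Succeeds only if *both* words still hold their expected values. */
        int ok = __atomic_compare_exchange_n((unsigned __int128 *)pair,
                                             &expected, desired, 0,
                                             __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
        assert(ok && pair[0] == 10 && pair[1] == 20);
        return 0;
}
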
index fbebb07..53f4b21 100644 (file)
@@ -166,52 +166,6 @@ static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old,
 
 #endif
 
-#define cmpxchg8b(ptr, o1, o2, n1, n2)                         \
-({                                                             \
-       char __ret;                                             \
-       __typeof__(o2) __dummy;                                 \
-       __typeof__(*(ptr)) __old1 = (o1);                       \
-       __typeof__(o2) __old2 = (o2);                           \
-       __typeof__(*(ptr)) __new1 = (n1);                       \
-       __typeof__(o2) __new2 = (n2);                           \
-       asm volatile(LOCK_PREFIX "cmpxchg8b %2; setz %1"        \
-                      : "=d"(__dummy), "=a" (__ret), "+m" (*ptr)\
-                      : "a" (__old1), "d"(__old2),             \
-                        "b" (__new1), "c" (__new2)             \
-                      : "memory");                             \
-       __ret; })
-
-
-#define cmpxchg8b_local(ptr, o1, o2, n1, n2)                   \
-({                                                             \
-       char __ret;                                             \
-       __typeof__(o2) __dummy;                                 \
-       __typeof__(*(ptr)) __old1 = (o1);                       \
-       __typeof__(o2) __old2 = (o2);                           \
-       __typeof__(*(ptr)) __new1 = (n1);                       \
-       __typeof__(o2) __new2 = (n2);                           \
-       asm volatile("cmpxchg8b %2; setz %1"                    \
-                      : "=d"(__dummy), "=a"(__ret), "+m" (*ptr)\
-                      : "a" (__old), "d"(__old2),              \
-                        "b" (__new1), "c" (__new2),            \
-                      : "memory");                             \
-       __ret; })
-
-
-#define cmpxchg_double(ptr, o1, o2, n1, n2)                            \
-({                                                                     \
-       BUILD_BUG_ON(sizeof(*(ptr)) != 4);                              \
-       VM_BUG_ON((unsigned long)(ptr) % 8);                            \
-       cmpxchg8b((ptr), (o1), (o2), (n1), (n2));                       \
-})
-
-#define cmpxchg_double_local(ptr, o1, o2, n1, n2)                      \
-({                                                                     \
-       BUILD_BUG_ON(sizeof(*(ptr)) != 4);                              \
-       VM_BUG_ON((unsigned long)(ptr) % 8);                            \
-       cmpxchg16b_local((ptr), (o1), (o2), (n1), (n2));                        \
-})
-
 #define system_has_cmpxchg_double() cpu_has_cx8
 
 #endif /* _ASM_X86_CMPXCHG_32_H */
index 285da02..614be87 100644 (file)
@@ -20,49 +20,6 @@ static inline void set_64bit(volatile u64 *ptr, u64 val)
        cmpxchg_local((ptr), (o), (n));                                 \
 })
 
-#define cmpxchg16b(ptr, o1, o2, n1, n2)                                \
-({                                                             \
-       char __ret;                                             \
-       __typeof__(o2) __junk;                                  \
-       __typeof__(*(ptr)) __old1 = (o1);                       \
-       __typeof__(o2) __old2 = (o2);                           \
-       __typeof__(*(ptr)) __new1 = (n1);                       \
-       __typeof__(o2) __new2 = (n2);                           \
-       asm volatile(LOCK_PREFIX "cmpxchg16b %2;setz %1"        \
-                      : "=d"(__junk), "=a"(__ret), "+m" (*ptr) \
-                      : "b"(__new1), "c"(__new2),              \
-                        "a"(__old1), "d"(__old2));             \
-       __ret; })
-
-
-#define cmpxchg16b_local(ptr, o1, o2, n1, n2)                  \
-({                                                             \
-       char __ret;                                             \
-       __typeof__(o2) __junk;                                  \
-       __typeof__(*(ptr)) __old1 = (o1);                       \
-       __typeof__(o2) __old2 = (o2);                           \
-       __typeof__(*(ptr)) __new1 = (n1);                       \
-       __typeof__(o2) __new2 = (n2);                           \
-       asm volatile("cmpxchg16b %2;setz %1"                    \
-                      : "=d"(__junk), "=a"(__ret), "+m" (*ptr) \
-                      : "b"(__new1), "c"(__new2),              \
-                        "a"(__old1), "d"(__old2));             \
-       __ret; })
-
-#define cmpxchg_double(ptr, o1, o2, n1, n2)                            \
-({                                                                     \
-       BUILD_BUG_ON(sizeof(*(ptr)) != 8);                              \
-       VM_BUG_ON((unsigned long)(ptr) % 16);                           \
-       cmpxchg16b((ptr), (o1), (o2), (n1), (n2));                      \
-})
-
-#define cmpxchg_double_local(ptr, o1, o2, n1, n2)                      \
-({                                                                     \
-       BUILD_BUG_ON(sizeof(*(ptr)) != 8);                              \
-       VM_BUG_ON((unsigned long)(ptr) % 16);                           \
-       cmpxchg16b_local((ptr), (o1), (o2), (n1), (n2));                \
-})
-
 #define system_has_cmpxchg_double() cpu_has_cx16
 
 #endif /* _ASM_X86_CMPXCHG_64_H */
index 9a2d644..ced283a 100644 (file)
@@ -4,6 +4,7 @@
 #ifdef CONFIG_X86_32
 
 #include <linux/types.h>
+#include <linux/log2.h>
 
 /*
  * do_div() is NOT a C function. It wants to return
 ({                                                             \
        unsigned long __upper, __low, __high, __mod, __base;    \
        __base = (base);                                        \
-       asm("":"=a" (__low), "=d" (__high) : "A" (n));          \
-       __upper = __high;                                       \
-       if (__high) {                                           \
-               __upper = __high % (__base);                    \
-               __high = __high / (__base);                     \
+       if (__builtin_constant_p(__base) && is_power_of_2(__base)) { \
+               __mod = n & (__base - 1);                       \
+               n >>= ilog2(__base);                            \
+       } else {                                                \
+               asm("" : "=a" (__low), "=d" (__high) : "A" (n));\
+               __upper = __high;                               \
+               if (__high) {                                   \
+                       __upper = __high % (__base);            \
+                       __high = __high / (__base);             \
+               }                                               \
+               asm("divl %2" : "=a" (__low), "=d" (__mod)      \
+                       : "rm" (__base), "0" (__low), "1" (__upper));   \
+               asm("" : "=A" (n) : "a" (__low), "d" (__high)); \
        }                                                       \
-       asm("divl %2":"=a" (__low), "=d" (__mod)                \
-           : "rm" (__base), "0" (__low), "1" (__upper));       \
-       asm("":"=A" (n) : "a" (__low), "d" (__high));           \
        __mod;                                                  \
 })
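
The added branch skips the divl-based long division entirely when the divisor is a compile-time power of two: the remainder is a mask and the quotient is a shift. A quick user-space check of that identity (helper values chosen arbitrarily for the example):

#include <assert.h>
#include <stdint.h>

int main(void)
{
        uint64_t n = 1000003;
        uint32_t base = 8;                  /* power of two */

        uint32_t mod_fast = n & (base - 1); /* n % 8 without dividing */
        uint64_t div_fast = n >> 3;         /* n / 8 == n >> ilog2(8) */

        assert(mod_fast == n % base);
        assert(div_fast == n / base);
        return 0;
}
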
 
index 908b969..3778256 100644 (file)
@@ -117,7 +117,7 @@ static inline void early_memtest(unsigned long start, unsigned long end)
 
 extern unsigned long e820_end_of_ram_pfn(void);
 extern unsigned long e820_end_of_low_ram_pfn(void);
-extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align);
+extern u64 early_reserve_e820(u64 sizet, u64 align);
 
 void memblock_x86_fill(void);
 void memblock_find_dma_reserve(void);
index 55e4de6..da0b3ca 100644 (file)
@@ -11,6 +11,7 @@ typedef struct {
 #ifdef CONFIG_X86_LOCAL_APIC
        unsigned int apic_timer_irqs;   /* arch dependent */
        unsigned int irq_spurious_count;
+       unsigned int icr_read_retry_count;
 #endif
        unsigned int x86_platform_ipis; /* arch dependent */
        unsigned int apic_perf_irqs;
index c9e09ea..6919e93 100644 (file)
@@ -218,7 +218,7 @@ static inline void fpu_fxsave(struct fpu *fpu)
 #ifdef CONFIG_SMP
 #define safe_address (__per_cpu_offset[0])
 #else
-#define safe_address (kstat_cpu(0).cpustat.user)
+#define safe_address (__get_cpu_var(kernel_cpustat).cpustat[CPUTIME_USER])
 #endif
 
 /*
index 88c765e..74df3f1 100644 (file)
@@ -137,6 +137,13 @@ static inline int insn_is_avx(struct insn *insn)
        return (insn->vex_prefix.value != 0);
 }
 
+/* Ensure this instruction is decoded completely */
+static inline int insn_complete(struct insn *insn)
+{
+       return insn->opcode.got && insn->modrm.got && insn->sib.got &&
+               insn->displacement.got && insn->immediate.got;
+}
+
 static inline insn_byte_t insn_vex_m_bits(struct insn *insn)
 {
        if (insn->vex_prefix.nbytes == 2)       /* 2 bytes VEX */
index 8537285..88d0c3c 100644 (file)
@@ -15,7 +15,7 @@
 
 #define CALIBRATE_TIME_MSEC 30 /* 30 msecs */
 #define CALIBRATE_LATCH        \
-       ((CLOCK_TICK_RATE * CALIBRATE_TIME_MSEC + 1000/2)/1000)
+       ((PIT_TICK_RATE * CALIBRATE_TIME_MSEC + 1000/2)/1000)
 
 static inline void mach_prepare_counter(void)
 {
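
Switching the latch calculation from CLOCK_TICK_RATE to PIT_TICK_RATE makes it explicit that the calibration counter is the i8254 PIT. Assuming the usual PIT_TICK_RATE of 1193182 Hz (include/linux/timex.h), the rounded latch for the 30 ms calibration window works out to (1193182 * 30 + 500) / 1000 = 35795960 / 1000 = 35795 ticks; a compile-time check of that arithmetic, with ASSUMED_PIT_TICK_RATE as my own stand-in constant:

/* Worked example only; the real value comes from include/linux/timex.h. */
enum { ASSUMED_PIT_TICK_RATE = 1193182, CAL_MSEC = 30 };
_Static_assert((ASSUMED_PIT_TICK_RATE * CAL_MSEC + 1000 / 2) / 1000 == 35795,
               "30 ms of PIT ticks, rounded to nearest, is 35795");
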
index 01fdf56..0e8e85b 100644 (file)
@@ -81,8 +81,8 @@ static inline unsigned char current_lock_cmos_reg(void)
 #else
 #define lock_cmos_prefix(reg) do {} while (0)
 #define lock_cmos_suffix(reg) do {} while (0)
-#define lock_cmos(reg)
-#define unlock_cmos()
+#define lock_cmos(reg) do { } while (0)
+#define unlock_cmos() do { } while (0)
 #define do_i_have_lock_cmos() 0
 #define current_lock_cmos_reg() 0
 #endif
index 0e8ae57..6add827 100644 (file)
 #define MCJ_CTX_MASK           3
 #define MCJ_CTX(flags)         ((flags) & MCJ_CTX_MASK)
 #define MCJ_CTX_RANDOM         0    /* inject context: random */
-#define MCJ_CTX_PROCESS                1    /* inject context: process */
-#define MCJ_CTX_IRQ            2    /* inject context: IRQ */
-#define MCJ_NMI_BROADCAST      4    /* do NMI broadcasting */
-#define MCJ_EXCEPTION          8    /* raise as exception */
+#define MCJ_CTX_PROCESS                0x1  /* inject context: process */
+#define MCJ_CTX_IRQ            0x2  /* inject context: IRQ */
+#define MCJ_NMI_BROADCAST      0x4  /* do NMI broadcasting */
+#define MCJ_EXCEPTION          0x8  /* raise as exception */
+#define MCJ_IRQ_BRAODCAST      0x10 /* do IRQ broadcasting */
 
 /* Fields are zero when not available */
 struct mce {
@@ -120,7 +121,8 @@ struct mce_log {
 
 #ifdef __KERNEL__
 
-extern struct atomic_notifier_head x86_mce_decoder_chain;
+extern void mce_register_decode_chain(struct notifier_block *nb);
+extern void mce_unregister_decode_chain(struct notifier_block *nb);
 
 #include <linux/percpu.h>
 #include <linux/init.h>
diff --git a/arch/x86/include/asm/memblock.h b/arch/x86/include/asm/memblock.h
deleted file mode 100644 (file)
index 0cd3800..0000000
+++ /dev/null
@@ -1,23 +0,0 @@
-#ifndef _X86_MEMBLOCK_H
-#define _X86_MEMBLOCK_H
-
-#define ARCH_DISCARD_MEMBLOCK
-
-u64 memblock_x86_find_in_range_size(u64 start, u64 *sizep, u64 align);
-
-void memblock_x86_reserve_range(u64 start, u64 end, char *name);
-void memblock_x86_free_range(u64 start, u64 end);
-struct range;
-int __get_free_all_memory_range(struct range **range, int nodeid,
-                        unsigned long start_pfn, unsigned long end_pfn);
-int get_free_all_memory_range(struct range **rangep, int nodeid);
-
-void memblock_x86_register_active_regions(int nid, unsigned long start_pfn,
-                                        unsigned long last_pfn);
-u64 memblock_x86_hole_size(u64 start, u64 end);
-u64 memblock_x86_find_in_range_node(int nid, u64 start, u64 end, u64 size, u64 align);
-u64 memblock_x86_free_memory_in_range(u64 addr, u64 limit);
-u64 memblock_x86_memory_in_range(u64 addr, u64 limit);
-bool memblock_x86_check_reserved_size(u64 *addrp, u64 *sizep, u64 align);
-
-#endif
index 2421507..4ebe157 100644 (file)
@@ -48,6 +48,7 @@ static inline struct microcode_ops * __init init_intel_microcode(void)
 
 #ifdef CONFIG_MICROCODE_AMD
 extern struct microcode_ops * __init init_amd_microcode(void);
+extern void __exit exit_amd_microcode(void);
 
 static inline void get_ucode_data(void *to, const u8 *from, size_t n)
 {
@@ -59,6 +60,7 @@ static inline struct microcode_ops * __init init_amd_microcode(void)
 {
        return NULL;
 }
+static inline void __exit exit_amd_microcode(void) {}
 #endif
 
 #endif /* _ASM_X86_MICROCODE_H */
diff --git a/arch/x86/include/asm/numachip/numachip_csr.h b/arch/x86/include/asm/numachip/numachip_csr.h
new file mode 100644 (file)
index 0000000..660f843
--- /dev/null
@@ -0,0 +1,167 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Numascale NumaConnect-Specific Header file
+ *
+ * Copyright (C) 2011 Numascale AS. All rights reserved.
+ *
+ * Send feedback to <support@numascale.com>
+ *
+ */
+
+#ifndef _ASM_X86_NUMACHIP_NUMACHIP_CSR_H
+#define _ASM_X86_NUMACHIP_NUMACHIP_CSR_H
+
+#include <linux/numa.h>
+#include <linux/percpu.h>
+#include <linux/io.h>
+#include <linux/swab.h>
+#include <asm/types.h>
+#include <asm/processor.h>
+
+#define CSR_NODE_SHIFT         16
+#define CSR_NODE_BITS(p)       (((unsigned long)(p)) << CSR_NODE_SHIFT)
+#define CSR_NODE_MASK          0x0fff          /* 4K nodes */
+
+/* 32K CSR space, b15 indicates geo/non-geo */
+#define CSR_OFFSET_MASK        0x7fffUL
+
+/* Global CSR space covers all 4K possible nodes with 64K CSR space per node */
+#define NUMACHIP_GCSR_BASE     0x3fff00000000ULL
+#define NUMACHIP_GCSR_LIM      0x3fff0fffffffULL
+#define NUMACHIP_GCSR_SIZE     (NUMACHIP_GCSR_LIM - NUMACHIP_GCSR_BASE + 1)
+
+/*
+ * Local CSR space starts in global CSR space with "nodeid" = 0xfff0, however
+ * when using the direct mapping on x86_64, both start and size needs to be
+ * aligned with PMD_SIZE which is 2M
+ */
+#define NUMACHIP_LCSR_BASE     0x3ffffe000000ULL
+#define NUMACHIP_LCSR_LIM      0x3fffffffffffULL
+#define NUMACHIP_LCSR_SIZE     (NUMACHIP_LCSR_LIM - NUMACHIP_LCSR_BASE + 1)
+
+static inline void *gcsr_address(int node, unsigned long offset)
+{
+       return __va(NUMACHIP_GCSR_BASE | (1UL << 15) |
+               CSR_NODE_BITS(node & CSR_NODE_MASK) | (offset & CSR_OFFSET_MASK));
+}
+
+static inline void *lcsr_address(unsigned long offset)
+{
+       return __va(NUMACHIP_LCSR_BASE | (1UL << 15) |
+               CSR_NODE_BITS(0xfff0) | (offset & CSR_OFFSET_MASK));
+}
+
+static inline unsigned int read_gcsr(int node, unsigned long offset)
+{
+       return swab32(readl(gcsr_address(node, offset)));
+}
+
+static inline void write_gcsr(int node, unsigned long offset, unsigned int val)
+{
+       writel(swab32(val), gcsr_address(node, offset));
+}
+
+static inline unsigned int read_lcsr(unsigned long offset)
+{
+       return swab32(readl(lcsr_address(offset)));
+}
+
+static inline void write_lcsr(unsigned long offset, unsigned int val)
+{
+       writel(swab32(val), lcsr_address(offset));
+}
+
+/* ========================================================================= */
+/*                   CSR_G0_STATE_CLEAR                                      */
+/* ========================================================================= */
+
+#define CSR_G0_STATE_CLEAR (0x000 + (0 << 12))
+union numachip_csr_g0_state_clear {
+       unsigned int v;
+       struct numachip_csr_g0_state_clear_s {
+               unsigned int _state:2;
+               unsigned int _rsvd_2_6:5;
+               unsigned int _lost:1;
+               unsigned int _rsvd_8_31:24;
+       } s;
+};
+
+/* ========================================================================= */
+/*                   CSR_G0_NODE_IDS                                         */
+/* ========================================================================= */
+
+#define CSR_G0_NODE_IDS (0x008 + (0 << 12))
+union numachip_csr_g0_node_ids {
+       unsigned int v;
+       struct numachip_csr_g0_node_ids_s {
+               unsigned int _initialid:16;
+               unsigned int _nodeid:12;
+               unsigned int _rsvd_28_31:4;
+       } s;
+};
+
+/* ========================================================================= */
+/*                   CSR_G3_EXT_IRQ_GEN                                      */
+/* ========================================================================= */
+
+#define CSR_G3_EXT_IRQ_GEN (0x030 + (3 << 12))
+union numachip_csr_g3_ext_irq_gen {
+       unsigned int v;
+       struct numachip_csr_g3_ext_irq_gen_s {
+               unsigned int _vector:8;
+               unsigned int _msgtype:3;
+               unsigned int _index:5;
+               unsigned int _destination_apic_id:16;
+       } s;
+};
+
+/* ========================================================================= */
+/*                   CSR_G3_EXT_IRQ_STATUS                                   */
+/* ========================================================================= */
+
+#define CSR_G3_EXT_IRQ_STATUS (0x034 + (3 << 12))
+union numachip_csr_g3_ext_irq_status {
+       unsigned int v;
+       struct numachip_csr_g3_ext_irq_status_s {
+               unsigned int _result:32;
+       } s;
+};
+
+/* ========================================================================= */
+/*                   CSR_G3_EXT_IRQ_DEST                                     */
+/* ========================================================================= */
+
+#define CSR_G3_EXT_IRQ_DEST (0x038 + (3 << 12))
+union numachip_csr_g3_ext_irq_dest {
+       unsigned int v;
+       struct numachip_csr_g3_ext_irq_dest_s {
+               unsigned int _irq:8;
+               unsigned int _rsvd_8_31:24;
+       } s;
+};
+
+/* ========================================================================= */
+/*                   CSR_G3_NC_ATT_MAP_SELECT                                */
+/* ========================================================================= */
+
+#define CSR_G3_NC_ATT_MAP_SELECT (0x7fc + (3 << 12))
+union numachip_csr_g3_nc_att_map_select {
+       unsigned int v;
+       struct numachip_csr_g3_nc_att_map_select_s {
+               unsigned int _upper_address_bits:4;
+               unsigned int _select_ram:4;
+               unsigned int _rsvd_8_31:24;
+       } s;
+};
+
+/* ========================================================================= */
+/*                   CSR_G3_NC_ATT_MAP_SELECT_0-255                          */
+/* ========================================================================= */
+
+#define CSR_G3_NC_ATT_MAP_SELECT_0 (0x800 + (3 << 12))
+
+#endif /* _ASM_X86_NUMACHIP_NUMACHIP_CSR_H */
+
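
The new CSR accessors build a physical address by OR-ing the window base, bit 15 (the geo/non-geo select mentioned above), the node id shifted into bits 16 and up, and the register offset, and register layouts are exposed as bitfield unions. A user-space illustration of the address math and the union-decode pattern; no hardware is touched, the register value is made up, and the struct below only mirrors the shape of numachip_csr_g0_node_ids (bitfield layout shown is the usual GCC/x86 one):

#include <assert.h>
#include <stdio.h>

#define CSR_NODE_SHIFT   16
#define CSR_OFFSET_MASK  0x7fffUL
#define GCSR_BASE        0x3fff00000000ULL

union g0_node_ids {                 /* same shape as the new header's union */
        unsigned int v;
        struct {
                unsigned int initialid:16;
                unsigned int nodeid:12;
                unsigned int rsvd:4;
        } s;
};

int main(void)
{
        /* Address of register 0x008 on node 0x002 in the global CSR window. */
        unsigned long long addr = GCSR_BASE | (1ULL << 15) |
                                  ((unsigned long long)(0x002 & 0x0fff) << CSR_NODE_SHIFT) |
                                  (0x008 & CSR_OFFSET_MASK);
        printf("csr phys addr: %#llx\n", addr);     /* 0x3fff00028008 */

        /* Decoding a (made-up) register value through the bitfield view. */
        union g0_node_ids ids = { .v = 0x00020001 };
        assert(ids.s.initialid == 0x0001 && ids.s.nodeid == 0x002);
        return 0;
}
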
index 3470c9d..529bf07 100644 (file)
@@ -451,23 +451,20 @@ do {                                                                      \
 #endif /* !CONFIG_M386 */
 
 #ifdef CONFIG_X86_CMPXCHG64
-#define percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2)                  \
+#define percpu_cmpxchg8b_double(pcp1, pcp2, o1, o2, n1, n2)            \
 ({                                                                     \
-       char __ret;                                                     \
-       typeof(o1) __o1 = o1;                                           \
-       typeof(o1) __n1 = n1;                                           \
-       typeof(o2) __o2 = o2;                                           \
-       typeof(o2) __n2 = n2;                                           \
-       typeof(o2) __dummy = n2;                                        \
+       bool __ret;                                                     \
+       typeof(pcp1) __o1 = (o1), __n1 = (n1);                          \
+       typeof(pcp2) __o2 = (o2), __n2 = (n2);                          \
        asm volatile("cmpxchg8b "__percpu_arg(1)"\n\tsetz %0\n\t"       \
-                   : "=a"(__ret), "=m" (pcp1), "=d"(__dummy)           \
-                   :  "b"(__n1), "c"(__n2), "a"(__o1), "d"(__o2));     \
+                   : "=a" (__ret), "+m" (pcp1), "+m" (pcp2), "+d" (__o2) \
+                   :  "b" (__n1), "c" (__n2), "a" (__o1));             \
        __ret;                                                          \
 })
 
-#define __this_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2)                percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2)
-#define this_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2)          percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2)
-#define irqsafe_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2)       percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2)
+#define __this_cpu_cmpxchg_double_4    percpu_cmpxchg8b_double
+#define this_cpu_cmpxchg_double_4      percpu_cmpxchg8b_double
+#define irqsafe_cpu_cmpxchg_double_4   percpu_cmpxchg8b_double
 #endif /* CONFIG_X86_CMPXCHG64 */
 
 /*
@@ -508,31 +505,23 @@ do {                                                                      \
  * it in software.  The address used in the cmpxchg16 instruction must be
  * aligned to a 16 byte boundary.
  */
-#ifdef CONFIG_SMP
-#define CMPXCHG16B_EMU_CALL "call this_cpu_cmpxchg16b_emu\n\t" ASM_NOP3
-#else
-#define CMPXCHG16B_EMU_CALL "call this_cpu_cmpxchg16b_emu\n\t" ASM_NOP2
-#endif
-#define percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2)                 \
+#define percpu_cmpxchg16b_double(pcp1, pcp2, o1, o2, n1, n2)           \
 ({                                                                     \
-       char __ret;                                                     \
-       typeof(o1) __o1 = o1;                                           \
-       typeof(o1) __n1 = n1;                                           \
-       typeof(o2) __o2 = o2;                                           \
-       typeof(o2) __n2 = n2;                                           \
-       typeof(o2) __dummy;                                             \
-       alternative_io(CMPXCHG16B_EMU_CALL,                             \
-                      "cmpxchg16b " __percpu_prefix "(%%rsi)\n\tsetz %0\n\t",  \
+       bool __ret;                                                     \
+       typeof(pcp1) __o1 = (o1), __n1 = (n1);                          \
+       typeof(pcp2) __o2 = (o2), __n2 = (n2);                          \
+       alternative_io("leaq %P1,%%rsi\n\tcall this_cpu_cmpxchg16b_emu\n\t", \
+                      "cmpxchg16b " __percpu_arg(1) "\n\tsetz %0\n\t", \
                       X86_FEATURE_CX16,                                \
-                      ASM_OUTPUT2("=a"(__ret), "=d"(__dummy)),         \
-                      "S" (&pcp1), "b"(__n1), "c"(__n2),               \
-                      "a"(__o1), "d"(__o2) : "memory");                \
+                      ASM_OUTPUT2("=a" (__ret), "+m" (pcp1),           \
+                                  "+m" (pcp2), "+d" (__o2)),           \
+                      "b" (__n1), "c" (__n2), "a" (__o1) : "rsi");     \
        __ret;                                                          \
 })
 
-#define __this_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2)                percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2)
-#define this_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2)          percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2)
-#define irqsafe_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2)       percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2)
+#define __this_cpu_cmpxchg_double_8    percpu_cmpxchg16b_double
+#define this_cpu_cmpxchg_double_8      percpu_cmpxchg16b_double
+#define irqsafe_cpu_cmpxchg_double_8   percpu_cmpxchg16b_double
 
 #endif
 
index f61c62f..096c975 100644 (file)
@@ -57,6 +57,7 @@
                (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
 
 #define ARCH_PERFMON_BRANCH_MISSES_RETIRED             6
+#define ARCH_PERFMON_EVENTS_COUNT                      7
 
 /*
  * Intel "Architectural Performance Monitoring" CPUID
@@ -72,6 +73,19 @@ union cpuid10_eax {
        unsigned int full;
 };
 
+union cpuid10_ebx {
+       struct {
+               unsigned int no_unhalted_core_cycles:1;
+               unsigned int no_instructions_retired:1;
+               unsigned int no_unhalted_reference_cycles:1;
+               unsigned int no_llc_reference:1;
+               unsigned int no_llc_misses:1;
+               unsigned int no_branch_instruction_retired:1;
+               unsigned int no_branch_misses_retired:1;
+       } split;
+       unsigned int full;
+};
+
 union cpuid10_edx {
        struct {
                unsigned int num_counters_fixed:5;
@@ -81,6 +95,15 @@ union cpuid10_edx {
        unsigned int full;
 };
 
+struct x86_pmu_capability {
+       int             version;
+       int             num_counters_gp;
+       int             num_counters_fixed;
+       int             bit_width_gp;
+       int             bit_width_fixed;
+       unsigned int    events_mask;
+       int             events_mask_len;
+};
 
 /*
  * Fixed-purpose performance events:
@@ -89,23 +112,24 @@ union cpuid10_edx {
 /*
  * All 3 fixed-mode PMCs are configured via this single MSR:
  */
-#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL                        0x38d
+#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL        0x38d
 
 /*
  * The counts are available in three separate MSRs:
  */
 
 /* Instr_Retired.Any: */
-#define MSR_ARCH_PERFMON_FIXED_CTR0                    0x309
-#define X86_PMC_IDX_FIXED_INSTRUCTIONS                 (X86_PMC_IDX_FIXED + 0)
+#define MSR_ARCH_PERFMON_FIXED_CTR0    0x309
+#define X86_PMC_IDX_FIXED_INSTRUCTIONS (X86_PMC_IDX_FIXED + 0)
 
 /* CPU_CLK_Unhalted.Core: */
-#define MSR_ARCH_PERFMON_FIXED_CTR1                    0x30a
-#define X86_PMC_IDX_FIXED_CPU_CYCLES                   (X86_PMC_IDX_FIXED + 1)
+#define MSR_ARCH_PERFMON_FIXED_CTR1    0x30a
+#define X86_PMC_IDX_FIXED_CPU_CYCLES   (X86_PMC_IDX_FIXED + 1)
 
 /* CPU_CLK_Unhalted.Ref: */
-#define MSR_ARCH_PERFMON_FIXED_CTR2                    0x30b
-#define X86_PMC_IDX_FIXED_BUS_CYCLES                   (X86_PMC_IDX_FIXED + 2)
+#define MSR_ARCH_PERFMON_FIXED_CTR2    0x30b
+#define X86_PMC_IDX_FIXED_REF_CYCLES   (X86_PMC_IDX_FIXED + 2)
+#define X86_PMC_MSK_FIXED_REF_CYCLES   (1ULL << X86_PMC_IDX_FIXED_REF_CYCLES)
 
 /*
  * We model BTS tracing as another fixed-mode PMC.
@@ -202,6 +226,7 @@ struct perf_guest_switch_msr {
 };
 
 extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr);
+extern void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap);
 #else
 static inline perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
 {
@@ -209,6 +234,11 @@ static inline perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
        return NULL;
 }
 
+static inline void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
+{
+       memset(cap, 0, sizeof(*cap));
+}
+
 static inline void perf_events_lapic_init(void)        { }
 #endif
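
cpuid10_ebx mirrors EBX of CPUID leaf 0xA, where a set bit means the corresponding architectural event is *not* available, and x86_pmu_capability packages the same leaf for perf_get_x86_pmu_capability() callers. A user-space sketch that reads the leaf directly with GCC's <cpuid.h> and decodes the EAX fields the way the existing cpuid10_eax split does; field extraction only, no kernel interfaces, and it obviously only runs on an x86 machine:

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
        unsigned int eax, ebx, ecx, edx;

        if (!__get_cpuid(0xa, &eax, &ebx, &ecx, &edx)) {
                puts("CPUID leaf 0xA not supported");
                return 1;
        }

        /* Same fields as union cpuid10_eax's split view. */
        printf("arch perfmon version: %u\n", eax & 0xff);
        printf("gp counters         : %u\n", (eax >> 8) & 0xff);
        printf("gp counter width    : %u\n", (eax >> 16) & 0xff);
        printf("events mask length  : %u\n", (eax >> 24) & 0xff);

        /* In EBX a set bit means the architectural event is NOT available. */
        printf("branch-misses event : %savailable\n",
               (ebx & (1u << 6)) ? "not " : "");
        return 0;
}
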
 
index 18601c8..49afb3f 100644 (file)
@@ -703,7 +703,7 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm,
        pte_update(mm, addr, ptep);
 }
 
-#define flush_tlb_fix_spurious_fault(vma, address)
+#define flush_tlb_fix_spurious_fault(vma, address) do { } while (0)
 
 #define mk_pmd(page, pgprot)   pfn_pmd(page_to_pfn(page), (pgprot))
 
index 2dddb31..f8ab3ea 100644 (file)
@@ -6,6 +6,7 @@
  * EFLAGS bits
  */
 #define X86_EFLAGS_CF  0x00000001 /* Carry Flag */
+#define X86_EFLAGS_BIT1        0x00000002 /* Bit 1 - always on */
 #define X86_EFLAGS_PF  0x00000004 /* Parity Flag */
 #define X86_EFLAGS_AF  0x00000010 /* Auxiliary carry Flag */
 #define X86_EFLAGS_ZF  0x00000040 /* Zero Flag */
index b650435..aa9088c 100644 (file)
@@ -99,7 +99,6 @@ struct cpuinfo_x86 {
        u16                     apicid;
        u16                     initial_apicid;
        u16                     x86_clflush_size;
-#ifdef CONFIG_SMP
        /* number of cores as seen by the OS: */
        u16                     booted_cores;
        /* Physical processor id: */
@@ -110,7 +109,6 @@ struct cpuinfo_x86 {
        u8                      compute_unit_id;
        /* Index into per_cpu list: */
        u16                     cpu_index;
-#endif
        u32                     microcode;
 } __attribute__((__aligned__(SMP_CACHE_BYTES)));
 
index 972c260..a82c2bf 100644 (file)
@@ -79,23 +79,10 @@ static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
        return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail;
 }
 
-#if (NR_CPUS < 256)
 static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
 {
-       asm volatile(UNLOCK_LOCK_PREFIX "incb %0"
-                    : "+m" (lock->head_tail)
-                    :
-                    : "memory", "cc");
+       __add(&lock->tickets.head, 1, UNLOCK_LOCK_PREFIX);
 }
-#else
-static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
-{
-       asm volatile(UNLOCK_LOCK_PREFIX "incw %0"
-                    : "+m" (lock->head_tail)
-                    :
-                    : "memory", "cc");
-}
-#endif
 
 static inline int __ticket_spin_is_locked(arch_spinlock_t *lock)
 {
index a1fe5c1..185b719 100644 (file)
@@ -40,7 +40,8 @@ struct thread_info {
                                                */
        __u8                    supervisor_stack[0];
 #endif
-       int                     uaccess_err;
+       int                     sig_on_uaccess_error:1;
+       int                     uaccess_err:1;  /* uaccess failed */
 };
 
 #define INIT_THREAD_INFO(tsk)                  \
@@ -231,6 +232,12 @@ static inline struct thread_info *current_thread_info(void)
        movq PER_CPU_VAR(kernel_stack),reg ; \
        subq $(THREAD_SIZE-KERNEL_STACK_OFFSET),reg
 
+/*
+ * Same if PER_CPU_VAR(kernel_stack) is, perhaps with some offset, already in
+ * a certain register (to be used in assembler memory operands).
+ */
+#define THREAD_INFO(reg, off) KERNEL_STACK_OFFSET+(off)-THREAD_SIZE(reg)
+
 #endif
 
 #endif /* !X86_32 */
index c006924..800f77c 100644 (file)
@@ -130,10 +130,8 @@ extern void setup_node_to_cpumask_map(void);
        .balance_interval       = 1,                                    \
 }
 
-#ifdef CONFIG_X86_64
 extern int __node_distance(int, int);
 #define node_distance(a, b) __node_distance(a, b)
-#endif
 
 #else /* !CONFIG_NUMA */
 
index 83e2efd..15d9915 100644 (file)
@@ -51,6 +51,8 @@ extern int unsynchronized_tsc(void);
 extern int check_tsc_unstable(void);
 extern unsigned long native_calibrate_tsc(void);
 
+extern int tsc_clocksource_reliable;
+
 /*
  * Boot-time check whether the TSCs are synchronized across
  * all CPUs/cores:
index 36361bf..8be5f54 100644 (file)
@@ -462,7 +462,7 @@ struct __large_struct { unsigned long buf[100]; };
        barrier();
 
 #define uaccess_catch(err)                                             \
-       (err) |= current_thread_info()->uaccess_err;                    \
+       (err) |= (current_thread_info()->uaccess_err ? -EFAULT : 0);    \
        current_thread_info()->uaccess_err = prev_err;                  \
 } while (0)
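
Because uaccess_err is now a one-bit field, its raw value can no longer be OR-ed into an errno-style result; the hunk above maps a set flag to -EFAULT instead. A stand-alone model of that mapping (the EFAULT value is hard-coded here only so the sketch compiles outside the kernel):

    #include <assert.h>

    #define EFAULT 14

    static int catch_err(int err, int uaccess_failed)
    {
            return err | (uaccess_failed ? -EFAULT : 0);
    }

    int main(void)
    {
            assert(catch_err(0, 1) == -EFAULT);
            assert(catch_err(0, 0) == 0);
            return 0;
    }
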
 
index 1971e65..1ac860a 100644 (file)
@@ -7,6 +7,7 @@
 struct mpc_bus;
 struct mpc_cpu;
 struct mpc_table;
+struct cpuinfo_x86;
 
 /**
  * struct x86_init_mpparse - platform specific mpparse ops
@@ -147,6 +148,7 @@ struct x86_init_ops {
  */
 struct x86_cpuinit_ops {
        void (*setup_percpu_clockev)(void);
+       void (*fixup_cpu_id)(struct cpuinfo_x86 *c, int node);
 };
 
 /**
@@ -186,5 +188,6 @@ extern struct x86_msi_ops x86_msi;
 
 extern void x86_init_noop(void);
 extern void x86_init_uint_noop(unsigned int unused);
+extern void x86_default_fixup_cpu_id(struct cpuinfo_x86 *c, int node);
 
 #endif
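
The new fixup_cpu_id() hook lets a platform reconcile a CPU's reported core id with its NUMA node; the default, x86_default_fixup_cpu_id(), only logs the mismatch. A hedged sketch of how a platform might install its own handler, modelled on the NumaChip code later in this merge (all names below are invented):

    static void myplat_fixup_cpu_id(struct cpuinfo_x86 *c, int node)
    {
            c->phys_proc_id = node;         /* trust the NUMA node id */
    }

    static int __init myplat_early_init(void)
    {
            x86_cpuinit.fixup_cpu_id = myplat_fixup_cpu_id;
            return 0;
    }
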
index 4558f0d..ce664f3 100644 (file)
@@ -219,6 +219,8 @@ static int __init
 acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end)
 {
        struct acpi_madt_local_x2apic *processor = NULL;
+       int apic_id;
+       u8 enabled;
 
        processor = (struct acpi_madt_local_x2apic *)header;
 
@@ -227,6 +229,8 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end)
 
        acpi_table_print_madt_entry(header);
 
+       apic_id = processor->local_apic_id;
+       enabled = processor->lapic_flags & ACPI_MADT_ENABLED;
 #ifdef CONFIG_X86_X2APIC
        /*
         * We need to register disabled CPU as well to permit
@@ -235,8 +239,10 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end)
         * to not preallocating memory for all NR_CPUS
         * when we use CPU hotplug.
         */
-       acpi_register_lapic(processor->local_apic_id,   /* APIC ID */
-                           processor->lapic_flags & ACPI_MADT_ENABLED);
+       if (!cpu_has_x2apic && (apic_id >= 0xff) && enabled)
+               printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
+       else
+               acpi_register_lapic(apic_id, enabled);
 #else
        printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
 #endif
index 4c39baa..013c181 100644 (file)
@@ -123,16 +123,14 @@ int amd_get_subcaches(int cpu)
 {
        struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link;
        unsigned int mask;
-       int cuid = 0;
+       int cuid;
 
        if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
                return 0;
 
        pci_read_config_dword(link, 0x1d4, &mask);
 
-#ifdef CONFIG_SMP
        cuid = cpu_data(cpu).compute_unit_id;
-#endif
        return (mask >> (4 * cuid)) & 0xf;
 }
 
@@ -141,7 +139,7 @@ int amd_set_subcaches(int cpu, int mask)
        static unsigned int reset, ban;
        struct amd_northbridge *nb = node_to_amd_nb(amd_get_nb_id(cpu));
        unsigned int reg;
-       int cuid = 0;
+       int cuid;
 
        if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING) || mask > 0xf)
                return -EINVAL;
@@ -159,9 +157,7 @@ int amd_set_subcaches(int cpu, int mask)
                pci_write_config_dword(nb->misc, 0x1b8, reg & ~0x180000);
        }
 
-#ifdef CONFIG_SMP
        cuid = cpu_data(cpu).compute_unit_id;
-#endif
        mask <<= 4 * cuid;
        mask |= (0xf ^ (1 << cuid)) << 26;
 
index 3d2661c..6e76c19 100644 (file)
@@ -88,13 +88,13 @@ static u32 __init allocate_aperture(void)
         */
        addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR,
                                      aper_size, aper_size);
-       if (addr == MEMBLOCK_ERROR || addr + aper_size > GART_MAX_ADDR) {
+       if (!addr || addr + aper_size > GART_MAX_ADDR) {
                printk(KERN_ERR
                        "Cannot allocate aperture memory hole (%lx,%uK)\n",
                                addr, aper_size>>10);
                return 0;
        }
-       memblock_x86_reserve_range(addr, addr + aper_size, "aperture64");
+       memblock_reserve(addr, aper_size);
        /*
         * Kmemleak should not scan this block as it may not be mapped via the
         * kernel direct mapping.
index 767fd04..0ae0323 100644 (file)
@@ -10,6 +10,7 @@ obj-$(CONFIG_SMP)             += ipi.o
 
 ifeq ($(CONFIG_X86_64),y)
 # APIC probe will depend on the listing order here
+obj-$(CONFIG_X86_NUMACHIP)     += apic_numachip.o
 obj-$(CONFIG_X86_UV)           += x2apic_uv_x.o
 obj-$(CONFIG_X86_X2APIC)       += x2apic_phys.o
 obj-$(CONFIG_X86_X2APIC)       += x2apic_cluster.o
index f98d84c..2eec05b 100644 (file)
@@ -146,16 +146,26 @@ __setup("apicpmtimer", setup_apicpmtimer);
 int x2apic_mode;
 #ifdef CONFIG_X86_X2APIC
 /* x2apic enabled before OS handover */
-static int x2apic_preenabled;
+int x2apic_preenabled;
+static int x2apic_disabled;
+static int nox2apic;
 static __init int setup_nox2apic(char *str)
 {
        if (x2apic_enabled()) {
-               pr_warning("Bios already enabled x2apic, "
-                          "can't enforce nox2apic");
-               return 0;
-       }
+               int apicid = native_apic_msr_read(APIC_ID);
+
+               if (apicid >= 255) {
+                       pr_warning("Apicid: %08x, cannot enforce nox2apic\n",
+                                  apicid);
+                       return 0;
+               }
+
+               pr_warning("x2apic already enabled. will disable it\n");
+       } else
+               setup_clear_cpu_cap(X86_FEATURE_X2APIC);
+
+       nox2apic = 1;
 
-       setup_clear_cpu_cap(X86_FEATURE_X2APIC);
        return 0;
 }
 early_param("nox2apic", setup_nox2apic);
@@ -250,6 +260,7 @@ u32 native_safe_apic_wait_icr_idle(void)
                send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
                if (!send_status)
                        break;
+               inc_irq_stat(icr_read_retry_count);
                udelay(100);
        } while (timeout++ < 1000);
 
@@ -876,8 +887,8 @@ void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs)
         * Besides, if we don't timer interrupts ignore the global
         * interrupt lock, which is the WrongThing (tm) to do.
         */
-       exit_idle();
        irq_enter();
+       exit_idle();
        local_apic_timer_interrupt();
        irq_exit();
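
The exit_idle()/irq_enter() swap above is applied uniformly to the APIC, IO-APIC and MCE handlers in this merge; as a sketch, the resulting handler shape is the following (the handler name is invented):

    void smp_example_vector_interrupt(struct pt_regs *regs)
    {
            irq_enter();            /* enter IRQ context first ... */
            exit_idle();            /* ... then leave idle accounting */
            /* ... service the interrupt ... */
            irq_exit();
    }
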
 
@@ -1431,6 +1442,45 @@ void __init bsp_end_local_APIC_setup(void)
 }
 
 #ifdef CONFIG_X86_X2APIC
+/*
+ * Need to disable xapic and x2apic at the same time and then enable xapic mode
+ */
+static inline void __disable_x2apic(u64 msr)
+{
+       wrmsrl(MSR_IA32_APICBASE,
+              msr & ~(X2APIC_ENABLE | XAPIC_ENABLE));
+       wrmsrl(MSR_IA32_APICBASE, msr & ~X2APIC_ENABLE);
+}
+
+static __init void disable_x2apic(void)
+{
+       u64 msr;
+
+       if (!cpu_has_x2apic)
+               return;
+
+       rdmsrl(MSR_IA32_APICBASE, msr);
+       if (msr & X2APIC_ENABLE) {
+               u32 x2apic_id = read_apic_id();
+
+               if (x2apic_id >= 255)
+                       panic("Cannot disable x2apic, id: %08x\n", x2apic_id);
+
+               pr_info("Disabling x2apic\n");
+               __disable_x2apic(msr);
+
+               if (nox2apic) {
+                       clear_cpu_cap(&cpu_data(0), X86_FEATURE_X2APIC);
+                       setup_clear_cpu_cap(X86_FEATURE_X2APIC);
+               }
+
+               x2apic_disabled = 1;
+               x2apic_mode = 0;
+
+               register_lapic_address(mp_lapic_addr);
+       }
+}
+
 void check_x2apic(void)
 {
        if (x2apic_enabled()) {
@@ -1441,15 +1491,20 @@ void check_x2apic(void)
 
 void enable_x2apic(void)
 {
-       int msr, msr2;
+       u64 msr;
+
+       rdmsrl(MSR_IA32_APICBASE, msr);
+       if (x2apic_disabled) {
+               __disable_x2apic(msr);
+               return;
+       }
 
        if (!x2apic_mode)
                return;
 
-       rdmsr(MSR_IA32_APICBASE, msr, msr2);
        if (!(msr & X2APIC_ENABLE)) {
                printk_once(KERN_INFO "Enabling x2apic\n");
-               wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, msr2);
+               wrmsrl(MSR_IA32_APICBASE, msr | X2APIC_ENABLE);
        }
 }
 #endif /* CONFIG_X86_X2APIC */
@@ -1486,25 +1541,34 @@ void __init enable_IR_x2apic(void)
        ret = save_ioapic_entries();
        if (ret) {
                pr_info("Saving IO-APIC state failed: %d\n", ret);
-               goto out;
+               return;
        }
 
        local_irq_save(flags);
        legacy_pic->mask_all();
        mask_ioapic_entries();
 
+       if (x2apic_preenabled && nox2apic)
+               disable_x2apic();
+
        if (dmar_table_init_ret)
                ret = -1;
        else
                ret = enable_IR();
 
+       if (!x2apic_supported())
+               goto skip_x2apic;
+
        if (ret < 0) {
                /* IR is required if there is APIC ID > 255 even when running
                 * under KVM
                 */
                if (max_physical_apicid > 255 ||
-                   !hypervisor_x2apic_available())
-                       goto nox2apic;
+                   !hypervisor_x2apic_available()) {
+                       if (x2apic_preenabled)
+                               disable_x2apic();
+                       goto skip_x2apic;
+               }
                /*
                 * without IR all CPUs can be addressed by IOAPIC/MSI
                 * only in physical mode
@@ -1512,8 +1576,10 @@ void __init enable_IR_x2apic(void)
                x2apic_force_phys();
        }
 
-       if (ret == IRQ_REMAP_XAPIC_MODE)
-               goto nox2apic;
+       if (ret == IRQ_REMAP_XAPIC_MODE) {
+               pr_info("x2apic not enabled, IRQ remapping is in xapic mode\n");
+               goto skip_x2apic;
+       }
 
        x2apic_enabled = 1;
 
@@ -1523,22 +1589,11 @@ void __init enable_IR_x2apic(void)
                pr_info("Enabled x2apic\n");
        }
 
-nox2apic:
+skip_x2apic:
        if (ret < 0) /* IR enabling failed */
                restore_ioapic_entries();
        legacy_pic->restore_mask();
        local_irq_restore(flags);
-
-out:
-       if (x2apic_enabled || !x2apic_supported())
-               return;
-
-       if (x2apic_preenabled)
-               panic("x2apic: enabled by BIOS but kernel init failed.");
-       else if (ret == IRQ_REMAP_XAPIC_MODE)
-               pr_info("x2apic not enabled, IRQ remapping is in xapic mode\n");
-       else if (ret < 0)
-               pr_info("x2apic not enabled, IRQ remapping init failed\n");
 }
 
 #ifdef CONFIG_X86_64
@@ -1809,8 +1864,8 @@ void smp_spurious_interrupt(struct pt_regs *regs)
 {
        u32 v;
 
-       exit_idle();
        irq_enter();
+       exit_idle();
        /*
         * Check if this really is a spurious interrupt and ACK it
         * if it is a vectored one.  Just in case...
@@ -1846,8 +1901,8 @@ void smp_error_interrupt(struct pt_regs *regs)
                "Illegal register address",     /* APIC Error Bit 7 */
        };
 
-       exit_idle();
        irq_enter();
+       exit_idle();
        /* First tickle the hardware, only then report what went on. -- REW */
        v0 = apic_read(APIC_ESR);
        apic_write(APIC_ESR, 0);
index f7a41e4..8c3cdde 100644 (file)
@@ -62,7 +62,7 @@ static void flat_vector_allocation_domain(int cpu, struct cpumask *retmask)
  * an APIC.  See e.g. "AP-388 82489DX User's Manual" (Intel
  * document number 292116).  So here it goes...
  */
-static void flat_init_apic_ldr(void)
+void flat_init_apic_ldr(void)
 {
        unsigned long val;
        unsigned long num, id;
@@ -171,9 +171,14 @@ static int flat_phys_pkg_id(int initial_apic_id, int index_msb)
        return initial_apic_id >> index_msb;
 }
 
+static int flat_probe(void)
+{
+       return 1;
+}
+
 static struct apic apic_flat =  {
        .name                           = "flat",
-       .probe                          = NULL,
+       .probe                          = flat_probe,
        .acpi_madt_oem_check            = flat_acpi_madt_oem_check,
        .apic_id_registered             = flat_apic_id_registered,
 
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
new file mode 100644 (file)
index 0000000..09d3d8c
--- /dev/null
@@ -0,0 +1,294 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Numascale NumaConnect-Specific APIC Code
+ *
+ * Copyright (C) 2011 Numascale AS. All rights reserved.
+ *
+ * Send feedback to <support@numascale.com>
+ *
+ */
+
+#include <linux/errno.h>
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <linux/hardirq.h>
+#include <linux/delay.h>
+
+#include <asm/numachip/numachip_csr.h>
+#include <asm/smp.h>
+#include <asm/apic.h>
+#include <asm/ipi.h>
+#include <asm/apic_flat_64.h>
+
+static int numachip_system __read_mostly;
+
+static struct apic apic_numachip __read_mostly;
+
+static unsigned int get_apic_id(unsigned long x)
+{
+       unsigned long value;
+       unsigned int id;
+
+       rdmsrl(MSR_FAM10H_NODE_ID, value);
+       id = ((x >> 24) & 0xffU) | ((value << 2) & 0x3f00U);
+
+       return id;
+}
+
+static unsigned long set_apic_id(unsigned int id)
+{
+       unsigned long x;
+
+       x = ((id & 0xffU) << 24);
+       return x;
+}
+
+static unsigned int read_xapic_id(void)
+{
+       return get_apic_id(apic_read(APIC_ID));
+}
+
+static int numachip_apic_id_registered(void)
+{
+       return physid_isset(read_xapic_id(), phys_cpu_present_map);
+}
+
+static int numachip_phys_pkg_id(int initial_apic_id, int index_msb)
+{
+       return initial_apic_id >> index_msb;
+}
+
+static const struct cpumask *numachip_target_cpus(void)
+{
+       return cpu_online_mask;
+}
+
+static void numachip_vector_allocation_domain(int cpu, struct cpumask *retmask)
+{
+       cpumask_clear(retmask);
+       cpumask_set_cpu(cpu, retmask);
+}
+
+static int __cpuinit numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip)
+{
+       union numachip_csr_g3_ext_irq_gen int_gen;
+
+       int_gen.s._destination_apic_id = phys_apicid;
+       int_gen.s._vector = 0;
+       int_gen.s._msgtype = APIC_DM_INIT >> 8;
+       int_gen.s._index = 0;
+
+       write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v);
+
+       int_gen.s._msgtype = APIC_DM_STARTUP >> 8;
+       int_gen.s._vector = start_rip >> 12;
+
+       write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v);
+
+       atomic_set(&init_deasserted, 1);
+       return 0;
+}
+
+static void numachip_send_IPI_one(int cpu, int vector)
+{
+       union numachip_csr_g3_ext_irq_gen int_gen;
+       int apicid = per_cpu(x86_cpu_to_apicid, cpu);
+
+       int_gen.s._destination_apic_id = apicid;
+       int_gen.s._vector = vector;
+       int_gen.s._msgtype = (vector == NMI_VECTOR ? APIC_DM_NMI : APIC_DM_FIXED) >> 8;
+       int_gen.s._index = 0;
+
+       write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v);
+}
+
+static void numachip_send_IPI_mask(const struct cpumask *mask, int vector)
+{
+       unsigned int cpu;
+
+       for_each_cpu(cpu, mask)
+               numachip_send_IPI_one(cpu, vector);
+}
+
+static void numachip_send_IPI_mask_allbutself(const struct cpumask *mask,
+                                               int vector)
+{
+       unsigned int this_cpu = smp_processor_id();
+       unsigned int cpu;
+
+       for_each_cpu(cpu, mask) {
+               if (cpu != this_cpu)
+                       numachip_send_IPI_one(cpu, vector);
+       }
+}
+
+static void numachip_send_IPI_allbutself(int vector)
+{
+       unsigned int this_cpu = smp_processor_id();
+       unsigned int cpu;
+
+       for_each_online_cpu(cpu) {
+               if (cpu != this_cpu)
+                       numachip_send_IPI_one(cpu, vector);
+       }
+}
+
+static void numachip_send_IPI_all(int vector)
+{
+       numachip_send_IPI_mask(cpu_online_mask, vector);
+}
+
+static void numachip_send_IPI_self(int vector)
+{
+       __default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
+}
+
+static unsigned int numachip_cpu_mask_to_apicid(const struct cpumask *cpumask)
+{
+       int cpu;
+
+       /*
+        * We're using fixed IRQ delivery, can only return one phys APIC ID.
+        * May as well be the first.
+        */
+       cpu = cpumask_first(cpumask);
+       if (likely((unsigned)cpu < nr_cpu_ids))
+               return per_cpu(x86_cpu_to_apicid, cpu);
+
+       return BAD_APICID;
+}
+
+static unsigned int
+numachip_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+                               const struct cpumask *andmask)
+{
+       int cpu;
+
+       /*
+        * We're using fixed IRQ delivery, can only return one phys APIC ID.
+        * May as well be the first.
+        */
+       for_each_cpu_and(cpu, cpumask, andmask) {
+               if (cpumask_test_cpu(cpu, cpu_online_mask))
+                       break;
+       }
+       return per_cpu(x86_cpu_to_apicid, cpu);
+}
+
+static int __init numachip_probe(void)
+{
+       return apic == &apic_numachip;
+}
+
+static void __init map_csrs(void)
+{
+       printk(KERN_INFO "NumaChip: Mapping local CSR space (%016llx - %016llx)\n",
+               NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_BASE + NUMACHIP_LCSR_SIZE - 1);
+       init_extra_mapping_uc(NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_SIZE);
+
+       printk(KERN_INFO "NumaChip: Mapping global CSR space (%016llx - %016llx)\n",
+               NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_BASE + NUMACHIP_GCSR_SIZE - 1);
+       init_extra_mapping_uc(NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_SIZE);
+}
+
+static void fixup_cpu_id(struct cpuinfo_x86 *c, int node)
+{
+       c->phys_proc_id = node;
+       per_cpu(cpu_llc_id, smp_processor_id()) = node;
+}
+
+static int __init numachip_system_init(void)
+{
+       unsigned int val;
+
+       if (!numachip_system)
+               return 0;
+
+       x86_cpuinit.fixup_cpu_id = fixup_cpu_id;
+
+       map_csrs();
+
+       val = read_lcsr(CSR_G0_NODE_IDS);
+       printk(KERN_INFO "NumaChip: Local NodeID = %08x\n", val);
+
+       return 0;
+}
+early_initcall(numachip_system_init);
+
+static int numachip_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+       if (!strncmp(oem_id, "NUMASC", 6)) {
+               numachip_system = 1;
+               return 1;
+       }
+
+       return 0;
+}
+
+static struct apic apic_numachip __refconst = {
+
+       .name                           = "NumaConnect system",
+       .probe                          = numachip_probe,
+       .acpi_madt_oem_check            = numachip_acpi_madt_oem_check,
+       .apic_id_registered             = numachip_apic_id_registered,
+
+       .irq_delivery_mode              = dest_Fixed,
+       .irq_dest_mode                  = 0, /* physical */
+
+       .target_cpus                    = numachip_target_cpus,
+       .disable_esr                    = 0,
+       .dest_logical                   = 0,
+       .check_apicid_used              = NULL,
+       .check_apicid_present           = NULL,
+
+       .vector_allocation_domain       = numachip_vector_allocation_domain,
+       .init_apic_ldr                  = flat_init_apic_ldr,
+
+       .ioapic_phys_id_map             = NULL,
+       .setup_apic_routing             = NULL,
+       .multi_timer_check              = NULL,
+       .cpu_present_to_apicid          = default_cpu_present_to_apicid,
+       .apicid_to_cpu_present          = NULL,
+       .setup_portio_remap             = NULL,
+       .check_phys_apicid_present      = default_check_phys_apicid_present,
+       .enable_apic_mode               = NULL,
+       .phys_pkg_id                    = numachip_phys_pkg_id,
+       .mps_oem_check                  = NULL,
+
+       .get_apic_id                    = get_apic_id,
+       .set_apic_id                    = set_apic_id,
+       .apic_id_mask                   = 0xffU << 24,
+
+       .cpu_mask_to_apicid             = numachip_cpu_mask_to_apicid,
+       .cpu_mask_to_apicid_and         = numachip_cpu_mask_to_apicid_and,
+
+       .send_IPI_mask                  = numachip_send_IPI_mask,
+       .send_IPI_mask_allbutself       = numachip_send_IPI_mask_allbutself,
+       .send_IPI_allbutself            = numachip_send_IPI_allbutself,
+       .send_IPI_all                   = numachip_send_IPI_all,
+       .send_IPI_self                  = numachip_send_IPI_self,
+
+       .wakeup_secondary_cpu           = numachip_wakeup_secondary,
+       .trampoline_phys_low            = DEFAULT_TRAMPOLINE_PHYS_LOW,
+       .trampoline_phys_high           = DEFAULT_TRAMPOLINE_PHYS_HIGH,
+       .wait_for_init_deassert         = NULL,
+       .smp_callin_clear_local_apic    = NULL,
+       .inquire_remote_apic            = NULL, /* REMRD not supported */
+
+       .read                           = native_apic_mem_read,
+       .write                          = native_apic_mem_write,
+       .icr_read                       = native_apic_icr_read,
+       .icr_write                      = native_apic_icr_write,
+       .wait_icr_idle                  = native_apic_wait_icr_idle,
+       .safe_wait_icr_idle             = native_safe_apic_wait_icr_idle,
+};
+apic_driver(apic_numachip);
+
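
To make the NumaChip APIC-id packing concrete: get_apic_id() keeps the low byte from the usual APIC_ID register and folds in node bits from MSR_FAM10H_NODE_ID, while set_apic_id() can only restore the low byte. A small stand-alone check of the arithmetic (the MSR value is made up purely for illustration):

    #include <assert.h>

    static unsigned int pack_numachip_id(unsigned long apic_reg,
                                         unsigned long node_msr)
    {
            return ((apic_reg >> 24) & 0xffU) | ((node_msr << 2) & 0x3f00U);
    }

    int main(void)
    {
            /* local APIC id 0x05; node bits contribute 0x0100 */
            assert(pack_numachip_id(0x05000000UL, 0x43UL) == 0x0105);
            return 0;
    }
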
index 6d939d7..fb07275 100644 (file)
@@ -2421,8 +2421,8 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
        unsigned vector, me;
 
        ack_APIC_irq();
-       exit_idle();
        irq_enter();
+       exit_idle();
 
        me = smp_processor_id();
        for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
@@ -2948,6 +2948,10 @@ static inline void __init check_timer(void)
        }
        local_irq_disable();
        apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
+       if (x2apic_preenabled)
+               apic_printk(APIC_QUIET, KERN_INFO
+                           "Perhaps problem with the pre-enabled x2apic mode\n"
+                           "Try booting with x2apic and interrupt-remapping disabled in the bios.\n");
        panic("IO-APIC + timer doesn't work!  Boot with apic=debug and send a "
                "report.  Then try booting with the 'noapic' option.\n");
 out:
index 452932d..5da1269 100644 (file)
@@ -62,7 +62,8 @@ early_param("memory_corruption_check_size", set_corruption_check_size);
 
 void __init setup_bios_corruption_check(void)
 {
-       u64 addr = PAGE_SIZE;   /* assume first page is reserved anyway */
+       phys_addr_t start, end;
+       u64 i;
 
        if (memory_corruption_check == -1) {
                memory_corruption_check =
@@ -82,28 +83,23 @@ void __init setup_bios_corruption_check(void)
 
        corruption_check_size = round_up(corruption_check_size, PAGE_SIZE);
 
-       while (addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) {
-               u64 size;
-               addr = memblock_x86_find_in_range_size(addr, &size, PAGE_SIZE);
+       for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) {
+               start = clamp_t(phys_addr_t, round_up(start, PAGE_SIZE),
+                               PAGE_SIZE, corruption_check_size);
+               end = clamp_t(phys_addr_t, round_down(end, PAGE_SIZE),
+                             PAGE_SIZE, corruption_check_size);
+               if (start >= end)
+                       continue;
 
-               if (addr == MEMBLOCK_ERROR)
-                       break;
-
-               if (addr >= corruption_check_size)
-                       break;
-
-               if ((addr + size) > corruption_check_size)
-                       size = corruption_check_size - addr;
-
-               memblock_x86_reserve_range(addr, addr + size, "SCAN RAM");
-               scan_areas[num_scan_areas].addr = addr;
-               scan_areas[num_scan_areas].size = size;
-               num_scan_areas++;
+               memblock_reserve(start, end - start);
+               scan_areas[num_scan_areas].addr = start;
+               scan_areas[num_scan_areas].size = end - start;
 
                /* Assume we've already mapped this early memory */
-               memset(__va(addr), 0, size);
+               memset(__va(start), 0, end - start);
 
-               addr += size;
+               if (++num_scan_areas >= MAX_SCAN_AREAS)
+                       break;
        }
 
        if (num_scan_areas)
index 0bab2b1..f4773f4 100644 (file)
@@ -148,7 +148,6 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
 
 static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
 {
-#ifdef CONFIG_SMP
        /* calling is from identify_secondary_cpu() ? */
        if (!c->cpu_index)
                return;
@@ -192,7 +191,6 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
 
 valid_k7:
        ;
-#endif
 }
 
 static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
@@ -353,6 +351,13 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
        if (node == NUMA_NO_NODE)
                node = per_cpu(cpu_llc_id, cpu);
 
+       /*
+        * If core numbers are inconsistent, it's likely a multi-fabric platform,
+        * so invoke platform-specific handler
+        */
+       if (c->phys_proc_id != node)
+               x86_cpuinit.fixup_cpu_id(c, node);
+
        if (!node_online(node)) {
                /*
                 * Two possibilities here:
index e58d978..159103c 100644 (file)
@@ -278,7 +278,7 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
        }
 #ifdef CONFIG_X86_32
        /* Cyrix III family needs CX8 & PGE explicitly enabled. */
-       if (c->x86_model >= 6 && c->x86_model <= 9) {
+       if (c->x86_model >= 6 && c->x86_model <= 13) {
                rdmsr(MSR_VIA_FCR, lo, hi);
                lo |= (1<<1 | 1<<7);
                wrmsr(MSR_VIA_FCR, lo, hi);
index aa003b1..850f296 100644 (file)
@@ -676,9 +676,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
        if (this_cpu->c_early_init)
                this_cpu->c_early_init(c);
 
-#ifdef CONFIG_SMP
        c->cpu_index = 0;
-#endif
        filter_cpuid_features(c, false);
 
        setup_smep(c);
@@ -764,10 +762,7 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
                c->apicid = c->initial_apicid;
 # endif
 #endif
-
-#ifdef CONFIG_X86_HT
                c->phys_proc_id = c->initial_apicid;
-#endif
        }
 
        setup_smep(c);
@@ -1140,6 +1135,15 @@ static void dbg_restore_debug_regs(void)
 #define dbg_restore_debug_regs()
 #endif /* ! CONFIG_KGDB */
 
+/*
+ * Prints an error when the NUMA node and the configured core number mismatch
+ * and the platform did not override this handler to fix it up
+ */
+void __cpuinit x86_default_fixup_cpu_id(struct cpuinfo_x86 *c, int node)
+{
+       pr_err("NUMA core number %d differs from configured core number %d\n", node, c->phys_proc_id);
+}
+
 /*
  * cpu_init() initializes state that is per-CPU. Some data is already
  * initialized (naturally) in the bootstrap process, such as the GDT
index 1b22dcc..8bacc78 100644 (file)
@@ -1,5 +1,4 @@
 #ifndef ARCH_X86_CPU_H
-
 #define ARCH_X86_CPU_H
 
 struct cpu_model_info {
@@ -35,6 +34,4 @@ extern const struct cpu_dev *const __x86_cpu_dev_start[],
 
 extern void get_cpu_cap(struct cpuinfo_x86 *c);
 extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
-extern void get_cpu_cap(struct cpuinfo_x86 *c);
-
-#endif
+#endif /* ARCH_X86_CPU_H */
index 5231312..3e6ff6c 100644 (file)
@@ -181,7 +181,6 @@ static void __cpuinit trap_init_f00f_bug(void)
 
 static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c)
 {
-#ifdef CONFIG_SMP
        /* calling is from identify_secondary_cpu() ? */
        if (!c->cpu_index)
                return;
@@ -198,7 +197,6 @@ static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c)
                WARN_ONCE(1, "WARNING: SMP operation may be unreliable"
                                    "with B stepping processors.\n");
        }
-#endif
 }
 
 static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
index 319882e..fc4beb3 100644 (file)
@@ -17,6 +17,7 @@
 #include <linux/kernel.h>
 #include <linux/string.h>
 #include <linux/fs.h>
+#include <linux/preempt.h>
 #include <linux/smp.h>
 #include <linux/notifier.h>
 #include <linux/kdebug.h>
@@ -92,6 +93,18 @@ static int mce_raise_notify(unsigned int cmd, struct pt_regs *regs)
        return NMI_HANDLED;
 }
 
+static void mce_irq_ipi(void *info)
+{
+       int cpu = smp_processor_id();
+       struct mce *m = &__get_cpu_var(injectm);
+
+       if (cpumask_test_cpu(cpu, mce_inject_cpumask) &&
+                       m->inject_flags & MCJ_EXCEPTION) {
+               cpumask_clear_cpu(cpu, mce_inject_cpumask);
+               raise_exception(m, NULL);
+       }
+}
+
 /* Inject mce on current CPU */
 static int raise_local(void)
 {
@@ -139,9 +152,10 @@ static void raise_mce(struct mce *m)
                return;
 
 #ifdef CONFIG_X86_LOCAL_APIC
-       if (m->inject_flags & MCJ_NMI_BROADCAST) {
+       if (m->inject_flags & (MCJ_IRQ_BRAODCAST | MCJ_NMI_BROADCAST)) {
                unsigned long start;
                int cpu;
+
                get_online_cpus();
                cpumask_copy(mce_inject_cpumask, cpu_online_mask);
                cpumask_clear_cpu(get_cpu(), mce_inject_cpumask);
@@ -151,13 +165,25 @@ static void raise_mce(struct mce *m)
                            MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM)
                                cpumask_clear_cpu(cpu, mce_inject_cpumask);
                }
-               if (!cpumask_empty(mce_inject_cpumask))
-                       apic->send_IPI_mask(mce_inject_cpumask, NMI_VECTOR);
+               if (!cpumask_empty(mce_inject_cpumask)) {
+                       if (m->inject_flags & MCJ_IRQ_BRAODCAST) {
+                               /*
+                                * don't wait: mce_irq_ipi needs to stay in
+                                * sync with the following raise_local
+                                */
+                               preempt_disable();
+                               smp_call_function_many(mce_inject_cpumask,
+                                       mce_irq_ipi, NULL, 0);
+                               preempt_enable();
+                       } else if (m->inject_flags & MCJ_NMI_BROADCAST)
+                               apic->send_IPI_mask(mce_inject_cpumask,
+                                               NMI_VECTOR);
+               }
                start = jiffies;
                while (!cpumask_empty(mce_inject_cpumask)) {
                        if (!time_before(jiffies, start + 2*HZ)) {
                                printk(KERN_ERR
-                               "Timeout waiting for mce inject NMI %lx\n",
+                               "Timeout waiting for mce inject %lx\n",
                                        *cpumask_bits(mce_inject_cpumask));
                                break;
                        }
index 2af127d..cbe82b5 100644 (file)
@@ -95,13 +95,6 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
 static DEFINE_PER_CPU(struct mce, mces_seen);
 static int                     cpu_missing;
 
-/*
- * CPU/chipset specific EDAC code can register a notifier call here to print
- * MCE errors in a human-readable form.
- */
-ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
-EXPORT_SYMBOL_GPL(x86_mce_decoder_chain);
-
 /* MCA banks polled by the period polling timer for corrected events */
 DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
        [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
@@ -109,6 +102,12 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
 
 static DEFINE_PER_CPU(struct work_struct, mce_work);
 
+/*
+ * CPU/chipset specific EDAC code can register a notifier call here to print
+ * MCE errors in a human-readable form.
+ */
+ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
+
 /* Do initial initialization of a struct mce */
 void mce_setup(struct mce *m)
 {
@@ -119,9 +118,7 @@ void mce_setup(struct mce *m)
        m->time = get_seconds();
        m->cpuvendor = boot_cpu_data.x86_vendor;
        m->cpuid = cpuid_eax(1);
-#ifdef CONFIG_SMP
        m->socketid = cpu_data(m->extcpu).phys_proc_id;
-#endif
        m->apicid = cpu_data(m->extcpu).initial_apicid;
        rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
 }
@@ -190,6 +187,57 @@ void mce_log(struct mce *mce)
        set_bit(0, &mce_need_notify);
 }
 
+static void drain_mcelog_buffer(void)
+{
+       unsigned int next, i, prev = 0;
+
+       next = rcu_dereference_check_mce(mcelog.next);
+
+       do {
+               struct mce *m;
+
+               /* drain what was logged during boot */
+               for (i = prev; i < next; i++) {
+                       unsigned long start = jiffies;
+                       unsigned retries = 1;
+
+                       m = &mcelog.entry[i];
+
+                       while (!m->finished) {
+                               if (time_after_eq(jiffies, start + 2*retries))
+                                       retries++;
+
+                               cpu_relax();
+
+                               if (!m->finished && retries >= 4) {
+                                       pr_err("MCE: skipping error being logged currently!\n");
+                                       break;
+                               }
+                       }
+                       smp_rmb();
+                       atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
+               }
+
+               memset(mcelog.entry + prev, 0, (next - prev) * sizeof(*m));
+               prev = next;
+               next = cmpxchg(&mcelog.next, prev, 0);
+       } while (next != prev);
+}
+
+
+void mce_register_decode_chain(struct notifier_block *nb)
+{
+       atomic_notifier_chain_register(&x86_mce_decoder_chain, nb);
+       drain_mcelog_buffer();
+}
+EXPORT_SYMBOL_GPL(mce_register_decode_chain);
+
+void mce_unregister_decode_chain(struct notifier_block *nb)
+{
+       atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
+}
+EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
+
 static void print_mce(struct mce *m)
 {
        int ret = 0;
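
mce_register_decode_chain() wraps the notifier registration and, via drain_mcelog_buffer(), replays records logged before the decoder was loaded. A hedged sketch of a consumer in the EDAC style (all names below are invented):

    static int my_mce_decoder(struct notifier_block *nb, unsigned long val,
                              void *data)
    {
            struct mce *m = data;

            pr_info("MCE on CPU %d, status 0x%llx\n", m->extcpu, m->status);
            return NOTIFY_OK;
    }

    static struct notifier_block my_mce_dec_nb = {
            .notifier_call  = my_mce_decoder,
    };

    static int __init my_decoder_init(void)
    {
            mce_register_decode_chain(&my_mce_dec_nb);
            return 0;
    }
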
index f547421..1d76872 100644 (file)
@@ -64,11 +64,9 @@ struct threshold_bank {
 };
 static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks);
 
-#ifdef CONFIG_SMP
 static unsigned char shared_bank[NR_BANKS] = {
        0, 0, 0, 0, 1
 };
-#endif
 
 static DEFINE_PER_CPU(unsigned char, bank_map);        /* see which banks are on */
 
@@ -202,10 +200,9 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
 
                        if (!block)
                                per_cpu(bank_map, cpu) |= (1 << bank);
-#ifdef CONFIG_SMP
                        if (shared_bank[bank] && c->cpu_core_id)
                                break;
-#endif
+
                        offset = setup_APIC_mce(offset,
                                                (high & MASK_LVTOFF_HI) >> 20);
 
@@ -531,7 +528,6 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 
        sprintf(name, "threshold_bank%i", bank);
 
-#ifdef CONFIG_SMP
        if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) {   /* symlink */
                i = cpumask_first(cpu_llc_shared_mask(cpu));
 
@@ -558,7 +554,6 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 
                goto out;
        }
-#endif
 
        b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL);
        if (!b) {
index 787e06c..39c6089 100644 (file)
@@ -323,17 +323,6 @@ device_initcall(thermal_throttle_init_device);
 
 #endif /* CONFIG_SYSFS */
 
-/*
- * Set up the most two significant bit to notify mce log that this thermal
- * event type.
- * This is a temp solution. May be changed in the future with mce log
- * infrasture.
- */
-#define CORE_THROTTLED         (0)
-#define CORE_POWER_LIMIT       ((__u64)1 << 62)
-#define PACKAGE_THROTTLED      ((__u64)2 << 62)
-#define PACKAGE_POWER_LIMIT    ((__u64)3 << 62)
-
 static void notify_thresholds(__u64 msr_val)
 {
        /* check whether the interrupt handler is defined;
@@ -363,27 +352,23 @@ static void intel_thermal_interrupt(void)
        if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
                                THERMAL_THROTTLING_EVENT,
                                CORE_LEVEL) != 0)
-               mce_log_therm_throt_event(CORE_THROTTLED | msr_val);
+               mce_log_therm_throt_event(msr_val);
 
        if (this_cpu_has(X86_FEATURE_PLN))
-               if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
+               therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
                                        POWER_LIMIT_EVENT,
-                                       CORE_LEVEL) != 0)
-                       mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val);
+                                       CORE_LEVEL);
 
        if (this_cpu_has(X86_FEATURE_PTS)) {
                rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
-               if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
+               therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
                                        THERMAL_THROTTLING_EVENT,
-                                       PACKAGE_LEVEL) != 0)
-                       mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val);
+                                       PACKAGE_LEVEL);
                if (this_cpu_has(X86_FEATURE_PLN))
-                       if (therm_throt_process(msr_val &
+                       therm_throt_process(msr_val &
                                        PACKAGE_THERM_STATUS_POWER_LIMIT,
                                        POWER_LIMIT_EVENT,
-                                       PACKAGE_LEVEL) != 0)
-                               mce_log_therm_throt_event(PACKAGE_POWER_LIMIT
-                                                         | msr_val);
+                                       PACKAGE_LEVEL);
        }
 }
 
@@ -397,8 +382,8 @@ static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;
 
 asmlinkage void smp_thermal_interrupt(struct pt_regs *regs)
 {
-       exit_idle();
        irq_enter();
+       exit_idle();
        inc_irq_stat(irq_thermal_count);
        smp_thermal_vector();
        irq_exit();
index d746df2..aa578ca 100644 (file)
@@ -19,8 +19,8 @@ void (*mce_threshold_vector)(void) = default_threshold_interrupt;
 
 asmlinkage void smp_threshold_interrupt(void)
 {
-       exit_idle();
        irq_enter();
+       exit_idle();
        inc_irq_stat(irq_threshold_count);
        mce_threshold_vector();
        irq_exit();
index 2bda212..5adce10 100644 (file)
@@ -484,18 +484,195 @@ static inline int is_x86_event(struct perf_event *event)
        return event->pmu == &pmu;
 }
 
+/*
+ * Event scheduler state:
+ *
+ * Assign events iterating over all events and counters, beginning
+ * with events with least weights first. Keep the current iterator
+ * state in struct sched_state.
+ */
+struct sched_state {
+       int     weight;
+       int     event;          /* event index */
+       int     counter;        /* counter index */
+       int     unassigned;     /* number of events to be assigned left */
+       unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+};
+
+/* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */
+#define        SCHED_STATES_MAX        2
+
+struct perf_sched {
+       int                     max_weight;
+       int                     max_events;
+       struct event_constraint **constraints;
+       struct sched_state      state;
+       int                     saved_states;
+       struct sched_state      saved[SCHED_STATES_MAX];
+};
+
+/*
+ * Initialize the iterator that runs through all events and counters.
+ */
+static void perf_sched_init(struct perf_sched *sched, struct event_constraint **c,
+                           int num, int wmin, int wmax)
+{
+       int idx;
+
+       memset(sched, 0, sizeof(*sched));
+       sched->max_events       = num;
+       sched->max_weight       = wmax;
+       sched->constraints      = c;
+
+       for (idx = 0; idx < num; idx++) {
+               if (c[idx]->weight == wmin)
+                       break;
+       }
+
+       sched->state.event      = idx;          /* start with min weight */
+       sched->state.weight     = wmin;
+       sched->state.unassigned = num;
+}
+
+static void perf_sched_save_state(struct perf_sched *sched)
+{
+       if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX))
+               return;
+
+       sched->saved[sched->saved_states] = sched->state;
+       sched->saved_states++;
+}
+
+static bool perf_sched_restore_state(struct perf_sched *sched)
+{
+       if (!sched->saved_states)
+               return false;
+
+       sched->saved_states--;
+       sched->state = sched->saved[sched->saved_states];
+
+       /* continue with next counter: */
+       clear_bit(sched->state.counter++, sched->state.used);
+
+       return true;
+}
+
+/*
+ * Select a counter for the current event to schedule. Return true on
+ * success.
+ */
+static bool __perf_sched_find_counter(struct perf_sched *sched)
+{
+       struct event_constraint *c;
+       int idx;
+
+       if (!sched->state.unassigned)
+               return false;
+
+       if (sched->state.event >= sched->max_events)
+               return false;
+
+       c = sched->constraints[sched->state.event];
+
+       /* Prefer fixed purpose counters */
+       if (x86_pmu.num_counters_fixed) {
+               idx = X86_PMC_IDX_FIXED;
+               for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_MAX) {
+                       if (!__test_and_set_bit(idx, sched->state.used))
+                               goto done;
+               }
+       }
+       /* Grab the first unused counter starting with idx */
+       idx = sched->state.counter;
+       for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_FIXED) {
+               if (!__test_and_set_bit(idx, sched->state.used))
+                       goto done;
+       }
+
+       return false;
+
+done:
+       sched->state.counter = idx;
+
+       if (c->overlap)
+               perf_sched_save_state(sched);
+
+       return true;
+}
+
+static bool perf_sched_find_counter(struct perf_sched *sched)
+{
+       while (!__perf_sched_find_counter(sched)) {
+               if (!perf_sched_restore_state(sched))
+                       return false;
+       }
+
+       return true;
+}
+
+/*
+ * Go through all unassigned events and find the next one to schedule.
+ * Take events with the least weight first. Return true on success.
+ */
+static bool perf_sched_next_event(struct perf_sched *sched)
+{
+       struct event_constraint *c;
+
+       if (!sched->state.unassigned || !--sched->state.unassigned)
+               return false;
+
+       do {
+               /* next event */
+               sched->state.event++;
+               if (sched->state.event >= sched->max_events) {
+                       /* next weight */
+                       sched->state.event = 0;
+                       sched->state.weight++;
+                       if (sched->state.weight > sched->max_weight)
+                               return false;
+               }
+               c = sched->constraints[sched->state.event];
+       } while (c->weight != sched->state.weight);
+
+       sched->state.counter = 0;       /* start with first counter */
+
+       return true;
+}
+
+/*
+ * Assign a counter for each event.
+ */
+static int perf_assign_events(struct event_constraint **constraints, int n,
+                             int wmin, int wmax, int *assign)
+{
+       struct perf_sched sched;
+
+       perf_sched_init(&sched, constraints, n, wmin, wmax);
+
+       do {
+               if (!perf_sched_find_counter(&sched))
+                       break;  /* failed */
+               if (assign)
+                       assign[sched.state.event] = sched.state.counter;
+       } while (perf_sched_next_event(&sched));
+
+       return sched.state.unassigned;
+}
+
 int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 {
        struct event_constraint *c, *constraints[X86_PMC_IDX_MAX];
        unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
-       int i, j, w, wmax, num = 0;
+       int i, wmin, wmax, num = 0;
        struct hw_perf_event *hwc;
 
        bitmap_zero(used_mask, X86_PMC_IDX_MAX);
 
-       for (i = 0; i < n; i++) {
+       for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
                c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]);
                constraints[i] = c;
+               wmin = min(wmin, c->weight);
+               wmax = max(wmax, c->weight);
        }
 
        /*
@@ -521,59 +698,11 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
                if (assign)
                        assign[i] = hwc->idx;
        }
-       if (i == n)
-               goto done;
-
-       /*
-        * begin slow path
-        */
-
-       bitmap_zero(used_mask, X86_PMC_IDX_MAX);
 
-       /*
-        * weight = number of possible counters
-        *
-        * 1    = most constrained, only works on one counter
-        * wmax = least constrained, works on any counter
-        *
-        * assign events to counters starting with most
-        * constrained events.
-        */
-       wmax = x86_pmu.num_counters;
+       /* slow path */
+       if (i != n)
+               num = perf_assign_events(constraints, n, wmin, wmax, assign);
 
-       /*
-        * when fixed event counters are present,
-        * wmax is incremented by 1 to account
-        * for one more choice
-        */
-       if (x86_pmu.num_counters_fixed)
-               wmax++;
-
-       for (w = 1, num = n; num && w <= wmax; w++) {
-               /* for each event */
-               for (i = 0; num && i < n; i++) {
-                       c = constraints[i];
-                       hwc = &cpuc->event_list[i]->hw;
-
-                       if (c->weight != w)
-                               continue;
-
-                       for_each_set_bit(j, c->idxmsk, X86_PMC_IDX_MAX) {
-                               if (!test_bit(j, used_mask))
-                                       break;
-                       }
-
-                       if (j == X86_PMC_IDX_MAX)
-                               break;
-
-                       __set_bit(j, used_mask);
-
-                       if (assign)
-                               assign[i] = j;
-                       num--;
-               }
-       }
-done:
        /*
         * scheduling failed or is just a simulation,
         * free resources if necessary
@@ -1119,6 +1248,7 @@ static void __init pmu_check_apic(void)
 
 static int __init init_hw_perf_events(void)
 {
+       struct x86_pmu_quirk *quirk;
        struct event_constraint *c;
        int err;
 
@@ -1147,8 +1277,8 @@ static int __init init_hw_perf_events(void)
 
        pr_cont("%s PMU driver.\n", x86_pmu.name);
 
-       if (x86_pmu.quirks)
-               x86_pmu.quirks();
+       for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next)
+               quirk->func();
 
        if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
                WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
@@ -1171,12 +1301,18 @@ static int __init init_hw_perf_events(void)
 
        unconstrained = (struct event_constraint)
                __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
-                                  0, x86_pmu.num_counters);
+                                  0, x86_pmu.num_counters, 0);
 
        if (x86_pmu.event_constraints) {
+               /*
+                * event on fixed counter2 (REF_CYCLES) only works on this
+                * counter, so do not extend mask to generic counters
+                */
                for_each_event_constraint(c, x86_pmu.event_constraints) {
-                       if (c->cmask != X86_RAW_EVENT_MASK)
+                       if (c->cmask != X86_RAW_EVENT_MASK
+                           || c->idxmsk64 == X86_PMC_MSK_FIXED_REF_CYCLES) {
                                continue;
+                       }
 
                        c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
                        c->weight += x86_pmu.num_counters;
@@ -1566,3 +1702,15 @@ unsigned long perf_misc_flags(struct pt_regs *regs)
 
        return misc;
 }
+
+void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
+{
+       cap->version            = x86_pmu.version;
+       cap->num_counters_gp    = x86_pmu.num_counters;
+       cap->num_counters_fixed = x86_pmu.num_counters_fixed;
+       cap->bit_width_gp       = x86_pmu.cntval_bits;
+       cap->bit_width_fixed    = x86_pmu.cntval_bits;
+       cap->events_mask        = (unsigned int)x86_pmu.events_maskl;
+       cap->events_mask_len    = x86_pmu.events_mask_len;
+}
+EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);
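
perf_get_x86_pmu_capability() is exported so code outside the core PMU driver (KVM-style PMU virtualization, presumably) can size itself against the hardware. A hedged sketch of a caller (the function below is invented; the struct fields are the ones filled in above):

    static void report_pmu_limits(void)
    {
            struct x86_pmu_capability cap;

            perf_get_x86_pmu_capability(&cap);
            pr_info("PMU v%d: %d GP counters (%d bits), %d fixed counters\n",
                    cap.version, cap.num_counters_gp, cap.bit_width_gp,
                    cap.num_counters_fixed);
    }
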
index b9698d4..8944062 100644 (file)
@@ -45,6 +45,7 @@ struct event_constraint {
        u64     code;
        u64     cmask;
        int     weight;
+       int     overlap;
 };
 
 struct amd_nb {
@@ -151,15 +152,40 @@ struct cpu_hw_events {
        void                            *kfree_on_online;
 };
 
-#define __EVENT_CONSTRAINT(c, n, m, w) {\
+#define __EVENT_CONSTRAINT(c, n, m, w, o) {\
        { .idxmsk64 = (n) },            \
        .code = (c),                    \
        .cmask = (m),                   \
        .weight = (w),                  \
+       .overlap = (o),                 \
 }
 
 #define EVENT_CONSTRAINT(c, n, m)      \
-       __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n))
+       __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0)
+
+/*
+ * The overlap flag marks event constraints with overlapping counter
+ * masks. This is the case if the counter mask of such an event is not
+ * a subset of any other counter mask of a constraint with an equal or
+ * higher weight, e.g.:
+ *
+ *  c_overlaps = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0);
+ *  c_another1 = EVENT_CONSTRAINT(0, 0x07, 0);
+ *  c_another2 = EVENT_CONSTRAINT(0, 0x38, 0);
+ *
+ * The event scheduler may not select the correct counter in the first
+ * cycle because it needs to know which subsequent events will be
+ * scheduled. It may fail to schedule the events then. So we set the
+ * overlap flag for such constraints to give the scheduler a hint which
+ * events to select for counter rescheduling.
+ *
+ * Care must be taken as the rescheduling algorithm is O(n!), which
+ * will increase scheduling cycles for an over-committed system
+ * dramatically.  The number of such EVENT_CONSTRAINT_OVERLAP() macros
+ * and their counter masks must be kept to a minimum.
+ */
+#define EVENT_CONSTRAINT_OVERLAP(c, n, m)      \
+       __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 1)
 
 /*
  * Constraint on the Event code.
@@ -235,6 +261,11 @@ union perf_capabilities {
        u64     capabilities;
 };
 
+struct x86_pmu_quirk {
+       struct x86_pmu_quirk *next;
+       void (*func)(void);
+};
+
 /*
  * struct x86_pmu - generic x86 pmu
  */
@@ -259,6 +290,11 @@ struct x86_pmu {
        int             num_counters_fixed;
        int             cntval_bits;
        u64             cntval_mask;
+       union {
+                       unsigned long events_maskl;
+                       unsigned long events_mask[BITS_TO_LONGS(ARCH_PERFMON_EVENTS_COUNT)];
+       };
+       int             events_mask_len;
        int             apic;
        u64             max_period;
        struct event_constraint *
@@ -268,7 +304,7 @@ struct x86_pmu {
        void            (*put_event_constraints)(struct cpu_hw_events *cpuc,
                                                 struct perf_event *event);
        struct event_constraint *event_constraints;
-       void            (*quirks)(void);
+       struct x86_pmu_quirk *quirks;
        int             perfctr_second_write;
 
        int             (*cpu_prepare)(int cpu);
@@ -309,6 +345,15 @@ struct x86_pmu {
        struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr);
 };
 
+#define x86_add_quirk(func_)                                           \
+do {                                                                   \
+       static struct x86_pmu_quirk __quirk __initdata = {              \
+               .func = func_,                                          \
+       };                                                              \
+       __quirk.next = x86_pmu.quirks;                                  \
+       x86_pmu.quirks = &__quirk;                                      \
+} while (0)
+
 #define ERF_NO_HT_SHARING      1
 #define ERF_HAS_RSP_1          2
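
With quirks now kept on a singly linked list, a CPU-model init path queues them with x86_add_quirk() and init_hw_perf_events() runs the list once at boot. A hedged sketch (the quirk and its caller are invented):

    static __init void my_model_quirk(void)
    {
            pr_warn("applying made-up PMU quirk\n");
    }

    static __init int my_vendor_pmu_init(void)
    {
            /* queued here, executed later by init_hw_perf_events() */
            x86_add_quirk(my_model_quirk);
            return 0;
    }
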
 
index aeefd45..0397b23 100644 (file)
@@ -492,7 +492,7 @@ static __initconst const struct x86_pmu amd_pmu = {
 static struct event_constraint amd_f15_PMC0  = EVENT_CONSTRAINT(0, 0x01, 0);
 static struct event_constraint amd_f15_PMC20 = EVENT_CONSTRAINT(0, 0x07, 0);
 static struct event_constraint amd_f15_PMC3  = EVENT_CONSTRAINT(0, 0x08, 0);
-static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT(0, 0x09, 0);
+static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0);
 static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0);
 static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0);
 
index 121f1be..3bd37bd 100644 (file)
@@ -28,6 +28,7 @@ static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly =
   [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]  = 0x00c4,
   [PERF_COUNT_HW_BRANCH_MISSES]                = 0x00c5,
   [PERF_COUNT_HW_BUS_CYCLES]           = 0x013c,
+  [PERF_COUNT_HW_REF_CPU_CYCLES]       = 0x0300, /* pseudo-encoding */
 };
 
 static struct event_constraint intel_core_event_constraints[] __read_mostly =
@@ -45,12 +46,7 @@ static struct event_constraint intel_core2_event_constraints[] __read_mostly =
 {
        FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
        FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
-       /*
-        * Core2 has Fixed Counter 2 listed as CPU_CLK_UNHALTED.REF and event
-        * 0x013c as CPU_CLK_UNHALTED.BUS and specifies there is a fixed
-        * ratio between these counters.
-        */
-       /* FIXED_EVENT_CONSTRAINT(0x013c, 2),  CPU_CLK_UNHALTED.REF */
+       FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
        INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
        INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
        INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
@@ -68,7 +64,7 @@ static struct event_constraint intel_nehalem_event_constraints[] __read_mostly =
 {
        FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
        FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
-       /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
+       FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
        INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */
        INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */
        INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */
@@ -90,7 +86,7 @@ static struct event_constraint intel_westmere_event_constraints[] __read_mostly
 {
        FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
        FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
-       /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
+       FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
        INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
        INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */
        INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
@@ -102,7 +98,7 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly =
 {
        FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
        FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
-       /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
+       FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
        INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */
        INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
        INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
@@ -125,7 +121,7 @@ static struct event_constraint intel_gen_event_constraints[] __read_mostly =
 {
        FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
        FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
-       /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
+       FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
        EVENT_CONSTRAINT_END
 };
 
@@ -1519,7 +1515,7 @@ static __initconst const struct x86_pmu intel_pmu = {
        .guest_get_msrs         = intel_guest_get_msrs,
 };
 
-static void intel_clovertown_quirks(void)
+static __init void intel_clovertown_quirk(void)
 {
        /*
         * PEBS is unreliable due to:
@@ -1545,19 +1541,60 @@ static void intel_clovertown_quirks(void)
        x86_pmu.pebs_constraints = NULL;
 }
 
-static void intel_sandybridge_quirks(void)
+static __init void intel_sandybridge_quirk(void)
 {
        printk(KERN_WARNING "PEBS disabled due to CPU errata.\n");
        x86_pmu.pebs = 0;
        x86_pmu.pebs_constraints = NULL;
 }
 
+static const struct { int id; char *name; } intel_arch_events_map[] __initconst = {
+       { PERF_COUNT_HW_CPU_CYCLES, "cpu cycles" },
+       { PERF_COUNT_HW_INSTRUCTIONS, "instructions" },
+       { PERF_COUNT_HW_BUS_CYCLES, "bus cycles" },
+       { PERF_COUNT_HW_CACHE_REFERENCES, "cache references" },
+       { PERF_COUNT_HW_CACHE_MISSES, "cache misses" },
+       { PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branch instructions" },
+       { PERF_COUNT_HW_BRANCH_MISSES, "branch misses" },
+};
+
+static __init void intel_arch_events_quirk(void)
+{
+       int bit;
+
+       /* disable events that CPUID reports as not present */
+       for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) {
+               intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0;
+               printk(KERN_WARNING "CPUID marked event: \'%s\' unavailable\n",
+                               intel_arch_events_map[bit].name);
+       }
+}
+
+static __init void intel_nehalem_quirk(void)
+{
+       union cpuid10_ebx ebx;
+
+       ebx.full = x86_pmu.events_maskl;
+       if (ebx.split.no_branch_misses_retired) {
+               /*
+                * Erratum AAJ80 detected, we work it around by using
+                * the BR_MISP_EXEC.ANY event. This will over-count
+                * branch-misses, but it's still much better than the
+                * architectural event which is often completely bogus:
+                */
+               intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89;
+               ebx.split.no_branch_misses_retired = 0;
+               x86_pmu.events_maskl = ebx.full;
+               printk(KERN_INFO "CPU erratum AAJ80 worked around\n");
+       }
+}
+
 __init int intel_pmu_init(void)
 {
        union cpuid10_edx edx;
        union cpuid10_eax eax;
+       union cpuid10_ebx ebx;
        unsigned int unused;
-       unsigned int ebx;
        int version;
 
        if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
@@ -1574,8 +1611,8 @@ __init int intel_pmu_init(void)
         * Check whether the Architectural PerfMon supports
         * Branch Misses Retired hw_event or not.
         */
-       cpuid(10, &eax.full, &ebx, &unused, &edx.full);
-       if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
+       cpuid(10, &eax.full, &ebx.full, &unused, &edx.full);
+       if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT)
                return -ENODEV;
 
        version = eax.split.version_id;
@@ -1589,6 +1626,9 @@ __init int intel_pmu_init(void)
        x86_pmu.cntval_bits             = eax.split.bit_width;
        x86_pmu.cntval_mask             = (1ULL << eax.split.bit_width) - 1;
 
+       x86_pmu.events_maskl            = ebx.full;
+       x86_pmu.events_mask_len         = eax.split.mask_length;
+
        /*
         * Quirk: v2 perfmon does not report fixed-purpose events, so
         * assume at least 3 events:
@@ -1608,6 +1648,8 @@ __init int intel_pmu_init(void)
 
        intel_ds_init();
 
+       x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */
+
        /*
         * Install the hw-cache-events table:
         */
@@ -1617,7 +1659,7 @@ __init int intel_pmu_init(void)
                break;
 
        case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
-               x86_pmu.quirks = intel_clovertown_quirks;
+               x86_add_quirk(intel_clovertown_quirk);
        case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
        case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
        case 29: /* six-core 45 nm xeon "Dunnington" */
@@ -1651,17 +1693,8 @@ __init int intel_pmu_init(void)
                /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
                intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1;
 
-               if (ebx & 0x40) {
-                       /*
-                        * Erratum AAJ80 detected, we work it around by using
-                        * the BR_MISP_EXEC.ANY event. This will over-count
-                        * branch-misses, but it's still much better than the
-                        * architectural event which is often completely bogus:
-                        */
-                       intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89;
+               x86_add_quirk(intel_nehalem_quirk);
 
-                       pr_cont("erratum AAJ80 worked around, ");
-               }
                pr_cont("Nehalem events, ");
                break;
 
@@ -1701,7 +1734,7 @@ __init int intel_pmu_init(void)
                break;
 
        case 42: /* SandyBridge */
-               x86_pmu.quirks = intel_sandybridge_quirks;
+               x86_add_quirk(intel_sandybridge_quirk);
+       case 45: /* SandyBridge, "Romley-EP" */
                memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
                       sizeof(hw_cache_event_ids));
@@ -1738,5 +1771,6 @@ __init int intel_pmu_init(void)
                        break;
                }
        }
+
        return 0;
 }
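
The new events_maskl/events_mask_len plumbing and intel_arch_events_quirk() above are driven by CPUID leaf 0xA: EAX[31:24] reports how many EBX bits are valid, and each set EBX bit flags one architectural event as unavailable (bit 6, branch mispredicts retired, is the bit the AAJ80 quirk inspects). A standalone user-space sketch of the same check, using GCC's <cpuid.h> and the SDM's event names rather than the kernel's table:

	#include <stdio.h>
	#include <cpuid.h>

	int main(void)
	{
		static const char *const ev[] = {
			"core cycles", "instructions retired", "reference cycles",
			"LLC references", "LLC misses",
			"branches retired", "branch mispredicts retired",
		};
		unsigned int eax, ebx, ecx, edx, i, len;

		if (!__get_cpuid(0x0a, &eax, &ebx, &ecx, &edx))
			return 1;	/* no architectural perfmon leaf */

		len = (eax >> 24) & 0xff;		/* valid EBX bits */
		for (i = 0; i < len && i < 7; i++)
			if (ebx & (1u << i))		/* set => unavailable */
				printf("CPUID: '%s' not available\n", ev[i]);
		return 0;
	}
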
index 5abbea2..7b3fe56 100644 (file)
@@ -16,5 +16,6 @@ const char *const x86_power_flags[32] = {
        "100mhzsteps",
        "hwpstate",
        "",     /* tsc invariant mapped to constant_tsc */
-               /* nothing */
+       "cpb",  /* core performance boost */
+       "eff_freq_ro", /* Readonly aperf/mperf */
 };
index 14b2314..8022c66 100644 (file)
@@ -64,12 +64,10 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
 static int show_cpuinfo(struct seq_file *m, void *v)
 {
        struct cpuinfo_x86 *c = v;
-       unsigned int cpu = 0;
+       unsigned int cpu;
        int i;
 
-#ifdef CONFIG_SMP
        cpu = c->cpu_index;
-#endif
        seq_printf(m, "processor\t: %u\n"
                   "vendor_id\t: %s\n"
                   "cpu family\t: %d\n"
index 303a0e4..8071e2f 100644 (file)
@@ -738,35 +738,17 @@ core_initcall(e820_mark_nvs_memory);
 /*
  * pre allocated 4k and reserved it in memblock and e820_saved
  */
-u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
+u64 __init early_reserve_e820(u64 size, u64 align)
 {
-       u64 size = 0;
        u64 addr;
-       u64 start;
 
-       for (start = startt; ; start += size) {
-               start = memblock_x86_find_in_range_size(start, &size, align);
-               if (start == MEMBLOCK_ERROR)
-                       return 0;
-               if (size >= sizet)
-                       break;
+       addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
+       if (addr) {
+               e820_update_range_saved(addr, size, E820_RAM, E820_RESERVED);
+               printk(KERN_INFO "update e820_saved for early_reserve_e820\n");
+               update_e820_saved();
        }
 
-#ifdef CONFIG_X86_32
-       if (start >= MAXMEM)
-               return 0;
-       if (start + size > MAXMEM)
-               size = MAXMEM - start;
-#endif
-
-       addr = round_down(start + size - sizet, align);
-       if (addr < start)
-               return 0;
-       memblock_x86_reserve_range(addr, addr + sizet, "new next");
-       e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED);
-       printk(KERN_INFO "update e820_saved for early_reserve_e820\n");
-       update_e820_saved();
-
        return addr;
 }
 
@@ -1090,7 +1072,7 @@ void __init memblock_x86_fill(void)
         * We are safe to enable resizing, because memblock_x86_fill()
         * runs rather late for x86
         */
-       memblock_can_resize = 1;
+       memblock_allow_resize();
 
        for (i = 0; i < e820.nr_map; i++) {
                struct e820entry *ei = &e820.map[i];
@@ -1105,22 +1087,36 @@ void __init memblock_x86_fill(void)
                memblock_add(ei->addr, ei->size);
        }
 
-       memblock_analyze();
        memblock_dump_all();
 }
 
 void __init memblock_find_dma_reserve(void)
 {
 #ifdef CONFIG_X86_64
-       u64 free_size_pfn;
-       u64 mem_size_pfn;
+       u64 nr_pages = 0, nr_free_pages = 0;
+       unsigned long start_pfn, end_pfn;
+       phys_addr_t start, end;
+       int i;
+       u64 u;
+
        /*
         * need to find out used area below MAX_DMA_PFN
         * need to use memblock to get free size in [0, MAX_DMA_PFN]
         * at first, and assume boot_mem will not take below MAX_DMA_PFN
         */
-       mem_size_pfn = memblock_x86_memory_in_range(0, MAX_DMA_PFN << PAGE_SHIFT) >> PAGE_SHIFT;
-       free_size_pfn = memblock_x86_free_memory_in_range(0, MAX_DMA_PFN << PAGE_SHIFT) >> PAGE_SHIFT;
-       set_dma_reserve(mem_size_pfn - free_size_pfn);
+       for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
+               start_pfn = min_t(unsigned long, start_pfn, MAX_DMA_PFN);
+               end_pfn = min_t(unsigned long, end_pfn, MAX_DMA_PFN);
+               nr_pages += end_pfn - start_pfn;
+       }
+
+       for_each_free_mem_range(u, MAX_NUMNODES, &start, &end, NULL) {
+               start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN);
+               end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN);
+               if (start_pfn < end_pfn)
+                       nr_free_pages += end_pfn - start_pfn;
+       }
+
+       set_dma_reserve(nr_pages - nr_free_pages);
 #endif
 }
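
For a concrete feel of the reworked memblock_find_dma_reserve() arithmetic (numbers made up): if the memory ranges below MAX_DMA_PFN add up to 4096 pages and the free-range walk still finds 3840 of them free, set_dma_reserve() is handed 4096 - 3840 = 256 pages, i.e. the memory under the DMA limit that the boot-time allocator has already claimed.
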
index f3f6f53..22d0e21 100644 (file)
@@ -625,6 +625,8 @@ work_notifysig:                             # deal with pending signals and
        movl %esp, %eax
        jne work_notifysig_v86          # returning to kernel-space or
                                        # vm86-space
+       TRACE_IRQS_ON
+       ENABLE_INTERRUPTS(CLBR_NONE)
        xorl %edx, %edx
        call do_notify_resume
        jmp resume_userspace_sig
@@ -638,6 +640,8 @@ work_notifysig_v86:
 #else
        movl %esp, %eax
 #endif
+       TRACE_IRQS_ON
+       ENABLE_INTERRUPTS(CLBR_NONE)
        xorl %edx, %edx
        call do_notify_resume
        jmp resume_userspace_sig
index faf8d5e..a20e1cb 100644 (file)
@@ -221,7 +221,7 @@ ENDPROC(native_usergs_sysret64)
        /*CFI_REL_OFFSET        ss,0*/
        pushq_cfi %rax /* rsp */
        CFI_REL_OFFSET  rsp,0
-       pushq_cfi $X86_EFLAGS_IF /* eflags - interrupts on */
+       pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_BIT1) /* eflags - interrupts on */
        /*CFI_REL_OFFSET        rflags,0*/
        pushq_cfi $__KERNEL_CS /* cs */
        /*CFI_REL_OFFSET        cs,0*/
@@ -411,7 +411,7 @@ ENTRY(ret_from_fork)
        RESTORE_REST
 
        testl $3, CS-ARGOFFSET(%rsp)            # from kernel_thread?
-       je   int_ret_from_sys_call
+       jz   retint_restore_args
 
        testl $_TIF_IA32, TI_flags(%rcx)        # 32-bit compat task needs IRET
        jnz  int_ret_from_sys_call
@@ -465,7 +465,7 @@ ENTRY(system_call)
         * after the swapgs, so that it can do the swapgs
         * for the guest and jump here on syscall.
         */
-ENTRY(system_call_after_swapgs)
+GLOBAL(system_call_after_swapgs)
 
        movq    %rsp,PER_CPU_VAR(old_rsp)
        movq    PER_CPU_VAR(kernel_stack),%rsp
@@ -478,8 +478,7 @@ ENTRY(system_call_after_swapgs)
        movq  %rax,ORIG_RAX-ARGOFFSET(%rsp)
        movq  %rcx,RIP-ARGOFFSET(%rsp)
        CFI_REL_OFFSET rip,RIP-ARGOFFSET
-       GET_THREAD_INFO(%rcx)
-       testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
+       testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
        jnz tracesys
 system_call_fastpath:
        cmpq $__NR_syscall_max,%rax
@@ -496,10 +495,9 @@ ret_from_sys_call:
        /* edi: flagmask */
 sysret_check:
        LOCKDEP_SYS_EXIT
-       GET_THREAD_INFO(%rcx)
        DISABLE_INTERRUPTS(CLBR_NONE)
        TRACE_IRQS_OFF
-       movl TI_flags(%rcx),%edx
+       movl TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET),%edx
        andl %edi,%edx
        jnz  sysret_careful
        CFI_REMEMBER_STATE
@@ -583,7 +581,7 @@ sysret_audit:
        /* Do syscall tracing */
 tracesys:
 #ifdef CONFIG_AUDITSYSCALL
-       testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
+       testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
        jz auditsys
 #endif
        SAVE_REST
@@ -612,8 +610,6 @@ tracesys:
 GLOBAL(int_ret_from_sys_call)
        DISABLE_INTERRUPTS(CLBR_NONE)
        TRACE_IRQS_OFF
-       testl $3,CS-ARGOFFSET(%rsp)
-       je retint_restore_args
        movl $_TIF_ALLWORK_MASK,%edi
        /* edi: mask to check */
 GLOBAL(int_with_check)
@@ -953,6 +949,7 @@ END(common_interrupt)
 ENTRY(\sym)
        INTR_FRAME
        pushq_cfi $~(\num)
+.Lcommon_\sym:
        interrupt \do_sym
        jmp ret_from_intr
        CFI_ENDPROC
@@ -976,13 +973,21 @@ apicinterrupt X86_PLATFORM_IPI_VECTOR \
        x86_platform_ipi smp_x86_platform_ipi
 
 #ifdef CONFIG_SMP
-.irp idx,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \
+       ALIGN
+       INTR_FRAME
+.irp idx,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \
        16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
 .if NUM_INVALIDATE_TLB_VECTORS > \idx
-apicinterrupt (INVALIDATE_TLB_VECTOR_START)+\idx \
-       invalidate_interrupt\idx smp_invalidate_interrupt
+ENTRY(invalidate_interrupt\idx)
+       pushq_cfi $~(INVALIDATE_TLB_VECTOR_START+\idx)
+       jmp .Lcommon_invalidate_interrupt0
+       CFI_ADJUST_CFA_OFFSET -8
+END(invalidate_interrupt\idx)
 .endif
 .endr
+       CFI_ENDPROC
+apicinterrupt INVALIDATE_TLB_VECTOR_START, \
+       invalidate_interrupt0, smp_invalidate_interrupt
 #endif
 
 apicinterrupt THRESHOLD_APIC_VECTOR \
index af0699b..48d9d4e 100644 (file)
@@ -52,5 +52,5 @@ void __init reserve_ebda_region(void)
                lowmem = 0x9f000;
 
        /* reserve all memory between lowmem and the 1MB mark */
-       memblock_x86_reserve_range(lowmem, 0x100000, "* BIOS reserved");
+       memblock_reserve(lowmem, 0x100000 - lowmem);
 }
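
The same conversion recurs throughout this merge: memblock_x86_reserve_range() took a (start, end, name) triple, while the generic memblock_reserve() takes (base, size), so each call site becomes memblock_reserve(start, end - start). Before/after, taken from the hunk above:

	/* before: x86-private API, with a debugging label */
	memblock_x86_reserve_range(lowmem, 0x100000, "* BIOS reserved");

	/* after: generic memblock API, base + size, no label */
	memblock_reserve(lowmem, 0x100000 - lowmem);
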
index 3bb0850..51ff186 100644 (file)
@@ -31,9 +31,8 @@ static void __init i386_default_early_setup(void)
 
 void __init i386_start_kernel(void)
 {
-       memblock_init();
-
-       memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
+       memblock_reserve(__pa_symbol(&_text),
+                        __pa_symbol(&__bss_stop) - __pa_symbol(&_text));
 
 #ifdef CONFIG_BLK_DEV_INITRD
        /* Reserve INITRD */
@@ -42,7 +41,7 @@ void __init i386_start_kernel(void)
                u64 ramdisk_image = boot_params.hdr.ramdisk_image;
                u64 ramdisk_size  = boot_params.hdr.ramdisk_size;
                u64 ramdisk_end   = PAGE_ALIGN(ramdisk_image + ramdisk_size);
-               memblock_x86_reserve_range(ramdisk_image, ramdisk_end, "RAMDISK");
+               memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
        }
 #endif
 
index 5655c22..3a3b779 100644 (file)
@@ -98,9 +98,8 @@ void __init x86_64_start_reservations(char *real_mode_data)
 {
        copy_bootdata(__va(real_mode_data));
 
-       memblock_init();
-
-       memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
+       memblock_reserve(__pa_symbol(&_text),
+                        __pa_symbol(&__bss_stop) - __pa_symbol(&_text));
 
 #ifdef CONFIG_BLK_DEV_INITRD
        /* Reserve INITRD */
@@ -109,7 +108,7 @@ void __init x86_64_start_reservations(char *real_mode_data)
                unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
                unsigned long ramdisk_size  = boot_params.hdr.ramdisk_size;
                unsigned long ramdisk_end   = PAGE_ALIGN(ramdisk_image + ramdisk_size);
-               memblock_x86_reserve_range(ramdisk_image, ramdisk_end, "RAMDISK");
+               memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
        }
 #endif
 
index 1bb0bf4..07b0a56 100644 (file)
@@ -32,8 +32,6 @@
 #define HPET_MIN_CYCLES                        128
 #define HPET_MIN_PROG_DELTA            (HPET_MIN_CYCLES + (HPET_MIN_CYCLES >> 1))
 
-#define EVT_TO_HPET_DEV(evt) container_of(evt, struct hpet_dev, evt)
-
 /*
  * HPET address is set in acpi/boot.c, when an ACPI entry exists
  */
@@ -55,6 +53,11 @@ struct hpet_dev {
        char                            name[10];
 };
 
+inline struct hpet_dev *EVT_TO_HPET_DEV(struct clock_event_device *evtdev)
+{
+       return container_of(evtdev, struct hpet_dev, evt);
+}
+
 inline unsigned int hpet_readl(unsigned int a)
 {
        return readl(hpet_virt_address + a);
index 429e0c9..7943e0c 100644 (file)
@@ -74,6 +74,10 @@ int arch_show_interrupts(struct seq_file *p, int prec)
        for_each_online_cpu(j)
                seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs);
        seq_printf(p, "  IRQ work interrupts\n");
+       seq_printf(p, "%*s: ", prec, "RTR");
+       for_each_online_cpu(j)
+               seq_printf(p, "%10u ", irq_stats(j)->icr_read_retry_count);
+       seq_printf(p, "  APIC ICR read retries\n");
 #endif
        if (x86_platform_ipi_callback) {
                seq_printf(p, "%*s: ", prec, "PLT");
@@ -136,6 +140,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
        sum += irq_stats(cpu)->irq_spurious_count;
        sum += irq_stats(cpu)->apic_perf_irqs;
        sum += irq_stats(cpu)->apic_irq_work_irqs;
+       sum += irq_stats(cpu)->icr_read_retry_count;
 #endif
        if (x86_platform_ipi_callback)
                sum += irq_stats(cpu)->x86_platform_ipis;
@@ -181,8 +186,8 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
        unsigned vector = ~regs->orig_ax;
        unsigned irq;
 
-       exit_idle();
        irq_enter();
+       exit_idle();
 
        irq = __this_cpu_read(vector_irq[vector]);
 
@@ -209,10 +214,10 @@ void smp_x86_platform_ipi(struct pt_regs *regs)
 
        ack_APIC_irq();
 
-       exit_idle();
-
        irq_enter();
 
+       exit_idle();
+
        inc_irq_stat(x86_platform_ipis);
 
        if (x86_platform_ipi_callback)
index ea9d5f2..2889b3d 100644 (file)
@@ -50,7 +50,7 @@ void arch_jump_label_transform(struct jump_entry *entry,
        put_online_cpus();
 }
 
-void arch_jump_label_transform_static(struct jump_entry *entry,
+__init_or_module void arch_jump_label_transform_static(struct jump_entry *entry,
                                      enum jump_label_type type)
 {
        __jump_label_transform(entry, type, text_poke_early);
index d494799..fe86493 100644 (file)
@@ -1,14 +1,18 @@
 /*
  *  AMD CPU Microcode Update Driver for Linux
- *  Copyright (C) 2008 Advanced Micro Devices Inc.
+ *  Copyright (C) 2008-2011 Advanced Micro Devices Inc.
  *
  *  Author: Peter Oruba <peter.oruba@amd.com>
  *
  *  Based on work by:
  *  Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
  *
- *  This driver allows to upgrade microcode on AMD
- *  family 0x10 and 0x11 processors.
+ *  Maintainers:
+ *  Andreas Herrmann <andreas.herrmann3@amd.com>
+ *  Borislav Petkov <borislav.petkov@amd.com>
+ *
+ *  This driver allows upgrading microcode on F10h AMD
+ *  CPUs and later.
  *
  *  Licensed under the terms of the GNU General Public
  *  License version 2. See file COPYING for details.
@@ -71,6 +75,9 @@ struct microcode_amd {
 
 static struct equiv_cpu_entry *equiv_cpu_table;
 
+/* page-sized ucode patch buffer */
+void *patch;
+
 static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
 {
        struct cpuinfo_x86 *c = &cpu_data(cpu);
@@ -86,27 +93,76 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
        return 0;
 }
 
-static int get_matching_microcode(int cpu, struct microcode_header_amd *mc_hdr,
-                                 int rev)
+static unsigned int verify_ucode_size(int cpu, u32 patch_size,
+                                     unsigned int size)
 {
-       unsigned int current_cpu_id;
-       u16 equiv_cpu_id = 0;
-       unsigned int i = 0;
+       struct cpuinfo_x86 *c = &cpu_data(cpu);
+       u32 max_size;
+
+#define F1XH_MPB_MAX_SIZE 2048
+#define F14H_MPB_MAX_SIZE 1824
+#define F15H_MPB_MAX_SIZE 4096
+
+       switch (c->x86) {
+       case 0x14:
+               max_size = F14H_MPB_MAX_SIZE;
+               break;
+       case 0x15:
+               max_size = F15H_MPB_MAX_SIZE;
+               break;
+       default:
+               max_size = F1XH_MPB_MAX_SIZE;
+               break;
+       }
+
+       if (patch_size > min_t(u32, size, max_size)) {
+               pr_err("patch size mismatch\n");
+               return 0;
+       }
+
+       return patch_size;
+}
+
+static u16 find_equiv_id(void)
+{
+       unsigned int current_cpu_id, i = 0;
 
        BUG_ON(equiv_cpu_table == NULL);
+
        current_cpu_id = cpuid_eax(0x00000001);
 
        while (equiv_cpu_table[i].installed_cpu != 0) {
-               if (current_cpu_id == equiv_cpu_table[i].installed_cpu) {
-                       equiv_cpu_id = equiv_cpu_table[i].equiv_cpu;
-                       break;
-               }
+               if (current_cpu_id == equiv_cpu_table[i].installed_cpu)
+                       return equiv_cpu_table[i].equiv_cpu;
+
                i++;
        }
+       return 0;
+}
 
+/*
+ * We signal that a good patch was found by returning its size (> 0).
+ */
+static int get_matching_microcode(int cpu, const u8 *ucode_ptr,
+                                 unsigned int leftover_size, int rev,
+                                 unsigned int *current_size)
+{
+       struct microcode_header_amd *mc_hdr;
+       unsigned int actual_size;
+       u16 equiv_cpu_id;
+
+       /* size of the current patch we're staring at */
+       *current_size = *(u32 *)(ucode_ptr + 4) + SECTION_HDR_SIZE;
+
+       equiv_cpu_id = find_equiv_id();
        if (!equiv_cpu_id)
                return 0;
 
+       /*
+        * let's look at the patch header itself now
+        */
+       mc_hdr = (struct microcode_header_amd *)(ucode_ptr + SECTION_HDR_SIZE);
+
        if (mc_hdr->processor_rev_id != equiv_cpu_id)
                return 0;
 
@@ -120,7 +176,20 @@ static int get_matching_microcode(int cpu, struct microcode_header_amd *mc_hdr,
        if (mc_hdr->patch_id <= rev)
                return 0;
 
-       return 1;
+       /*
+        * now that the header looks sane, verify its size
+        */
+       actual_size = verify_ucode_size(cpu, *current_size, leftover_size);
+       if (!actual_size)
+               return 0;
+
+       /* clear the patch buffer */
+       memset(patch, 0, PAGE_SIZE);
+
+       /* all looks ok, get the binary patch */
+       get_ucode_data(patch, ucode_ptr + SECTION_HDR_SIZE, actual_size);
+
+       return actual_size;
 }
 
 static int apply_microcode_amd(int cpu)
@@ -155,63 +224,6 @@ static int apply_microcode_amd(int cpu)
        return 0;
 }
 
-static unsigned int verify_ucode_size(int cpu, const u8 *buf, unsigned int size)
-{
-       struct cpuinfo_x86 *c = &cpu_data(cpu);
-       u32 max_size, actual_size;
-
-#define F1XH_MPB_MAX_SIZE 2048
-#define F14H_MPB_MAX_SIZE 1824
-#define F15H_MPB_MAX_SIZE 4096
-
-       switch (c->x86) {
-       case 0x14:
-               max_size = F14H_MPB_MAX_SIZE;
-               break;
-       case 0x15:
-               max_size = F15H_MPB_MAX_SIZE;
-               break;
-       default:
-               max_size = F1XH_MPB_MAX_SIZE;
-               break;
-       }
-
-       actual_size = *(u32 *)(buf + 4);
-
-       if (actual_size + SECTION_HDR_SIZE > size || actual_size > max_size) {
-               pr_err("section size mismatch\n");
-               return 0;
-       }
-
-       return actual_size;
-}
-
-static struct microcode_header_amd *
-get_next_ucode(int cpu, const u8 *buf, unsigned int size, unsigned int *mc_size)
-{
-       struct microcode_header_amd *mc = NULL;
-       unsigned int actual_size = 0;
-
-       if (*(u32 *)buf != UCODE_UCODE_TYPE) {
-               pr_err("invalid type field in container file section header\n");
-               goto out;
-       }
-
-       actual_size = verify_ucode_size(cpu, buf, size);
-       if (!actual_size)
-               goto out;
-
-       mc = vzalloc(actual_size);
-       if (!mc)
-               goto out;
-
-       get_ucode_data(mc, buf + SECTION_HDR_SIZE, actual_size);
-       *mc_size = actual_size + SECTION_HDR_SIZE;
-
-out:
-       return mc;
-}
-
 static int install_equiv_cpu_table(const u8 *buf)
 {
        unsigned int *ibuf = (unsigned int *)buf;
@@ -247,36 +259,38 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
 {
        struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
        struct microcode_header_amd *mc_hdr = NULL;
-       unsigned int mc_size, leftover;
+       unsigned int mc_size, leftover, current_size = 0;
        int offset;
        const u8 *ucode_ptr = data;
        void *new_mc = NULL;
        unsigned int new_rev = uci->cpu_sig.rev;
-       enum ucode_state state = UCODE_OK;
+       enum ucode_state state = UCODE_ERROR;
 
        offset = install_equiv_cpu_table(ucode_ptr);
        if (offset < 0) {
                pr_err("failed to create equivalent cpu table\n");
-               return UCODE_ERROR;
+               goto out;
        }
-
        ucode_ptr += offset;
        leftover = size - offset;
 
-       while (leftover) {
-               mc_hdr = get_next_ucode(cpu, ucode_ptr, leftover, &mc_size);
-               if (!mc_hdr)
-                       break;
+       if (*(u32 *)ucode_ptr != UCODE_UCODE_TYPE) {
+               pr_err("invalid type field in container file section header\n");
+               goto free_table;
+       }
 
-               if (get_matching_microcode(cpu, mc_hdr, new_rev)) {
-                       vfree(new_mc);
+       while (leftover) {
+               mc_size = get_matching_microcode(cpu, ucode_ptr, leftover,
+                                                new_rev, &current_size);
+               if (mc_size) {
+                       mc_hdr  = patch;
+                       new_mc  = patch;
                        new_rev = mc_hdr->patch_id;
-                       new_mc  = mc_hdr;
-               } else
-                       vfree(mc_hdr);
+                       goto out_ok;
+               }
 
-               ucode_ptr += mc_size;
-               leftover  -= mc_size;
+               ucode_ptr += current_size;
+               leftover  -= current_size;
        }
 
        if (!new_mc) {
@@ -284,19 +298,16 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
                goto free_table;
        }
 
-       if (!leftover) {
-               vfree(uci->mc);
-               uci->mc = new_mc;
-               pr_debug("CPU%d update ucode (0x%08x -> 0x%08x)\n",
-                        cpu, uci->cpu_sig.rev, new_rev);
-       } else {
-               vfree(new_mc);
-               state = UCODE_ERROR;
-       }
+out_ok:
+       uci->mc = new_mc;
+       state = UCODE_OK;
+       pr_debug("CPU%d update ucode (0x%08x -> 0x%08x)\n",
+                cpu, uci->cpu_sig.rev, new_rev);
 
 free_table:
        free_equiv_cpu_table();
 
+out:
        return state;
 }
 
@@ -337,7 +348,6 @@ static void microcode_fini_cpu_amd(int cpu)
 {
        struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
 
-       vfree(uci->mc);
        uci->mc = NULL;
 }
 
@@ -351,5 +361,14 @@ static struct microcode_ops microcode_amd_ops = {
 
 struct microcode_ops * __init init_amd_microcode(void)
 {
+       patch = (void *)get_zeroed_page(GFP_KERNEL);
+       if (!patch)
+               return NULL;
+
        return &microcode_amd_ops;
 }
+
+void __exit exit_amd_microcode(void)
+{
+       free_page((unsigned long)patch);
+}
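
As a concrete reading of the reworked size check: on a family 0x14 CPU the per-patch ceiling is F14H_MPB_MAX_SIZE (1824 bytes), so a section header claiming, say, a 2048-byte patch fails the patch_size > min(size, max_size) test and get_matching_microcode() skips it. The single page-sized 'patch' buffer is sufficient for every supported family, since all three maxima fit within the 4096-byte page.
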
index 9d46f5e..9302e2d 100644 (file)
@@ -563,6 +563,8 @@ module_init(microcode_init);
 
 static void __exit microcode_exit(void)
 {
+       struct cpuinfo_x86 *c = &cpu_data(0);
+
        microcode_dev_exit();
 
        unregister_hotcpu_notifier(&mc_cpu_notifier);
@@ -580,6 +582,9 @@ static void __exit microcode_exit(void)
 
        microcode_ops = NULL;
 
+       if (c->x86_vendor == X86_VENDOR_AMD)
+               exit_amd_microcode();
+
        pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
 }
 module_exit(microcode_exit);
index 0741b06..ca470e4 100644 (file)
@@ -564,9 +564,7 @@ void __init default_get_smp_config(unsigned int early)
 
 static void __init smp_reserve_memory(struct mpf_intel *mpf)
 {
-       unsigned long size = get_mpc_size(mpf->physptr);
-
-       memblock_x86_reserve_range(mpf->physptr, mpf->physptr+size, "* MP-table mpc");
+       memblock_reserve(mpf->physptr, get_mpc_size(mpf->physptr));
 }
 
 static int __init smp_scan_config(unsigned long base, unsigned long length)
@@ -595,7 +593,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length)
                               mpf, (u64)virt_to_phys(mpf));
 
                        mem = virt_to_phys(mpf);
-                       memblock_x86_reserve_range(mem, mem + sizeof(*mpf), "* MP-table mpf");
+                       memblock_reserve(mem, sizeof(*mpf));
                        if (mpf->physptr)
                                smp_reserve_memory(mpf);
 
@@ -836,10 +834,8 @@ early_param("alloc_mptable", parse_alloc_mptable_opt);
 
 void __init early_reserve_e820_mpc_new(void)
 {
-       if (enable_update_mptable && alloc_mptable) {
-               u64 startt = 0;
-               mpc_new_phys = early_reserve_e820(startt, mpc_new_length, 4);
-       }
+       if (enable_update_mptable && alloc_mptable)
+               mpc_new_phys = early_reserve_e820(mpc_new_length, 4);
 }
 
 static int __init update_mp_table(void)
index ee5d4fb..15763af 100644 (file)
@@ -293,7 +293,7 @@ int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
        regs.orig_ax = -1;
        regs.ip = (unsigned long) kernel_thread_helper;
        regs.cs = __KERNEL_CS | get_kernel_rpl();
-       regs.flags = X86_EFLAGS_IF | 0x2;
+       regs.flags = X86_EFLAGS_IF | X86_EFLAGS_BIT1;
 
        /* Ok, create the new process.. */
        return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
index 795b79f..485204f 100644 (file)
@@ -99,7 +99,8 @@ void cpu_idle(void)
 
        /* endless idle loop with no priority at all */
        while (1) {
-               tick_nohz_stop_sched_tick(1);
+               tick_nohz_idle_enter();
+               rcu_idle_enter();
                while (!need_resched()) {
 
                        check_pgt_cache();
@@ -116,7 +117,8 @@ void cpu_idle(void)
                                pm_idle();
                        start_critical_timings();
                }
-               tick_nohz_restart_sched_tick();
+               rcu_idle_exit();
+               tick_nohz_idle_exit();
                preempt_enable_no_resched();
                schedule();
                preempt_disable();
index 3bd7e6e..9b9fe4a 100644 (file)
@@ -122,7 +122,7 @@ void cpu_idle(void)
 
        /* endless idle loop with no priority at all */
        while (1) {
-               tick_nohz_stop_sched_tick(1);
+               tick_nohz_idle_enter();
                while (!need_resched()) {
 
                        rmb();
@@ -139,8 +139,14 @@ void cpu_idle(void)
                        enter_idle();
                        /* Don't trace irqs off for idle */
                        stop_critical_timings();
+
+                       /* enter_idle() needs rcu for notifiers */
+                       rcu_idle_enter();
+
                        if (cpuidle_idle_call())
                                pm_idle();
+
+                       rcu_idle_exit();
                        start_critical_timings();
 
                        /* In many cases the interrupt that ended idle
@@ -149,7 +155,7 @@ void cpu_idle(void)
                        __exit_idle();
                }
 
-               tick_nohz_restart_sched_tick();
+               tick_nohz_idle_exit();
                preempt_enable_no_resched();
                schedule();
                preempt_disable();
@@ -293,13 +299,12 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
        memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
 
        if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
-               p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
+               p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
+                                                 IO_BITMAP_BYTES, GFP_KERNEL);
                if (!p->thread.io_bitmap_ptr) {
                        p->thread.io_bitmap_max = 0;
                        return -ENOMEM;
                }
-               memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
-                               IO_BITMAP_BYTES);
                set_tsk_thread_flag(p, TIF_IO_BITMAP);
        }
 
index 8252879..89a04c7 100644 (file)
@@ -749,7 +749,8 @@ put:
 /*
  * Handle PTRACE_POKEUSR calls for the debug register area.
  */
-int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val)
+static int ptrace_set_debugreg(struct task_struct *tsk, int n,
+                              unsigned long val)
 {
        struct thread_struct *thread = &(tsk->thread);
        int rc = 0;
index cf0ef98..d05444a 100644 (file)
@@ -306,7 +306,8 @@ static void __init cleanup_highmap(void)
 static void __init reserve_brk(void)
 {
        if (_brk_end > _brk_start)
-               memblock_x86_reserve_range(__pa(_brk_start), __pa(_brk_end), "BRK");
+               memblock_reserve(__pa(_brk_start),
+                                __pa(_brk_end) - __pa(_brk_start));
 
        /* Mark brk area as locked down and no longer taking any
           new allocations */
@@ -331,13 +332,13 @@ static void __init relocate_initrd(void)
        ramdisk_here = memblock_find_in_range(0, end_of_lowmem, area_size,
                                         PAGE_SIZE);
 
-       if (ramdisk_here == MEMBLOCK_ERROR)
+       if (!ramdisk_here)
                panic("Cannot find place for new RAMDISK of size %lld\n",
                         ramdisk_size);
 
        /* Note: this includes all the lowmem currently occupied by
           the initrd, we rely on that fact to keep the data intact. */
-       memblock_x86_reserve_range(ramdisk_here, ramdisk_here + area_size, "NEW RAMDISK");
+       memblock_reserve(ramdisk_here, area_size);
        initrd_start = ramdisk_here + PAGE_OFFSET;
        initrd_end   = initrd_start + ramdisk_size;
        printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n",
@@ -393,7 +394,7 @@ static void __init reserve_initrd(void)
        initrd_start = 0;
 
        if (ramdisk_size >= (end_of_lowmem>>1)) {
-               memblock_x86_free_range(ramdisk_image, ramdisk_end);
+               memblock_free(ramdisk_image, ramdisk_end - ramdisk_image);
                printk(KERN_ERR "initrd too large to handle, "
                       "disabling initrd\n");
                return;
@@ -416,7 +417,7 @@ static void __init reserve_initrd(void)
 
        relocate_initrd();
 
-       memblock_x86_free_range(ramdisk_image, ramdisk_end);
+       memblock_free(ramdisk_image, ramdisk_end - ramdisk_image);
 }
 #else
 static void __init reserve_initrd(void)
@@ -490,15 +491,13 @@ static void __init memblock_x86_reserve_range_setup_data(void)
 {
        struct setup_data *data;
        u64 pa_data;
-       char buf[32];
 
        if (boot_params.hdr.version < 0x0209)
                return;
        pa_data = boot_params.hdr.setup_data;
        while (pa_data) {
                data = early_memremap(pa_data, sizeof(*data));
-               sprintf(buf, "setup data %x", data->type);
-               memblock_x86_reserve_range(pa_data, pa_data+sizeof(*data)+data->len, buf);
+               memblock_reserve(pa_data, sizeof(*data) + data->len);
                pa_data = data->next;
                early_iounmap(data, sizeof(*data));
        }
@@ -554,7 +553,7 @@ static void __init reserve_crashkernel(void)
                crash_base = memblock_find_in_range(alignment,
                               CRASH_KERNEL_ADDR_MAX, crash_size, alignment);
 
-               if (crash_base == MEMBLOCK_ERROR) {
+               if (!crash_base) {
                        pr_info("crashkernel reservation failed - No suitable area found.\n");
                        return;
                }
@@ -568,7 +567,7 @@ static void __init reserve_crashkernel(void)
                        return;
                }
        }
-       memblock_x86_reserve_range(crash_base, crash_base + crash_size, "CRASH KERNEL");
+       memblock_reserve(crash_base, crash_size);
 
        printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
                        "for crashkernel (System RAM: %ldMB)\n",
@@ -626,7 +625,7 @@ static __init void reserve_ibft_region(void)
        addr = find_ibft_region(&size);
 
        if (size)
-               memblock_x86_reserve_range(addr, addr + size, "* ibft");
+               memblock_reserve(addr, size);
 }
 
 static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10;
index 9f548cb..e38e217 100644 (file)
@@ -840,7 +840,8 @@ int __cpuinit native_cpu_up(unsigned int cpu)
        pr_debug("++++++++++++++++++++=_---CPU UP  %u\n", cpu);
 
        if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid ||
-           !physid_isset(apicid, phys_cpu_present_map)) {
+           !physid_isset(apicid, phys_cpu_present_map) ||
+           (!x2apic_mode && apicid >= 255)) {
                printk(KERN_ERR "%s: bad cpu %d\n", __func__, cpu);
                return -EINVAL;
        }
index a91ae77..a73b610 100644 (file)
@@ -14,11 +14,11 @@ void __init setup_trampolines(void)
 
        /* Has to be in very low memory so we can execute real-mode AP code. */
        mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE);
-       if (mem == MEMBLOCK_ERROR)
+       if (!mem)
                panic("Cannot allocate trampoline\n");
 
        x86_trampoline_base = __va(mem);
-       memblock_x86_reserve_range(mem, mem + size, "TRAMPOLINE");
+       memblock_reserve(mem, size);
 
        printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n",
               x86_trampoline_base, (unsigned long long)mem, size);
index a8e3eb8..fa1191f 100644 (file)
@@ -306,15 +306,10 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
                        == NOTIFY_STOP)
                return;
 #endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */
-#ifdef CONFIG_KPROBES
+
        if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
                        == NOTIFY_STOP)
                return;
-#else
-       if (notify_die(DIE_TRAP, "int3", regs, error_code, 3, SIGTRAP)
-                       == NOTIFY_STOP)
-               return;
-#endif
 
        preempt_conditional_sti(regs);
        do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
index db48336..2c9cf0f 100644 (file)
@@ -35,7 +35,7 @@ static int __read_mostly tsc_unstable;
    erroneous rdtsc usage on !cpu_has_tsc processors */
 static int __read_mostly tsc_disabled = -1;
 
-static int tsc_clocksource_reliable;
+int tsc_clocksource_reliable;
 /*
  * Scheduler clock - returns current time in nanosec units.
  */
@@ -178,11 +178,11 @@ static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2)
 }
 
 #define CAL_MS         10
-#define CAL_LATCH      (CLOCK_TICK_RATE / (1000 / CAL_MS))
+#define CAL_LATCH      (PIT_TICK_RATE / (1000 / CAL_MS))
 #define CAL_PIT_LOOPS  1000
 
 #define CAL2_MS                50
-#define CAL2_LATCH     (CLOCK_TICK_RATE / (1000 / CAL2_MS))
+#define CAL2_LATCH     (PIT_TICK_RATE / (1000 / CAL2_MS))
 #define CAL2_PIT_LOOPS 5000
 
 
index 0aa5fed..9eba29b 100644 (file)
@@ -113,7 +113,7 @@ void __cpuinit check_tsc_sync_source(int cpu)
        if (unsynchronized_tsc())
                return;
 
-       if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) {
+       if (tsc_clocksource_reliable) {
                if (cpu == (nr_cpu_ids-1) || system_state != SYSTEM_BOOTING)
                        pr_info(
                        "Skipped synchronization checks as TSC is reliable.\n");
@@ -172,7 +172,7 @@ void __cpuinit check_tsc_sync_target(void)
 {
        int cpus = 2;
 
-       if (unsynchronized_tsc() || boot_cpu_has(X86_FEATURE_TSC_RELIABLE))
+       if (unsynchronized_tsc() || tsc_clocksource_reliable)
                return;
 
        /*
index e4d4a22..b07ba93 100644 (file)
@@ -57,7 +57,7 @@ DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) =
        .lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock),
 };
 
-static enum { EMULATE, NATIVE, NONE } vsyscall_mode = NATIVE;
+static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE;
 
 static int __init vsyscall_setup(char *str)
 {
@@ -140,11 +140,40 @@ static int addr_to_vsyscall_nr(unsigned long addr)
        return nr;
 }
 
+static bool write_ok_or_segv(unsigned long ptr, size_t size)
+{
+       /*
+        * XXX: if access_ok, get_user, and put_user handled
+        * sig_on_uaccess_error, this could go away.
+        */
+
+       if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) {
+               siginfo_t info;
+               struct thread_struct *thread = &current->thread;
+
+               thread->error_code      = 6;  /* user fault, no page, write */
+               thread->cr2             = ptr;
+               thread->trap_no         = 14;
+
+               memset(&info, 0, sizeof(info));
+               info.si_signo           = SIGSEGV;
+               info.si_errno           = 0;
+               info.si_code            = SEGV_MAPERR;
+               info.si_addr            = (void __user *)ptr;
+
+               force_sig_info(SIGSEGV, &info, current);
+               return false;
+       } else {
+               return true;
+       }
+}
+
 bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 {
        struct task_struct *tsk;
        unsigned long caller;
        int vsyscall_nr;
+       int prev_sig_on_uaccess_error;
        long ret;
 
        /*
@@ -180,35 +209,65 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
        if (seccomp_mode(&tsk->seccomp))
                do_exit(SIGKILL);
 
+       /*
+        * With a real vsyscall, page faults cause SIGSEGV.  We want to
+        * preserve that behavior to make writing exploits harder.
+        */
+       prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error;
+       current_thread_info()->sig_on_uaccess_error = 1;
+
+       /*
+        * 0 is a valid user pointer (in the access_ok sense) on 32-bit and
+        * 64-bit, so we don't need to special-case it here.  For all the
+        * vsyscalls, 0 means "don't write anything" not "write it at
+        * address 0".
+        */
+       ret = -EFAULT;
        switch (vsyscall_nr) {
        case 0:
+               if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) ||
+                   !write_ok_or_segv(regs->si, sizeof(struct timezone)))
+                       break;
+
                ret = sys_gettimeofday(
                        (struct timeval __user *)regs->di,
                        (struct timezone __user *)regs->si);
                break;
 
        case 1:
+               if (!write_ok_or_segv(regs->di, sizeof(time_t)))
+                       break;
+
                ret = sys_time((time_t __user *)regs->di);
                break;
 
        case 2:
+               if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
+                   !write_ok_or_segv(regs->si, sizeof(unsigned)))
+                       break;
+
                ret = sys_getcpu((unsigned __user *)regs->di,
                                 (unsigned __user *)regs->si,
                                 0);
                break;
        }
 
+       current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error;
+
        if (ret == -EFAULT) {
-               /*
-                * Bad news -- userspace fed a bad pointer to a vsyscall.
-                *
-                * With a real vsyscall, that would have caused SIGSEGV.
-                * To make writing reliable exploits using the emulated
-                * vsyscalls harder, generate SIGSEGV here as well.
-                */
+               /* Bad news -- userspace fed a bad pointer to a vsyscall. */
                warn_bad_vsyscall(KERN_INFO, regs,
                                  "vsyscall fault (exploit attempt?)");
-               goto sigsegv;
+
+               /*
+                * If we failed to generate a signal for any reason,
+                * generate one here.  (This should be impossible.)
+                */
+               if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) &&
+                                !sigismember(&tsk->pending.signal, SIGSEGV)))
+                       goto sigsegv;
+
+               return true;  /* Don't emulate the ret. */
        }
 
        regs->ax = ret;
index c1d6cd5..91f83e2 100644 (file)
@@ -92,6 +92,7 @@ struct x86_init_ops x86_init __initdata = {
 
 struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = {
        .setup_percpu_clockev           = setup_secondary_APIC_clock,
+       .fixup_cpu_id                   = x86_default_fixup_cpu_id,
 };
 
 static void default_nmi_init(void) { };
index 46fc4ee..88ad5fb 100644 (file)
@@ -82,9 +82,16 @@ insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m,
        const insn_attr_t *table;
        if (vex_m > X86_VEX_M_MAX || vex_p > INAT_LSTPFX_MAX)
                return 0;
-       table = inat_avx_tables[vex_m][vex_p];
+       /* First, check the master table */
+       table = inat_avx_tables[vex_m][0];
        if (!table)
                return 0;
+       if (!inat_is_group(table[opcode]) && vex_p) {
+               /* If this is not a group, get attribute directly */
+               table = inat_avx_tables[vex_m][vex_p];
+               if (!table)
+                       return 0;
+       }
        return table[opcode];
 }
 
index 374562e..5a1f9f3 100644 (file)
@@ -202,7 +202,7 @@ void insn_get_opcode(struct insn *insn)
                m = insn_vex_m_bits(insn);
                p = insn_vex_p_bits(insn);
                insn->attr = inat_get_avx_attribute(op, m, p);
-               if (!inat_accept_vex(insn->attr))
+               if (!inat_accept_vex(insn->attr) && !inat_is_group(insn->attr))
                        insn->attr = 0; /* This instruction is bad */
                goto end;       /* VEX has only 1 byte for opcode */
        }
@@ -249,6 +249,8 @@ void insn_get_modrm(struct insn *insn)
                        pfx = insn_last_prefix(insn);
                        insn->attr = inat_get_group_attribute(mod, pfx,
                                                              insn->attr);
+                       if (insn_is_avx(insn) && !inat_accept_vex(insn->attr))
+                               insn->attr = 0; /* This is bad */
                }
        }
 
index 82004d2..bd59090 100644 (file)
@@ -164,15 +164,13 @@ EXPORT_SYMBOL(strchr);
 size_t strlen(const char *s)
 {
        int d0;
-       int res;
+       size_t res;
        asm volatile("repne\n\t"
-               "scasb\n\t"
-               "notl %0\n\t"
-               "decl %0"
+               "scasb"
                : "=c" (res), "=&D" (d0)
                : "1" (s), "a" (0), "0" (0xffffffffu)
                : "memory");
-       return res;
+       return ~res - 1;
 }
 EXPORT_SYMBOL(strlen);
 #endif
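
To see why the shortened asm still returns the right length: ECX starts at 0xffffffff and repne scasb decrements it once per byte scanned, terminator included, so after scanning n bytes the register holds 0xffffffff - n and the C expression ~res - 1 yields n - 1, exactly what the dropped notl/decl pair used to compute in registers. The int -> size_t switch for res just keeps that arithmetic unsigned end to end. A tiny standalone check of the arithmetic (not kernel code):

	#include <assert.h>
	#include <stdint.h>

	int main(void)
	{
		uint32_t n = 6;				/* "hello" plus its NUL */
		uint32_t res = 0xffffffffu - n;		/* ECX after repne scasb */

		assert((uint32_t)(~res - 1) == n - 1);	/* strlen("hello") == 5 */
		return 0;
	}
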
index a793da5..5b83c51 100644 (file)
@@ -1,5 +1,11 @@
 # x86 Opcode Maps
 #
+# This is (mostly) based on the following documentation:
+# - Intel(R) 64 and IA-32 Architectures Software Developer's Manual Vol.2
+#   (#325383-040US, October 2011)
+# - Intel(R) Advanced Vector Extensions Programming Reference
+#   (#319433-011, June 2011)
+#
 #<Opcode maps>
 # Table: table-name
 # Referrer: escaped-name
 # EndTable
 #
 # AVX Superscripts
-#  (VEX): this opcode can accept VEX prefix.
-#  (oVEX): this opcode requires VEX prefix.
-#  (o128): this opcode only supports 128bit VEX.
-#  (o256): this opcode only supports 256bit VEX.
+#  (v): this opcode requires VEX prefix.
+#  (v1): this opcode only supports 128bit VEX.
+#
+# Last Prefix Superscripts
+#  - (66): the last prefix is 0x66
+#  - (F3): the last prefix is 0xF3
+#  - (F2): the last prefix is 0xF2
 #
 
 Table: one byte opcode
@@ -199,8 +208,8 @@ a0: MOV AL,Ob
 a1: MOV rAX,Ov
 a2: MOV Ob,AL
 a3: MOV Ov,rAX
-a4: MOVS/B Xb,Yb
-a5: MOVS/W/D/Q Xv,Yv
+a4: MOVS/B Yb,Xb
+a5: MOVS/W/D/Q Yv,Xv
 a6: CMPS/B Xb,Yb
 a7: CMPS/W/D Xv,Yv
 a8: TEST AL,Ib
@@ -233,8 +242,8 @@ c0: Grp2 Eb,Ib (1A)
 c1: Grp2 Ev,Ib (1A)
 c2: RETN Iw (f64)
 c3: RETN
-c4: LES Gz,Mp (i64) | 3bytes-VEX (Prefix)
-c5: LDS Gz,Mp (i64) | 2bytes-VEX (Prefix)
+c4: LES Gz,Mp (i64) | VEX+2byte (Prefix)
+c5: LDS Gz,Mp (i64) | VEX+1byte (Prefix)
 c6: Grp11 Eb,Ib (1A)
 c7: Grp11 Ev,Iz (1A)
 c8: ENTER Iw,Ib
@@ -320,14 +329,19 @@ AVXcode: 1
 # 3DNow! uses the last imm byte as opcode extension.
 0f: 3DNow! Pq,Qq,Ib
 # 0x0f 0x10-0x1f
-10: movups Vps,Wps (VEX) | movss Vss,Wss (F3),(VEX),(o128) | movupd Vpd,Wpd (66),(VEX) | movsd Vsd,Wsd (F2),(VEX),(o128)
-11: movups Wps,Vps (VEX) | movss Wss,Vss (F3),(VEX),(o128) | movupd Wpd,Vpd (66),(VEX) | movsd Wsd,Vsd (F2),(VEX),(o128)
-12: movlps Vq,Mq (VEX),(o128) | movlpd Vq,Mq (66),(VEX),(o128) | movhlps Vq,Uq (VEX),(o128) | movddup Vq,Wq (F2),(VEX) | movsldup Vq,Wq (F3),(VEX)
-13: mpvlps Mq,Vq (VEX),(o128) | movlpd Mq,Vq (66),(VEX),(o128)
-14: unpcklps Vps,Wq (VEX) | unpcklpd Vpd,Wq (66),(VEX)
-15: unpckhps Vps,Wq (VEX) | unpckhpd Vpd,Wq (66),(VEX)
-16: movhps Vq,Mq (VEX),(o128) | movhpd Vq,Mq (66),(VEX),(o128) | movlsps Vq,Uq (VEX),(o128) | movshdup Vq,Wq (F3),(VEX)
-17: movhps Mq,Vq (VEX),(o128) | movhpd Mq,Vq (66),(VEX),(o128)
+# NOTE: According to the Intel SDM opcode map, vmovups and vmovupd have no
+# operands, but they actually do. Also, vmovss and vmovsd only accept 128bit.
+# MOVSS/MOVSD have too many forms (3) in the SDM; this map shows a typical one.
+# Many AVX instructions lack the v1 superscript, per the Intel AVX Programming
+# Reference A.1.
+10: vmovups Vps,Wps | vmovupd Vpd,Wpd (66) | vmovss Vx,Hx,Wss (F3),(v1) | vmovsd Vx,Hx,Wsd (F2),(v1)
+11: vmovups Wps,Vps | vmovupd Wpd,Vpd (66) | vmovss Wss,Hx,Vss (F3),(v1) | vmovsd Wsd,Hx,Vsd (F2),(v1)
+12: vmovlps Vq,Hq,Mq (v1) | vmovhlps Vq,Hq,Uq (v1) | vmovlpd Vq,Hq,Mq (66),(v1) | vmovsldup Vx,Wx (F3) | vmovddup Vx,Wx (F2)
+13: vmovlps Mq,Vq (v1) | vmovlpd Mq,Vq (66),(v1)
+14: vunpcklps Vx,Hx,Wx | vunpcklpd Vx,Hx,Wx (66)
+15: vunpckhps Vx,Hx,Wx | vunpckhpd Vx,Hx,Wx (66)
+16: vmovhps Vdq,Hq,Mq (v1) | vmovlhps Vdq,Hq,Uq (v1) | vmovhpd Vdq,Hq,Mq (66),(v1) | vmovshdup Vx,Wx (F3)
+17: vmovhps Mq,Vq (v1) | vmovhpd Mq,Vq (66),(v1)
 18: Grp16 (1A)
 19:
 1a:
@@ -345,14 +359,14 @@ AVXcode: 1
 25:
 26:
 27:
-28: movaps Vps,Wps (VEX) | movapd Vpd,Wpd (66),(VEX)
-29: movaps Wps,Vps (VEX) | movapd Wpd,Vpd (66),(VEX)
-2a: cvtpi2ps Vps,Qpi | cvtsi2ss Vss,Ed/q (F3),(VEX),(o128) | cvtpi2pd Vpd,Qpi (66) | cvtsi2sd Vsd,Ed/q (F2),(VEX),(o128)
-2b: movntps Mps,Vps (VEX) | movntpd Mpd,Vpd (66),(VEX)
-2c: cvttps2pi Ppi,Wps | cvttss2si  Gd/q,Wss (F3),(VEX),(o128) | cvttpd2pi Ppi,Wpd (66) | cvttsd2si Gd/q,Wsd (F2),(VEX),(o128)
-2d: cvtps2pi Ppi,Wps | cvtss2si Gd/q,Wss (F3),(VEX),(o128) | cvtpd2pi Qpi,Wpd (66) | cvtsd2si Gd/q,Wsd (F2),(VEX),(o128)
-2e: ucomiss Vss,Wss (VEX),(o128) | ucomisd  Vsd,Wsd (66),(VEX),(o128)
-2f: comiss Vss,Wss (VEX),(o128) | comisd  Vsd,Wsd (66),(VEX),(o128)
+28: vmovaps Vps,Wps | vmovapd Vpd,Wpd (66)
+29: vmovaps Wps,Vps | vmovapd Wpd,Vpd (66)
+2a: cvtpi2ps Vps,Qpi | cvtpi2pd Vpd,Qpi (66) | vcvtsi2ss Vss,Hss,Ey (F3),(v1) | vcvtsi2sd Vsd,Hsd,Ey (F2),(v1)
+2b: vmovntps Mps,Vps | vmovntpd Mpd,Vpd (66)
+2c: cvttps2pi Ppi,Wps | cvttpd2pi Ppi,Wpd (66) | vcvttss2si Gy,Wss (F3),(v1) | vcvttsd2si Gy,Wsd (F2),(v1)
+2d: cvtps2pi Ppi,Wps | cvtpd2pi Qpi,Wpd (66) | vcvtss2si Gy,Wss (F3),(v1) | vcvtsd2si Gy,Wsd (F2),(v1)
+2e: vucomiss Vss,Wss (v1) | vucomisd  Vsd,Wsd (66),(v1)
+2f: vcomiss Vss,Wss (v1) | vcomisd  Vsd,Wsd (66),(v1)
 # 0x0f 0x30-0x3f
 30: WRMSR
 31: RDTSC
@@ -388,65 +402,66 @@ AVXcode: 1
 4e: CMOVLE/NG Gv,Ev
 4f: CMOVNLE/G Gv,Ev
 # 0x0f 0x50-0x5f
-50: movmskps Gd/q,Ups (VEX) | movmskpd Gd/q,Upd (66),(VEX)
-51: sqrtps Vps,Wps (VEX) | sqrtss Vss,Wss (F3),(VEX),(o128) | sqrtpd Vpd,Wpd (66),(VEX) | sqrtsd Vsd,Wsd (F2),(VEX),(o128)
-52: rsqrtps Vps,Wps (VEX) | rsqrtss Vss,Wss (F3),(VEX),(o128)
-53: rcpps Vps,Wps (VEX) | rcpss Vss,Wss (F3),(VEX),(o128)
-54: andps Vps,Wps (VEX) | andpd Vpd,Wpd (66),(VEX)
-55: andnps Vps,Wps (VEX) | andnpd Vpd,Wpd (66),(VEX)
-56: orps Vps,Wps (VEX) | orpd Vpd,Wpd (66),(VEX)
-57: xorps Vps,Wps (VEX) | xorpd Vpd,Wpd (66),(VEX)
-58: addps Vps,Wps (VEX) | addss Vss,Wss (F3),(VEX),(o128) | addpd Vpd,Wpd (66),(VEX) | addsd Vsd,Wsd (F2),(VEX),(o128)
-59: mulps Vps,Wps (VEX) | mulss Vss,Wss (F3),(VEX),(o128) | mulpd Vpd,Wpd (66),(VEX) | mulsd Vsd,Wsd (F2),(VEX),(o128)
-5a: cvtps2pd Vpd,Wps (VEX) | cvtss2sd Vsd,Wss (F3),(VEX),(o128) | cvtpd2ps Vps,Wpd (66),(VEX) | cvtsd2ss Vsd,Wsd (F2),(VEX),(o128)
-5b: cvtdq2ps Vps,Wdq (VEX) | cvtps2dq Vdq,Wps (66),(VEX) | cvttps2dq Vdq,Wps (F3),(VEX)
-5c: subps Vps,Wps (VEX) | subss Vss,Wss (F3),(VEX),(o128) | subpd Vpd,Wpd (66),(VEX) | subsd Vsd,Wsd (F2),(VEX),(o128)
-5d: minps Vps,Wps (VEX) | minss Vss,Wss (F3),(VEX),(o128) | minpd Vpd,Wpd (66),(VEX) | minsd Vsd,Wsd (F2),(VEX),(o128)
-5e: divps Vps,Wps (VEX) | divss Vss,Wss (F3),(VEX),(o128) | divpd Vpd,Wpd (66),(VEX) | divsd Vsd,Wsd (F2),(VEX),(o128)
-5f: maxps Vps,Wps (VEX) | maxss Vss,Wss (F3),(VEX),(o128) | maxpd Vpd,Wpd (66),(VEX) | maxsd Vsd,Wsd (F2),(VEX),(o128)
+50: vmovmskps Gy,Ups | vmovmskpd Gy,Upd (66)
+51: vsqrtps Vps,Wps | vsqrtpd Vpd,Wpd (66) | vsqrtss Vss,Hss,Wss (F3),(v1) | vsqrtsd Vsd,Hsd,Wsd (F2),(v1)
+52: vrsqrtps Vps,Wps | vrsqrtss Vss,Hss,Wss (F3),(v1)
+53: vrcpps Vps,Wps | vrcpss Vss,Hss,Wss (F3),(v1)
+54: vandps Vps,Hps,Wps | vandpd Vpd,Hpd,Wpd (66)
+55: vandnps Vps,Hps,Wps | vandnpd Vpd,Hpd,Wpd (66)
+56: vorps Vps,Hps,Wps | vorpd Vpd,Hpd,Wpd (66)
+57: vxorps Vps,Hps,Wps | vxorpd Vpd,Hpd,Wpd (66)
+58: vaddps Vps,Hps,Wps | vaddpd Vpd,Hpd,Wpd (66) | vaddss Vss,Hss,Wss (F3),(v1) | vaddsd Vsd,Hsd,Wsd (F2),(v1)
+59: vmulps Vps,Hps,Wps | vmulpd Vpd,Hpd,Wpd (66) | vmulss Vss,Hss,Wss (F3),(v1) | vmulsd Vsd,Hsd,Wsd (F2),(v1)
+5a: vcvtps2pd Vpd,Wps | vcvtpd2ps Vps,Wpd (66) | vcvtss2sd Vsd,Hx,Wss (F3),(v1) | vcvtsd2ss Vss,Hx,Wsd (F2),(v1)
+5b: vcvtdq2ps Vps,Wdq | vcvtps2dq Vdq,Wps (66) | vcvttps2dq Vdq,Wps (F3)
+5c: vsubps Vps,Hps,Wps | vsubpd Vpd,Hpd,Wpd (66) | vsubss Vss,Hss,Wss (F3),(v1) | vsubsd Vsd,Hsd,Wsd (F2),(v1)
+5d: vminps Vps,Hps,Wps | vminpd Vpd,Hpd,Wpd (66) | vminss Vss,Hss,Wss (F3),(v1) | vminsd Vsd,Hsd,Wsd (F2),(v1)
+5e: vdivps Vps,Hps,Wps | vdivpd Vpd,Hpd,Wpd (66) | vdivss Vss,Hss,Wss (F3),(v1) | vdivsd Vsd,Hsd,Wsd (F2),(v1)
+5f: vmaxps Vps,Hps,Wps | vmaxpd Vpd,Hpd,Wpd (66) | vmaxss Vss,Hss,Wss (F3),(v1) | vmaxsd Vsd,Hsd,Wsd (F2),(v1)
 # 0x0f 0x60-0x6f
-60: punpcklbw Pq,Qd | punpcklbw Vdq,Wdq (66),(VEX),(o128)
-61: punpcklwd Pq,Qd | punpcklwd Vdq,Wdq (66),(VEX),(o128)
-62: punpckldq Pq,Qd | punpckldq Vdq,Wdq (66),(VEX),(o128)
-63: packsswb Pq,Qq | packsswb Vdq,Wdq (66),(VEX),(o128)
-64: pcmpgtb Pq,Qq | pcmpgtb Vdq,Wdq (66),(VEX),(o128)
-65: pcmpgtw Pq,Qq | pcmpgtw Vdq,Wdq (66),(VEX),(o128)
-66: pcmpgtd Pq,Qq | pcmpgtd Vdq,Wdq (66),(VEX),(o128)
-67: packuswb Pq,Qq | packuswb Vdq,Wdq (66),(VEX),(o128)
-68: punpckhbw Pq,Qd | punpckhbw Vdq,Wdq (66),(VEX),(o128)
-69: punpckhwd Pq,Qd | punpckhwd Vdq,Wdq (66),(VEX),(o128)
-6a: punpckhdq Pq,Qd | punpckhdq Vdq,Wdq (66),(VEX),(o128)
-6b: packssdw Pq,Qd | packssdw Vdq,Wdq (66),(VEX),(o128)
-6c: punpcklqdq Vdq,Wdq (66),(VEX),(o128)
-6d: punpckhqdq Vdq,Wdq (66),(VEX),(o128)
-6e: movd/q/ Pd,Ed/q | movd/q Vdq,Ed/q (66),(VEX),(o128)
-6f: movq Pq,Qq | movdqa Vdq,Wdq (66),(VEX) | movdqu Vdq,Wdq (F3),(VEX)
+60: punpcklbw Pq,Qd | vpunpcklbw Vx,Hx,Wx (66),(v1)
+61: punpcklwd Pq,Qd | vpunpcklwd Vx,Hx,Wx (66),(v1)
+62: punpckldq Pq,Qd | vpunpckldq Vx,Hx,Wx (66),(v1)
+63: packsswb Pq,Qq | vpacksswb Vx,Hx,Wx (66),(v1)
+64: pcmpgtb Pq,Qq | vpcmpgtb Vx,Hx,Wx (66),(v1)
+65: pcmpgtw Pq,Qq | vpcmpgtw Vx,Hx,Wx (66),(v1)
+66: pcmpgtd Pq,Qq | vpcmpgtd Vx,Hx,Wx (66),(v1)
+67: packuswb Pq,Qq | vpackuswb Vx,Hx,Wx (66),(v1)
+68: punpckhbw Pq,Qd | vpunpckhbw Vx,Hx,Wx (66),(v1)
+69: punpckhwd Pq,Qd | vpunpckhwd Vx,Hx,Wx (66),(v1)
+6a: punpckhdq Pq,Qd | vpunpckhdq Vx,Hx,Wx (66),(v1)
+6b: packssdw Pq,Qd | vpackssdw Vx,Hx,Wx (66),(v1)
+6c: vpunpcklqdq Vx,Hx,Wx (66),(v1)
+6d: vpunpckhqdq Vx,Hx,Wx (66),(v1)
+6e: movd/q Pd,Ey | vmovd/q Vy,Ey (66),(v1)
+6f: movq Pq,Qq | vmovdqa Vx,Wx (66) | vmovdqu Vx,Wx (F3)
 # 0x0f 0x70-0x7f
-70: pshufw Pq,Qq,Ib | pshufd Vdq,Wdq,Ib (66),(VEX),(o128) | pshufhw Vdq,Wdq,Ib (F3),(VEX),(o128) | pshuflw VdqWdq,Ib (F2),(VEX),(o128)
+70: pshufw Pq,Qq,Ib | vpshufd Vx,Wx,Ib (66),(v1) | vpshufhw Vx,Wx,Ib (F3),(v1) | vpshuflw Vx,Wx,Ib (F2),(v1)
 71: Grp12 (1A)
 72: Grp13 (1A)
 73: Grp14 (1A)
-74: pcmpeqb Pq,Qq | pcmpeqb Vdq,Wdq (66),(VEX),(o128)
-75: pcmpeqw Pq,Qq | pcmpeqw Vdq,Wdq (66),(VEX),(o128)
-76: pcmpeqd Pq,Qq | pcmpeqd Vdq,Wdq (66),(VEX),(o128)
-77: emms/vzeroupper/vzeroall (VEX)
-78: VMREAD Ed/q,Gd/q
-79: VMWRITE Gd/q,Ed/q
+74: pcmpeqb Pq,Qq | vpcmpeqb Vx,Hx,Wx (66),(v1)
+75: pcmpeqw Pq,Qq | vpcmpeqw Vx,Hx,Wx (66),(v1)
+76: pcmpeqd Pq,Qq | vpcmpeqd Vx,Hx,Wx (66),(v1)
+# Note: Remove (v), because vzeroall and vzeroupper become emms without VEX.
+77: emms | vzeroupper | vzeroall
+78: VMREAD Ey,Gy
+79: VMWRITE Gy,Ey
 7a:
 7b:
-7c: haddps Vps,Wps (F2),(VEX) | haddpd Vpd,Wpd (66),(VEX)
-7d: hsubps Vps,Wps (F2),(VEX) | hsubpd Vpd,Wpd (66),(VEX)
-7e: movd/q Ed/q,Pd | movd/q Ed/q,Vdq (66),(VEX),(o128) | movq Vq,Wq (F3),(VEX),(o128)
-7f: movq Qq,Pq | movdqa Wdq,Vdq (66),(VEX) | movdqu Wdq,Vdq (F3),(VEX)
+7c: vhaddpd Vpd,Hpd,Wpd (66) | vhaddps Vps,Hps,Wps (F2)
+7d: vhsubpd Vpd,Hpd,Wpd (66) | vhsubps Vps,Hps,Wps (F2)
+7e: movd/q Ey,Pd | vmovd/q Ey,Vy (66),(v1) | vmovq Vq,Wq (F3),(v1)
+7f: movq Qq,Pq | vmovdqa Wx,Vx (66) | vmovdqu Wx,Vx (F3)
 # 0x0f 0x80-0x8f
 80: JO Jz (f64)
 81: JNO Jz (f64)
-82: JB/JNAE/JC Jz (f64)
-83: JNB/JAE/JNC Jz (f64)
-84: JZ/JE Jz (f64)
-85: JNZ/JNE Jz (f64)
+82: JB/JC/JNAE Jz (f64)
+83: JAE/JNB/JNC Jz (f64)
+84: JE/JZ Jz (f64)
+85: JNE/JNZ Jz (f64)
 86: JBE/JNA Jz (f64)
-87: JNBE/JA Jz (f64)
+87: JA/JNBE Jz (f64)
 88: JS Jz (f64)
 89: JNS Jz (f64)
 8a: JP/JPE Jz (f64)
@@ -502,18 +517,18 @@ b8: JMPE | POPCNT Gv,Ev (F3)
 b9: Grp10 (1A)
 ba: Grp8 Ev,Ib (1A)
 bb: BTC Ev,Gv
-bc: BSF Gv,Ev
-bd: BSR Gv,Ev
+bc: BSF Gv,Ev | TZCNT Gv,Ev (F3)
+bd: BSR Gv,Ev | LZCNT Gv,Ev (F3)
 be: MOVSX Gv,Eb
 bf: MOVSX Gv,Ew
 # 0x0f 0xc0-0xcf
 c0: XADD Eb,Gb
 c1: XADD Ev,Gv
-c2: cmpps Vps,Wps,Ib (VEX) | cmpss Vss,Wss,Ib (F3),(VEX),(o128) | cmppd Vpd,Wpd,Ib (66),(VEX) | cmpsd Vsd,Wsd,Ib (F2),(VEX)
-c3: movnti Md/q,Gd/q
-c4: pinsrw Pq,Rd/q/Mw,Ib | pinsrw Vdq,Rd/q/Mw,Ib (66),(VEX),(o128)
-c5: pextrw Gd,Nq,Ib | pextrw Gd,Udq,Ib (66),(VEX),(o128)
-c6: shufps Vps,Wps,Ib (VEX) | shufpd Vpd,Wpd,Ib (66),(VEX)
+c2: vcmpps Vps,Hps,Wps,Ib | vcmppd Vpd,Hpd,Wpd,Ib (66) | vcmpss Vss,Hss,Wss,Ib (F3),(v1) | vcmpsd Vsd,Hsd,Wsd,Ib (F2),(v1)
+c3: movnti My,Gy
+c4: pinsrw Pq,Ry/Mw,Ib | vpinsrw Vdq,Hdq,Ry/Mw,Ib (66),(v1)
+c5: pextrw Gd,Nq,Ib | vpextrw Gd,Udq,Ib (66),(v1)
+c6: vshufps Vps,Hps,Wps,Ib | vshufpd Vpd,Hpd,Wpd,Ib (66)
 c7: Grp9 (1A)
 c8: BSWAP RAX/EAX/R8/R8D
 c9: BSWAP RCX/ECX/R9/R9D
@@ -524,55 +539,55 @@ cd: BSWAP RBP/EBP/R13/R13D
 ce: BSWAP RSI/ESI/R14/R14D
 cf: BSWAP RDI/EDI/R15/R15D
 # 0x0f 0xd0-0xdf
-d0: addsubps Vps,Wps (F2),(VEX) | addsubpd Vpd,Wpd (66),(VEX)
-d1: psrlw Pq,Qq | psrlw Vdq,Wdq (66),(VEX),(o128)
-d2: psrld Pq,Qq | psrld Vdq,Wdq (66),(VEX),(o128)
-d3: psrlq Pq,Qq | psrlq Vdq,Wdq (66),(VEX),(o128)
-d4: paddq Pq,Qq | paddq Vdq,Wdq (66),(VEX),(o128)
-d5: pmullw Pq,Qq | pmullw Vdq,Wdq (66),(VEX),(o128)
-d6: movq Wq,Vq (66),(VEX),(o128) | movq2dq Vdq,Nq (F3) | movdq2q Pq,Uq (F2)
-d7: pmovmskb Gd,Nq | pmovmskb Gd,Udq (66),(VEX),(o128)
-d8: psubusb Pq,Qq | psubusb Vdq,Wdq (66),(VEX),(o128)
-d9: psubusw Pq,Qq | psubusw Vdq,Wdq (66),(VEX),(o128)
-da: pminub Pq,Qq | pminub Vdq,Wdq (66),(VEX),(o128)
-db: pand Pq,Qq | pand Vdq,Wdq (66),(VEX),(o128)
-dc: paddusb Pq,Qq | paddusb Vdq,Wdq (66),(VEX),(o128)
-dd: paddusw Pq,Qq | paddusw Vdq,Wdq (66),(VEX),(o128)
-de: pmaxub Pq,Qq | pmaxub Vdq,Wdq (66),(VEX),(o128)
-df: pandn Pq,Qq | pandn Vdq,Wdq (66),(VEX),(o128)
+d0: vaddsubpd Vpd,Hpd,Wpd (66) | vaddsubps Vps,Hps,Wps (F2)
+d1: psrlw Pq,Qq | vpsrlw Vx,Hx,Wx (66),(v1)
+d2: psrld Pq,Qq | vpsrld Vx,Hx,Wx (66),(v1)
+d3: psrlq Pq,Qq | vpsrlq Vx,Hx,Wx (66),(v1)
+d4: paddq Pq,Qq | vpaddq Vx,Hx,Wx (66),(v1)
+d5: pmullw Pq,Qq | vpmullw Vx,Hx,Wx (66),(v1)
+d6: vmovq Wq,Vq (66),(v1) | movq2dq Vdq,Nq (F3) | movdq2q Pq,Uq (F2)
+d7: pmovmskb Gd,Nq | vpmovmskb Gd,Ux (66),(v1)
+d8: psubusb Pq,Qq | vpsubusb Vx,Hx,Wx (66),(v1)
+d9: psubusw Pq,Qq | vpsubusw Vx,Hx,Wx (66),(v1)
+da: pminub Pq,Qq | vpminub Vx,Hx,Wx (66),(v1)
+db: pand Pq,Qq | vpand Vx,Hx,Wx (66),(v1)
+dc: paddusb Pq,Qq | vpaddusb Vx,Hx,Wx (66),(v1)
+dd: paddusw Pq,Qq | vpaddusw Vx,Hx,Wx (66),(v1)
+de: pmaxub Pq,Qq | vpmaxub Vx,Hx,Wx (66),(v1)
+df: pandn Pq,Qq | vpandn Vx,Hx,Wx (66),(v1)
 # 0x0f 0xe0-0xef
-e0: pavgb Pq,Qq | pavgb Vdq,Wdq (66),(VEX),(o128)
-e1: psraw Pq,Qq | psraw Vdq,Wdq (66),(VEX),(o128)
-e2: psrad Pq,Qq | psrad Vdq,Wdq (66),(VEX),(o128)
-e3: pavgw Pq,Qq | pavgw Vdq,Wdq (66),(VEX),(o128)
-e4: pmulhuw Pq,Qq | pmulhuw Vdq,Wdq (66),(VEX),(o128)
-e5: pmulhw Pq,Qq | pmulhw Vdq,Wdq (66),(VEX),(o128)
-e6: cvtpd2dq Vdq,Wpd (F2),(VEX) | cvttpd2dq Vdq,Wpd (66),(VEX) | cvtdq2pd Vpd,Wdq (F3),(VEX)
-e7: movntq Mq,Pq | movntdq Mdq,Vdq (66),(VEX)
-e8: psubsb Pq,Qq | psubsb Vdq,Wdq (66),(VEX),(o128)
-e9: psubsw Pq,Qq | psubsw Vdq,Wdq (66),(VEX),(o128)
-ea: pminsw Pq,Qq | pminsw Vdq,Wdq (66),(VEX),(o128)
-eb: por Pq,Qq | por Vdq,Wdq (66),(VEX),(o128)
-ec: paddsb Pq,Qq | paddsb Vdq,Wdq (66),(VEX),(o128)
-ed: paddsw Pq,Qq | paddsw Vdq,Wdq (66),(VEX),(o128)
-ee: pmaxsw Pq,Qq | pmaxsw Vdq,Wdq (66),(VEX),(o128)
-ef: pxor Pq,Qq | pxor Vdq,Wdq (66),(VEX),(o128)
+e0: pavgb Pq,Qq | vpavgb Vx,Hx,Wx (66),(v1)
+e1: psraw Pq,Qq | vpsraw Vx,Hx,Wx (66),(v1)
+e2: psrad Pq,Qq | vpsrad Vx,Hx,Wx (66),(v1)
+e3: pavgw Pq,Qq | vpavgw Vx,Hx,Wx (66),(v1)
+e4: pmulhuw Pq,Qq | vpmulhuw Vx,Hx,Wx (66),(v1)
+e5: pmulhw Pq,Qq | vpmulhw Vx,Hx,Wx (66),(v1)
+e6: vcvttpd2dq Vx,Wpd (66) | vcvtdq2pd Vx,Wdq (F3) | vcvtpd2dq Vx,Wpd (F2)
+e7: movntq Mq,Pq | vmovntdq Mx,Vx (66)
+e8: psubsb Pq,Qq | vpsubsb Vx,Hx,Wx (66),(v1)
+e9: psubsw Pq,Qq | vpsubsw Vx,Hx,Wx (66),(v1)
+ea: pminsw Pq,Qq | vpminsw Vx,Hx,Wx (66),(v1)
+eb: por Pq,Qq | vpor Vx,Hx,Wx (66),(v1)
+ec: paddsb Pq,Qq | vpaddsb Vx,Hx,Wx (66),(v1)
+ed: paddsw Pq,Qq | vpaddsw Vx,Hx,Wx (66),(v1)
+ee: pmaxsw Pq,Qq | vpmaxsw Vx,Hx,Wx (66),(v1)
+ef: pxor Pq,Qq | vpxor Vx,Hx,Wx (66),(v1)
 # 0x0f 0xf0-0xff
-f0: lddqu Vdq,Mdq (F2),(VEX)
-f1: psllw Pq,Qq | psllw Vdq,Wdq (66),(VEX),(o128)
-f2: pslld Pq,Qq | pslld Vdq,Wdq (66),(VEX),(o128)
-f3: psllq Pq,Qq | psllq Vdq,Wdq (66),(VEX),(o128)
-f4: pmuludq Pq,Qq | pmuludq Vdq,Wdq (66),(VEX),(o128)
-f5: pmaddwd Pq,Qq | pmaddwd Vdq,Wdq (66),(VEX),(o128)
-f6: psadbw Pq,Qq | psadbw Vdq,Wdq (66),(VEX),(o128)
-f7: maskmovq Pq,Nq | maskmovdqu Vdq,Udq (66),(VEX),(o128)
-f8: psubb Pq,Qq | psubb Vdq,Wdq (66),(VEX),(o128)
-f9: psubw Pq,Qq | psubw Vdq,Wdq (66),(VEX),(o128)
-fa: psubd Pq,Qq | psubd Vdq,Wdq (66),(VEX),(o128)
-fb: psubq Pq,Qq | psubq Vdq,Wdq (66),(VEX),(o128)
-fc: paddb Pq,Qq | paddb Vdq,Wdq (66),(VEX),(o128)
-fd: paddw Pq,Qq | paddw Vdq,Wdq (66),(VEX),(o128)
-fe: paddd Pq,Qq | paddd Vdq,Wdq (66),(VEX),(o128)
+f0: vlddqu Vx,Mx (F2)
+f1: psllw Pq,Qq | vpsllw Vx,Hx,Wx (66),(v1)
+f2: pslld Pq,Qq | vpslld Vx,Hx,Wx (66),(v1)
+f3: psllq Pq,Qq | vpsllq Vx,Hx,Wx (66),(v1)
+f4: pmuludq Pq,Qq | vpmuludq Vx,Hx,Wx (66),(v1)
+f5: pmaddwd Pq,Qq | vpmaddwd Vx,Hx,Wx (66),(v1)
+f6: psadbw Pq,Qq | vpsadbw Vx,Hx,Wx (66),(v1)
+f7: maskmovq Pq,Nq | vmaskmovdqu Vx,Ux (66),(v1)
+f8: psubb Pq,Qq | vpsubb Vx,Hx,Wx (66),(v1)
+f9: psubw Pq,Qq | vpsubw Vx,Hx,Wx (66),(v1)
+fa: psubd Pq,Qq | vpsubd Vx,Hx,Wx (66),(v1)
+fb: psubq Pq,Qq | vpsubq Vx,Hx,Wx (66),(v1)
+fc: paddb Pq,Qq | vpaddb Vx,Hx,Wx (66),(v1)
+fd: paddw Pq,Qq | vpaddw Vx,Hx,Wx (66),(v1)
+fe: paddd Pq,Qq | vpaddd Vx,Hx,Wx (66),(v1)
 ff:
 EndTable
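
The hunk above moves the two-byte map from the old (VEX),(o128) annotations to SDM-style v-prefixed mnemonics, where (66)/(F3)/(F2) name the required last legacy prefix, (v) marks VEX-only forms (compare the 0x77 note), and (v1) marks forms limited to 128-bit VEX, as the file's header legend (outside this hunk) defines them. Below is a minimal illustrative sketch, not the kernel's gen-insn-attr-x86.awk generator, of splitting one such line on its last-prefix superscripts; the entry string is copied from row 0x58 and the helper name is made up. Compound notes such as (!F3) are ignored for brevity.

#include <stdio.h>
#include <string.h>

/* Pick the '|'-separated variant whose last-prefix superscript matches. */
static const char *pick_variant(const char *entry, const char *last_pfx)
{
	static char buf[256];
	char *tok, *save;

	strncpy(buf, entry, sizeof(buf) - 1);
	buf[sizeof(buf) - 1] = '\0';

	for (tok = strtok_r(buf, "|", &save); tok;
	     tok = strtok_r(NULL, "|", &save)) {
		int has66 = !!strstr(tok, "(66)");
		int hasf3 = !!strstr(tok, "(F3)");
		int hasf2 = !!strstr(tok, "(F2)");

		if (!strcmp(last_pfx, "66") ? has66 :
		    !strcmp(last_pfx, "F3") ? hasf3 :
		    !strcmp(last_pfx, "F2") ? hasf2 :
		    !(has66 || hasf3 || hasf2))
			return tok;	/* variant text, still space-padded */
	}
	return NULL;
}

int main(void)
{
	const char *row58 = "vaddps Vps,Hps,Wps | vaddpd Vpd,Hpd,Wpd (66) | "
			    "vaddss Vss,Hss,Wss (F3),(v1) | "
			    "vaddsd Vsd,Hsd,Wsd (F2),(v1)";

	printf("%s\n", pick_variant(row58, "F3"));	/* the vaddss variant */
	return 0;
}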
 
@@ -580,155 +595,193 @@ Table: 3-byte opcode 1 (0x0f 0x38)
 Referrer: 3-byte escape 1
 AVXcode: 2
 # 0x0f 0x38 0x00-0x0f
-00: pshufb Pq,Qq | pshufb Vdq,Wdq (66),(VEX),(o128)
-01: phaddw Pq,Qq | phaddw Vdq,Wdq (66),(VEX),(o128)
-02: phaddd Pq,Qq | phaddd Vdq,Wdq (66),(VEX),(o128)
-03: phaddsw Pq,Qq | phaddsw Vdq,Wdq (66),(VEX),(o128)
-04: pmaddubsw Pq,Qq | pmaddubsw Vdq,Wdq (66),(VEX),(o128)
-05: phsubw Pq,Qq | phsubw Vdq,Wdq (66),(VEX),(o128)
-06: phsubd Pq,Qq | phsubd Vdq,Wdq (66),(VEX),(o128)
-07: phsubsw Pq,Qq | phsubsw Vdq,Wdq (66),(VEX),(o128)
-08: psignb Pq,Qq | psignb Vdq,Wdq (66),(VEX),(o128)
-09: psignw Pq,Qq | psignw Vdq,Wdq (66),(VEX),(o128)
-0a: psignd Pq,Qq | psignd Vdq,Wdq (66),(VEX),(o128)
-0b: pmulhrsw Pq,Qq | pmulhrsw Vdq,Wdq (66),(VEX),(o128)
-0c: Vpermilps /r (66),(oVEX)
-0d: Vpermilpd /r (66),(oVEX)
-0e: vtestps /r (66),(oVEX)
-0f: vtestpd /r (66),(oVEX)
+00: pshufb Pq,Qq | vpshufb Vx,Hx,Wx (66),(v1)
+01: phaddw Pq,Qq | vphaddw Vx,Hx,Wx (66),(v1)
+02: phaddd Pq,Qq | vphaddd Vx,Hx,Wx (66),(v1)
+03: phaddsw Pq,Qq | vphaddsw Vx,Hx,Wx (66),(v1)
+04: pmaddubsw Pq,Qq | vpmaddubsw Vx,Hx,Wx (66),(v1)
+05: phsubw Pq,Qq | vphsubw Vx,Hx,Wx (66),(v1)
+06: phsubd Pq,Qq | vphsubd Vx,Hx,Wx (66),(v1)
+07: phsubsw Pq,Qq | vphsubsw Vx,Hx,Wx (66),(v1)
+08: psignb Pq,Qq | vpsignb Vx,Hx,Wx (66),(v1)
+09: psignw Pq,Qq | vpsignw Vx,Hx,Wx (66),(v1)
+0a: psignd Pq,Qq | vpsignd Vx,Hx,Wx (66),(v1)
+0b: pmulhrsw Pq,Qq | vpmulhrsw Vx,Hx,Wx (66),(v1)
+0c: vpermilps Vx,Hx,Wx (66),(v)
+0d: vpermilpd Vx,Hx,Wx (66),(v)
+0e: vtestps Vx,Wx (66),(v)
+0f: vtestpd Vx,Wx (66),(v)
 # 0x0f 0x38 0x10-0x1f
 10: pblendvb Vdq,Wdq (66)
 11:
 12:
-13:
+13: vcvtph2ps Vx,Wx,Ib (66),(v)
 14: blendvps Vdq,Wdq (66)
 15: blendvpd Vdq,Wdq (66)
-16:
-17: ptest Vdq,Wdq (66),(VEX)
-18: vbroadcastss /r (66),(oVEX)
-19: vbroadcastsd /r (66),(oVEX),(o256)
-1a: vbroadcastf128 /r (66),(oVEX),(o256)
+16: vpermps Vqq,Hqq,Wqq (66),(v)
+17: vptest Vx,Wx (66)
+18: vbroadcastss Vx,Wd (66),(v)
+19: vbroadcastsd Vqq,Wq (66),(v)
+1a: vbroadcastf128 Vqq,Mdq (66),(v)
 1b:
-1c: pabsb Pq,Qq | pabsb Vdq,Wdq (66),(VEX),(o128)
-1d: pabsw Pq,Qq | pabsw Vdq,Wdq (66),(VEX),(o128)
-1e: pabsd Pq,Qq | pabsd Vdq,Wdq (66),(VEX),(o128)
+1c: pabsb Pq,Qq | vpabsb Vx,Wx (66),(v1)
+1d: pabsw Pq,Qq | vpabsw Vx,Wx (66),(v1)
+1e: pabsd Pq,Qq | vpabsd Vx,Wx (66),(v1)
 1f:
 # 0x0f 0x38 0x20-0x2f
-20: pmovsxbw Vdq,Udq/Mq (66),(VEX),(o128)
-21: pmovsxbd Vdq,Udq/Md (66),(VEX),(o128)
-22: pmovsxbq Vdq,Udq/Mw (66),(VEX),(o128)
-23: pmovsxwd Vdq,Udq/Mq (66),(VEX),(o128)
-24: pmovsxwq Vdq,Udq/Md (66),(VEX),(o128)
-25: pmovsxdq Vdq,Udq/Mq (66),(VEX),(o128)
+20: vpmovsxbw Vx,Ux/Mq (66),(v1)
+21: vpmovsxbd Vx,Ux/Md (66),(v1)
+22: vpmovsxbq Vx,Ux/Mw (66),(v1)
+23: vpmovsxwd Vx,Ux/Mq (66),(v1)
+24: vpmovsxwq Vx,Ux/Md (66),(v1)
+25: vpmovsxdq Vx,Ux/Mq (66),(v1)
 26:
 27:
-28: pmuldq Vdq,Wdq (66),(VEX),(o128)
-29: pcmpeqq Vdq,Wdq (66),(VEX),(o128)
-2a: movntdqa Vdq,Mdq (66),(VEX),(o128)
-2b: packusdw Vdq,Wdq (66),(VEX),(o128)
-2c: vmaskmovps(ld) /r (66),(oVEX)
-2d: vmaskmovpd(ld) /r (66),(oVEX)
-2e: vmaskmovps(st) /r (66),(oVEX)
-2f: vmaskmovpd(st) /r (66),(oVEX)
+28: vpmuldq Vx,Hx,Wx (66),(v1)
+29: vpcmpeqq Vx,Hx,Wx (66),(v1)
+2a: vmovntdqa Vx,Mx (66),(v1)
+2b: vpackusdw Vx,Hx,Wx (66),(v1)
+2c: vmaskmovps Vx,Hx,Mx (66),(v)
+2d: vmaskmovpd Vx,Hx,Mx (66),(v)
+2e: vmaskmovps Mx,Hx,Vx (66),(v)
+2f: vmaskmovpd Mx,Hx,Vx (66),(v)
 # 0x0f 0x38 0x30-0x3f
-30: pmovzxbw Vdq,Udq/Mq (66),(VEX),(o128)
-31: pmovzxbd Vdq,Udq/Md (66),(VEX),(o128)
-32: pmovzxbq Vdq,Udq/Mw (66),(VEX),(o128)
-33: pmovzxwd Vdq,Udq/Mq (66),(VEX),(o128)
-34: pmovzxwq Vdq,Udq/Md (66),(VEX),(o128)
-35: pmovzxdq Vdq,Udq/Mq (66),(VEX),(o128)
-36:
-37: pcmpgtq Vdq,Wdq (66),(VEX),(o128)
-38: pminsb Vdq,Wdq (66),(VEX),(o128)
-39: pminsd Vdq,Wdq (66),(VEX),(o128)
-3a: pminuw Vdq,Wdq (66),(VEX),(o128)
-3b: pminud Vdq,Wdq (66),(VEX),(o128)
-3c: pmaxsb Vdq,Wdq (66),(VEX),(o128)
-3d: pmaxsd Vdq,Wdq (66),(VEX),(o128)
-3e: pmaxuw Vdq,Wdq (66),(VEX),(o128)
-3f: pmaxud Vdq,Wdq (66),(VEX),(o128)
+30: vpmovzxbw Vx,Ux/Mq (66),(v1)
+31: vpmovzxbd Vx,Ux/Md (66),(v1)
+32: vpmovzxbq Vx,Ux/Mw (66),(v1)
+33: vpmovzxwd Vx,Ux/Mq (66),(v1)
+34: vpmovzxwq Vx,Ux/Md (66),(v1)
+35: vpmovzxdq Vx,Ux/Mq (66),(v1)
+36: vpermd Vqq,Hqq,Wqq (66),(v)
+37: vpcmpgtq Vx,Hx,Wx (66),(v1)
+38: vpminsb Vx,Hx,Wx (66),(v1)
+39: vpminsd Vx,Hx,Wx (66),(v1)
+3a: vpminuw Vx,Hx,Wx (66),(v1)
+3b: vpminud Vx,Hx,Wx (66),(v1)
+3c: vpmaxsb Vx,Hx,Wx (66),(v1)
+3d: vpmaxsd Vx,Hx,Wx (66),(v1)
+3e: vpmaxuw Vx,Hx,Wx (66),(v1)
+3f: vpmaxud Vx,Hx,Wx (66),(v1)
 # 0x0f 0x38 0x40-0x8f
-40: pmulld Vdq,Wdq (66),(VEX),(o128)
-41: phminposuw Vdq,Wdq (66),(VEX),(o128)
-80: INVEPT Gd/q,Mdq (66)
-81: INVPID Gd/q,Mdq (66)
+40: vpmulld Vx,Hx,Wx (66),(v1)
+41: vphminposuw Vdq,Wdq (66),(v1)
+42:
+43:
+44:
+45: vpsrlvd/q Vx,Hx,Wx (66),(v)
+46: vpsravd Vx,Hx,Wx (66),(v)
+47: vpsllvd/q Vx,Hx,Wx (66),(v)
+# Skip 0x48-0x57
+58: vpbroadcastd Vx,Wx (66),(v)
+59: vpbroadcastq Vx,Wx (66),(v)
+5a: vbroadcasti128 Vqq,Mdq (66),(v)
+# Skip 0x5b-0x77
+78: vpbroadcastb Vx,Wx (66),(v)
+79: vpbroadcastw Vx,Wx (66),(v)
+# Skip 0x7a-0x7f
+80: INVEPT Gy,Mdq (66)
+81: INVPID Gy,Mdq (66)
+82: INVPCID Gy,Mdq (66)
+8c: vpmaskmovd/q Vx,Hx,Mx (66),(v)
+8e: vpmaskmovd/q Mx,Vx,Hx (66),(v)
 # 0x0f 0x38 0x90-0xbf (FMA)
-96: vfmaddsub132pd/ps /r (66),(VEX)
-97: vfmsubadd132pd/ps /r (66),(VEX)
-98: vfmadd132pd/ps /r (66),(VEX)
-99: vfmadd132sd/ss /r (66),(VEX),(o128)
-9a: vfmsub132pd/ps /r (66),(VEX)
-9b: vfmsub132sd/ss /r (66),(VEX),(o128)
-9c: vfnmadd132pd/ps /r (66),(VEX)
-9d: vfnmadd132sd/ss /r (66),(VEX),(o128)
-9e: vfnmsub132pd/ps /r (66),(VEX)
-9f: vfnmsub132sd/ss /r (66),(VEX),(o128)
-a6: vfmaddsub213pd/ps /r (66),(VEX)
-a7: vfmsubadd213pd/ps /r (66),(VEX)
-a8: vfmadd213pd/ps /r (66),(VEX)
-a9: vfmadd213sd/ss /r (66),(VEX),(o128)
-aa: vfmsub213pd/ps /r (66),(VEX)
-ab: vfmsub213sd/ss /r (66),(VEX),(o128)
-ac: vfnmadd213pd/ps /r (66),(VEX)
-ad: vfnmadd213sd/ss /r (66),(VEX),(o128)
-ae: vfnmsub213pd/ps /r (66),(VEX)
-af: vfnmsub213sd/ss /r (66),(VEX),(o128)
-b6: vfmaddsub231pd/ps /r (66),(VEX)
-b7: vfmsubadd231pd/ps /r (66),(VEX)
-b8: vfmadd231pd/ps /r (66),(VEX)
-b9: vfmadd231sd/ss /r (66),(VEX),(o128)
-ba: vfmsub231pd/ps /r (66),(VEX)
-bb: vfmsub231sd/ss /r (66),(VEX),(o128)
-bc: vfnmadd231pd/ps /r (66),(VEX)
-bd: vfnmadd231sd/ss /r (66),(VEX),(o128)
-be: vfnmsub231pd/ps /r (66),(VEX)
-bf: vfnmsub231sd/ss /r (66),(VEX),(o128)
+90: vgatherdd/q Vx,Hx,Wx (66),(v)
+91: vgatherqd/q Vx,Hx,Wx (66),(v)
+92: vgatherdps/d Vx,Hx,Wx (66),(v)
+93: vgatherqps/d Vx,Hx,Wx (66),(v)
+94:
+95:
+96: vfmaddsub132ps/d Vx,Hx,Wx (66),(v)
+97: vfmsubadd132ps/d Vx,Hx,Wx (66),(v)
+98: vfmadd132ps/d Vx,Hx,Wx (66),(v)
+99: vfmadd132ss/d Vx,Hx,Wx (66),(v),(v1)
+9a: vfmsub132ps/d Vx,Hx,Wx (66),(v)
+9b: vfmsub132ss/d Vx,Hx,Wx (66),(v),(v1)
+9c: vfnmadd132ps/d Vx,Hx,Wx (66),(v)
+9d: vfnmadd132ss/d Vx,Hx,Wx (66),(v),(v1)
+9e: vfnmsub132ps/d Vx,Hx,Wx (66),(v)
+9f: vfnmsub132ss/d Vx,Hx,Wx (66),(v),(v1)
+a6: vfmaddsub213ps/d Vx,Hx,Wx (66),(v)
+a7: vfmsubadd213ps/d Vx,Hx,Wx (66),(v)
+a8: vfmadd213ps/d Vx,Hx,Wx (66),(v)
+a9: vfmadd213ss/d Vx,Hx,Wx (66),(v),(v1)
+aa: vfmsub213ps/d Vx,Hx,Wx (66),(v)
+ab: vfmsub213ss/d Vx,Hx,Wx (66),(v),(v1)
+ac: vfnmadd213ps/d Vx,Hx,Wx (66),(v)
+ad: vfnmadd213ss/d Vx,Hx,Wx (66),(v),(v1)
+ae: vfnmsub213ps/d Vx,Hx,Wx (66),(v)
+af: vfnmsub213ss/d Vx,Hx,Wx (66),(v),(v1)
+b6: vfmaddsub231ps/d Vx,Hx,Wx (66),(v)
+b7: vfmsubadd231ps/d Vx,Hx,Wx (66),(v)
+b8: vfmadd231ps/d Vx,Hx,Wx (66),(v)
+b9: vfmadd231ss/d Vx,Hx,Wx (66),(v),(v1)
+ba: vfmsub231ps/d Vx,Hx,Wx (66),(v)
+bb: vfmsub231ss/d Vx,Hx,Wx (66),(v),(v1)
+bc: vfnmadd231ps/d Vx,Hx,Wx (66),(v)
+bd: vfnmadd231ss/d Vx,Hx,Wx (66),(v),(v1)
+be: vfnmsub231ps/d Vx,Hx,Wx (66),(v)
+bf: vfnmsub231ss/d Vx,Hx,Wx (66),(v),(v1)
 # 0x0f 0x38 0xc0-0xff
-db: aesimc Vdq,Wdq (66),(VEX),(o128)
-dc: aesenc Vdq,Wdq (66),(VEX),(o128)
-dd: aesenclast Vdq,Wdq (66),(VEX),(o128)
-de: aesdec Vdq,Wdq (66),(VEX),(o128)
-df: aesdeclast Vdq,Wdq (66),(VEX),(o128)
-f0: MOVBE Gv,Mv | CRC32 Gd,Eb (F2)
-f1: MOVBE Mv,Gv | CRC32 Gd,Ev (F2)
+db: VAESIMC Vdq,Wdq (66),(v1)
+dc: VAESENC Vdq,Hdq,Wdq (66),(v1)
+dd: VAESENCLAST Vdq,Hdq,Wdq (66),(v1)
+de: VAESDEC Vdq,Hdq,Wdq (66),(v1)
+df: VAESDECLAST Vdq,Hdq,Wdq (66),(v1)
+f0: MOVBE Gy,My | MOVBE Gw,Mw (66) | CRC32 Gd,Eb (F2)
+f1: MOVBE My,Gy | MOVBE Mw,Gw (66) | CRC32 Gd,Ey (F2)
+f3: ANDN Gy,By,Ey (v)
+f4: Grp17 (1A)
+f5: BZHI Gy,Ey,By (v) | PEXT Gy,By,Ey (F3),(v) | PDEP Gy,By,Ey (F2),(v)
+f6: MULX By,Gy,rDX,Ey (F2),(v)
+f7: BEXTR Gy,Ey,By (v) | SHLX Gy,Ey,By (66),(v) | SARX Gy,Ey,By (F3),(v) | SHRX Gy,Ey,By (F2),(v)
 EndTable
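
In the table just closed, an entry such as "f4: Grp17 (1A)" does not name an instruction directly: the (1A) attribute is this map's group escape, meaning the ModRM.reg field selects a row of the matching GrpTable further down (Grp17 carries the new BMI1 operations). A hypothetical helper, shown only to make that dispatch concrete:

/* ModRM.reg is bits 5:3; e.g. reg == 1 under "f4: Grp17 (1A)" selects
 * the BLSR row of Grp17 below.  Helper name is illustrative only. */
static inline unsigned int modrm_reg(unsigned char modrm)
{
	return (modrm >> 3) & 0x7;
}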
 
 Table: 3-byte opcode 2 (0x0f 0x3a)
 Referrer: 3-byte escape 2
 AVXcode: 3
 # 0x0f 0x3a 0x00-0xff
-04: vpermilps /r,Ib (66),(oVEX)
-05: vpermilpd /r,Ib (66),(oVEX)
-06: vperm2f128 /r,Ib (66),(oVEX),(o256)
-08: roundps Vdq,Wdq,Ib (66),(VEX)
-09: roundpd Vdq,Wdq,Ib (66),(VEX)
-0a: roundss Vss,Wss,Ib (66),(VEX),(o128)
-0b: roundsd Vsd,Wsd,Ib (66),(VEX),(o128)
-0c: blendps Vdq,Wdq,Ib (66),(VEX)
-0d: blendpd Vdq,Wdq,Ib (66),(VEX)
-0e: pblendw Vdq,Wdq,Ib (66),(VEX),(o128)
-0f: palignr Pq,Qq,Ib | palignr Vdq,Wdq,Ib (66),(VEX),(o128)
-14: pextrb Rd/Mb,Vdq,Ib (66),(VEX),(o128)
-15: pextrw Rd/Mw,Vdq,Ib (66),(VEX),(o128)
-16: pextrd/pextrq Ed/q,Vdq,Ib (66),(VEX),(o128)
-17: extractps Ed,Vdq,Ib (66),(VEX),(o128)
-18: vinsertf128 /r,Ib (66),(oVEX),(o256)
-19: vextractf128 /r,Ib (66),(oVEX),(o256)
-20: pinsrb Vdq,Rd/q/Mb,Ib (66),(VEX),(o128)
-21: insertps Vdq,Udq/Md,Ib (66),(VEX),(o128)
-22: pinsrd/pinsrq Vdq,Ed/q,Ib (66),(VEX),(o128)
-40: dpps Vdq,Wdq,Ib (66),(VEX)
-41: dppd Vdq,Wdq,Ib (66),(VEX),(o128)
-42: mpsadbw Vdq,Wdq,Ib (66),(VEX),(o128)
-44: pclmulq Vdq,Wdq,Ib (66),(VEX),(o128)
-4a: vblendvps /r,Ib (66),(oVEX)
-4b: vblendvpd /r,Ib (66),(oVEX)
-4c: vpblendvb /r,Ib (66),(oVEX),(o128)
-60: pcmpestrm Vdq,Wdq,Ib (66),(VEX),(o128)
-61: pcmpestri Vdq,Wdq,Ib (66),(VEX),(o128)
-62: pcmpistrm Vdq,Wdq,Ib (66),(VEX),(o128)
-63: pcmpistri Vdq,Wdq,Ib (66),(VEX),(o128)
-df: aeskeygenassist Vdq,Wdq,Ib (66),(VEX),(o128)
+00: vpermq Vqq,Wqq,Ib (66),(v)
+01: vpermpd Vqq,Wqq,Ib (66),(v)
+02: vpblendd Vx,Hx,Wx,Ib (66),(v)
+03:
+04: vpermilps Vx,Wx,Ib (66),(v)
+05: vpermilpd Vx,Wx,Ib (66),(v)
+06: vperm2f128 Vqq,Hqq,Wqq,Ib (66),(v)
+07:
+08: vroundps Vx,Wx,Ib (66)
+09: vroundpd Vx,Wx,Ib (66)
+0a: vroundss Vss,Wss,Ib (66),(v1)
+0b: vroundsd Vsd,Wsd,Ib (66),(v1)
+0c: vblendps Vx,Hx,Wx,Ib (66)
+0d: vblendpd Vx,Hx,Wx,Ib (66)
+0e: vpblendw Vx,Hx,Wx,Ib (66),(v1)
+0f: palignr Pq,Qq,Ib | vpalignr Vx,Hx,Wx,Ib (66),(v1)
+14: vpextrb Rd/Mb,Vdq,Ib (66),(v1)
+15: vpextrw Rd/Mw,Vdq,Ib (66),(v1)
+16: vpextrd/q Ey,Vdq,Ib (66),(v1)
+17: vextractps Ed,Vdq,Ib (66),(v1)
+18: vinsertf128 Vqq,Hqq,Wqq,Ib (66),(v)
+19: vextractf128 Wdq,Vqq,Ib (66),(v)
+1d: vcvtps2ph Wx,Vx,Ib (66),(v)
+20: vpinsrb Vdq,Hdq,Ry/Mb,Ib (66),(v1)
+21: vinsertps Vdq,Hdq,Udq/Md,Ib (66),(v1)
+22: vpinsrd/q Vdq,Hdq,Ey,Ib (66),(v1)
+38: vinserti128 Vqq,Hqq,Wqq,Ib (66),(v)
+39: vextracti128 Wdq,Vqq,Ib (66),(v)
+40: vdpps Vx,Hx,Wx,Ib (66)
+41: vdppd Vdq,Hdq,Wdq,Ib (66),(v1)
+42: vmpsadbw Vx,Hx,Wx,Ib (66),(v1)
+44: vpclmulqdq Vdq,Hdq,Wdq,Ib (66),(v1)
+46: vperm2i128 Vqq,Hqq,Wqq,Ib (66),(v)
+4a: vblendvps Vx,Hx,Wx,Lx (66),(v)
+4b: vblendvpd Vx,Hx,Wx,Lx (66),(v)
+4c: vpblendvb Vx,Hx,Wx,Lx (66),(v1)
+60: vpcmpestrm Vdq,Wdq,Ib (66),(v1)
+61: vpcmpestri Vdq,Wdq,Ib (66),(v1)
+62: vpcmpistrm Vdq,Wdq,Ib (66),(v1)
+63: vpcmpistri Vdq,Wdq,Ib (66),(v1)
+df: VAESKEYGEN Vdq,Wdq,Ib (66),(v1)
+f0: RORX Gy,Ey,Ib (F2),(v)
 EndTable
 
 GrpTable: Grp1
@@ -790,7 +843,7 @@ GrpTable: Grp5
 2: CALLN Ev (f64)
 3: CALLF Ep
 4: JMPN Ev (f64)
-5: JMPF Ep
+5: JMPF Mp
 6: PUSH Ev (d64)
 7:
 EndTable
@@ -807,7 +860,7 @@ EndTable
 GrpTable: Grp7
 0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B)
 1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001)
-2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B)
+2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B)
 3: LIDT Ms
 4: SMSW Mw/Rv
 5:
@@ -824,44 +877,45 @@ EndTable
 
 GrpTable: Grp9
 1: CMPXCHG8B/16B Mq/Mdq
-6: VMPTRLD Mq | VMCLEAR Mq (66) | VMXON Mq (F3)
-7: VMPTRST Mq
+6: VMPTRLD Mq | VMCLEAR Mq (66) | VMXON Mq (F3) | RDRAND Rv (11B)
+7: VMPTRST Mq | VMPTRST Mq (F3)
 EndTable
 
 GrpTable: Grp10
 EndTable
 
 GrpTable: Grp11
+# Note: the operands are given by group opcode
 0: MOV
 EndTable
 
 GrpTable: Grp12
-2: psrlw Nq,Ib (11B) | psrlw Udq,Ib (66),(11B),(VEX),(o128)
-4: psraw Nq,Ib (11B) | psraw Udq,Ib (66),(11B),(VEX),(o128)
-6: psllw Nq,Ib (11B) | psllw Udq,Ib (66),(11B),(VEX),(o128)
+2: psrlw Nq,Ib (11B) | vpsrlw Hx,Ux,Ib (66),(11B),(v1)
+4: psraw Nq,Ib (11B) | vpsraw Hx,Ux,Ib (66),(11B),(v1)
+6: psllw Nq,Ib (11B) | vpsllw Hx,Ux,Ib (66),(11B),(v1)
 EndTable
 
 GrpTable: Grp13
-2: psrld Nq,Ib (11B) | psrld Udq,Ib (66),(11B),(VEX),(o128)
-4: psrad Nq,Ib (11B) | psrad Udq,Ib (66),(11B),(VEX),(o128)
-6: pslld Nq,Ib (11B) | pslld Udq,Ib (66),(11B),(VEX),(o128)
+2: psrld Nq,Ib (11B) | vpsrld Hx,Ux,Ib (66),(11B),(v1)
+4: psrad Nq,Ib (11B) | vpsrad Hx,Ux,Ib (66),(11B),(v1)
+6: pslld Nq,Ib (11B) | vpslld Hx,Ux,Ib (66),(11B),(v1)
 EndTable
 
 GrpTable: Grp14
-2: psrlq Nq,Ib (11B) | psrlq Udq,Ib (66),(11B),(VEX),(o128)
-3: psrldq Udq,Ib (66),(11B),(VEX),(o128)
-6: psllq Nq,Ib (11B) | psllq Udq,Ib (66),(11B),(VEX),(o128)
-7: pslldq Udq,Ib (66),(11B),(VEX),(o128)
+2: psrlq Nq,Ib (11B) | vpsrlq Hx,Ux,Ib (66),(11B),(v1)
+3: vpsrldq Hx,Ux,Ib (66),(11B),(v1)
+6: psllq Nq,Ib (11B) | vpsllq Hx,Ux,Ib (66),(11B),(v1)
+7: vpslldq Hx,Ux,Ib (66),(11B),(v1)
 EndTable
 
 GrpTable: Grp15
-0: fxsave
-1: fxstor
-2: ldmxcsr (VEX)
-3: stmxcsr (VEX)
+0: fxsave | RDFSBASE Ry (F3),(11B)
+1: fxstor | RDGSBASE Ry (F3),(11B)
+2: vldmxcsr Md (v1) | WRFSBASE Ry (F3),(11B)
+3: vstmxcsr Md (v1) | WRGSBASE Ry (F3),(11B)
 4: XSAVE
 5: XRSTOR | lfence (11B)
-6: mfence (11B)
+6: XSAVEOPT | mfence (11B)
 7: clflush | sfence (11B)
 EndTable
 
@@ -872,6 +926,12 @@ GrpTable: Grp16
 3: prefetch T2
 EndTable
 
+GrpTable: Grp17
+1: BLSR By,Ey (v)
+2: BLSMSK By,Ey (v)
+3: BLSI By,Ey (v)
+EndTable
+
 # AMD's Prefetch Group
 GrpTable: GrpP
 0: PREFETCH
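
The Grp17 table added above holds the BMI1 bit-manipulation group reached through 0f 38 f4. Their data results follow the usual lowest-set-bit identities; a small C reference model (flag effects omitted, not decoder code):

static unsigned long blsr(unsigned long x)	/* clear lowest set bit */
{
	return x & (x - 1);
}

static unsigned long blsmsk(unsigned long x)	/* mask up to and including it */
{
	return x ^ (x - 1);
}

static unsigned long blsi(unsigned long x)	/* isolate lowest set bit */
{
	return x & -x;
}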
index 3d11327..23d8e5f 100644 (file)
@@ -27,6 +27,4 @@ obj-$(CONFIG_AMD_NUMA)                += amdtopology.o
 obj-$(CONFIG_ACPI_NUMA)                += srat.o
 obj-$(CONFIG_NUMA_EMU)         += numa_emulation.o
 
-obj-$(CONFIG_HAVE_MEMBLOCK)            += memblock.o
-
 obj-$(CONFIG_MEMTEST)          += memtest.o
index d0474ad..1fb85db 100644 (file)
@@ -25,7 +25,7 @@ int fixup_exception(struct pt_regs *regs)
        if (fixup) {
                /* If fixup is less than 16, it means uaccess error */
                if (fixup->fixup < 16) {
-                       current_thread_info()->uaccess_err = -EFAULT;
+                       current_thread_info()->uaccess_err = 1;
                        regs->ip += fixup->fixup;
                        return 1;
                }
index 5db0490..9d74824 100644 (file)
@@ -626,7 +626,7 @@ pgtable_bad(struct pt_regs *regs, unsigned long error_code,
 
 static noinline void
 no_context(struct pt_regs *regs, unsigned long error_code,
-          unsigned long address)
+          unsigned long address, int signal, int si_code)
 {
        struct task_struct *tsk = current;
        unsigned long *stackend;
@@ -634,8 +634,17 @@ no_context(struct pt_regs *regs, unsigned long error_code,
        int sig;
 
        /* Are we prepared to handle this kernel fault? */
-       if (fixup_exception(regs))
+       if (fixup_exception(regs)) {
+               if (current_thread_info()->sig_on_uaccess_error && signal) {
+                       tsk->thread.trap_no = 14;
+                       tsk->thread.error_code = error_code | PF_USER;
+                       tsk->thread.cr2 = address;
+
+                       /* XXX: hwpoison faults will set the wrong code. */
+                       force_sig_info_fault(signal, si_code, address, tsk, 0);
+               }
                return;
+       }
 
        /*
         * 32-bit:
@@ -755,7 +764,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
        if (is_f00f_bug(regs, address))
                return;
 
-       no_context(regs, error_code, address);
+       no_context(regs, error_code, address, SIGSEGV, si_code);
 }
 
 static noinline void
@@ -819,7 +828,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
 
        /* Kernel mode? Handle exceptions or die: */
        if (!(error_code & PF_USER)) {
-               no_context(regs, error_code, address);
+               no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
                return;
        }
 
@@ -854,7 +863,7 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
                if (!(fault & VM_FAULT_RETRY))
                        up_read(&current->mm->mmap_sem);
                if (!(error_code & PF_USER))
-                       no_context(regs, error_code, address);
+                       no_context(regs, error_code, address, 0, 0);
                return 1;
        }
        if (!(fault & VM_FAULT_ERROR))
@@ -864,7 +873,8 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
                /* Kernel mode? Handle exceptions or die: */
                if (!(error_code & PF_USER)) {
                        up_read(&current->mm->mmap_sem);
-                       no_context(regs, error_code, address);
+                       no_context(regs, error_code, address,
+                                  SIGSEGV, SEGV_MAPERR);
                        return 1;
                }
 
index 87488b9..a298914 100644 (file)
@@ -67,7 +67,7 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
        good_end = max_pfn_mapped << PAGE_SHIFT;
 
        base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE);
-       if (base == MEMBLOCK_ERROR)
+       if (!base)
                panic("Cannot find space for the kernel page tables");
 
        pgt_buf_start = base >> PAGE_SHIFT;
@@ -80,7 +80,7 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
 
 void __init native_pagetable_reserve(u64 start, u64 end)
 {
-       memblock_x86_reserve_range(start, end, "PGTABLE");
+       memblock_reserve(start, end - start);
 }
 
 struct map_range {
@@ -279,8 +279,8 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
         * pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top)
         * so that they can be reused for other purposes.
         *
-        * On native it just means calling memblock_x86_reserve_range, on Xen it
-        * also means marking RW the pagetable pages that we allocated before
+        * On native it just means calling memblock_reserve, on Xen it also
+        * means marking RW the pagetable pages that we allocated before
         * but that haven't been used.
         *
         * In fact on xen we mark RO the whole range pgt_buf_start -
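
This hunk shows the calling convention that replaces the x86-private wrappers deleted later in this series (arch/x86/mm/memblock.c below): failure from memblock_find_in_range() is now a zero return instead of MEMBLOCK_ERROR, and reservations take a (base, size) pair rather than (start, end, name). A hedged before/after sketch with a made-up caller, mirroring the converted code above:

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/memblock.h>

/* Hypothetical caller illustrating the converted pattern. */
static void __init reserve_scratch_area(phys_addr_t start, phys_addr_t end,
					phys_addr_t size)
{
	phys_addr_t base;

	/* old: a return of MEMBLOCK_ERROR signalled failure */
	base = memblock_find_in_range(start, end, size, PAGE_SIZE);
	if (!base)
		panic("Cannot find space for scratch area");

	/* old: memblock_x86_reserve_range(base, base + size, "SCRATCH"); */
	memblock_reserve(base, size);
}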
index 29f7c6d..0c1da39 100644 (file)
@@ -427,23 +427,17 @@ static void __init add_one_highpage_init(struct page *page)
 void __init add_highpages_with_active_regions(int nid,
                         unsigned long start_pfn, unsigned long end_pfn)
 {
-       struct range *range;
-       int nr_range;
-       int i;
-
-       nr_range = __get_free_all_memory_range(&range, nid, start_pfn, end_pfn);
-
-       for (i = 0; i < nr_range; i++) {
-               struct page *page;
-               int node_pfn;
-
-               for (node_pfn = range[i].start; node_pfn < range[i].end;
-                    node_pfn++) {
-                       if (!pfn_valid(node_pfn))
-                               continue;
-                       page = pfn_to_page(node_pfn);
-                       add_one_highpage_init(page);
-               }
+       phys_addr_t start, end;
+       u64 i;
+
+       for_each_free_mem_range(i, nid, &start, &end, NULL) {
+               unsigned long pfn = clamp_t(unsigned long, PFN_UP(start),
+                                           start_pfn, end_pfn);
+               unsigned long e_pfn = clamp_t(unsigned long, PFN_DOWN(end),
+                                             start_pfn, end_pfn);
+               for ( ; pfn < e_pfn; pfn++)
+                       if (pfn_valid(pfn))
+                               add_one_highpage_init(pfn_to_page(pfn));
        }
 }
 #else
@@ -650,18 +644,18 @@ void __init initmem_init(void)
        highstart_pfn = highend_pfn = max_pfn;
        if (max_pfn > max_low_pfn)
                highstart_pfn = max_low_pfn;
-       memblock_x86_register_active_regions(0, 0, highend_pfn);
-       sparse_memory_present_with_active_regions(0);
        printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
                pages_to_mb(highend_pfn - highstart_pfn));
        num_physpages = highend_pfn;
        high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
 #else
-       memblock_x86_register_active_regions(0, 0, max_low_pfn);
-       sparse_memory_present_with_active_regions(0);
        num_physpages = max_low_pfn;
        high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
 #endif
+
+       memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0);
+       sparse_memory_present_with_active_regions(0);
+
 #ifdef CONFIG_FLATMEM
        max_mapnr = num_physpages;
 #endif
index bbaaa00..a8a56ce 100644 (file)
@@ -608,7 +608,7 @@ kernel_physical_mapping_init(unsigned long start,
 #ifndef CONFIG_NUMA
 void __init initmem_init(void)
 {
-       memblock_x86_register_active_regions(0, 0, max_pfn);
+       memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0);
 }
 #endif
 
diff --git a/arch/x86/mm/memblock.c b/arch/x86/mm/memblock.c
deleted file mode 100644 (file)
index 992da5e..0000000
+++ /dev/null
@@ -1,348 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/bitops.h>
-#include <linux/memblock.h>
-#include <linux/bootmem.h>
-#include <linux/mm.h>
-#include <linux/range.h>
-
-/* Check for already reserved areas */
-bool __init memblock_x86_check_reserved_size(u64 *addrp, u64 *sizep, u64 align)
-{
-       struct memblock_region *r;
-       u64 addr = *addrp, last;
-       u64 size = *sizep;
-       bool changed = false;
-
-again:
-       last = addr + size;
-       for_each_memblock(reserved, r) {
-               if (last > r->base && addr < r->base) {
-                       size = r->base - addr;
-                       changed = true;
-                       goto again;
-               }
-               if (last > (r->base + r->size) && addr < (r->base + r->size)) {
-                       addr = round_up(r->base + r->size, align);
-                       size = last - addr;
-                       changed = true;
-                       goto again;
-               }
-               if (last <= (r->base + r->size) && addr >= r->base) {
-                       *sizep = 0;
-                       return false;
-               }
-       }
-       if (changed) {
-               *addrp = addr;
-               *sizep = size;
-       }
-       return changed;
-}
-
-/*
- * Find next free range after start, and size is returned in *sizep
- */
-u64 __init memblock_x86_find_in_range_size(u64 start, u64 *sizep, u64 align)
-{
-       struct memblock_region *r;
-
-       for_each_memblock(memory, r) {
-               u64 ei_start = r->base;
-               u64 ei_last = ei_start + r->size;
-               u64 addr;
-
-               addr = round_up(ei_start, align);
-               if (addr < start)
-                       addr = round_up(start, align);
-               if (addr >= ei_last)
-                       continue;
-               *sizep = ei_last - addr;
-               while (memblock_x86_check_reserved_size(&addr, sizep, align))
-                       ;
-
-               if (*sizep)
-                       return addr;
-       }
-
-       return MEMBLOCK_ERROR;
-}
-
-static __init struct range *find_range_array(int count)
-{
-       u64 end, size, mem;
-       struct range *range;
-
-       size = sizeof(struct range) * count;
-       end = memblock.current_limit;
-
-       mem = memblock_find_in_range(0, end, size, sizeof(struct range));
-       if (mem == MEMBLOCK_ERROR)
-               panic("can not find more space for range array");
-
-       /*
-        * This range is tempoaray, so don't reserve it, it will not be
-        * overlapped because We will not alloccate new buffer before
-        * We discard this one
-        */
-       range = __va(mem);
-       memset(range, 0, size);
-
-       return range;
-}
-
-static void __init memblock_x86_subtract_reserved(struct range *range, int az)
-{
-       u64 final_start, final_end;
-       struct memblock_region *r;
-
-       /* Take out region array itself at first*/
-       memblock_free_reserved_regions();
-
-       memblock_dbg("Subtract (%ld early reservations)\n", memblock.reserved.cnt);
-
-       for_each_memblock(reserved, r) {
-               memblock_dbg("  [%010llx-%010llx]\n", (u64)r->base, (u64)r->base + r->size - 1);
-               final_start = PFN_DOWN(r->base);
-               final_end = PFN_UP(r->base + r->size);
-               if (final_start >= final_end)
-                       continue;
-               subtract_range(range, az, final_start, final_end);
-       }
-
-       /* Put region array back ? */
-       memblock_reserve_reserved_regions();
-}
-
-struct count_data {
-       int nr;
-};
-
-static int __init count_work_fn(unsigned long start_pfn,
-                               unsigned long end_pfn, void *datax)
-{
-       struct count_data *data = datax;
-
-       data->nr++;
-
-       return 0;
-}
-
-static int __init count_early_node_map(int nodeid)
-{
-       struct count_data data;
-
-       data.nr = 0;
-       work_with_active_regions(nodeid, count_work_fn, &data);
-
-       return data.nr;
-}
-
-int __init __get_free_all_memory_range(struct range **rangep, int nodeid,
-                        unsigned long start_pfn, unsigned long end_pfn)
-{
-       int count;
-       struct range *range;
-       int nr_range;
-
-       count = (memblock.reserved.cnt + count_early_node_map(nodeid)) * 2;
-
-       range = find_range_array(count);
-       nr_range = 0;
-
-       /*
-        * Use early_node_map[] and memblock.reserved.region to get range array
-        * at first
-        */
-       nr_range = add_from_early_node_map(range, count, nr_range, nodeid);
-       subtract_range(range, count, 0, start_pfn);
-       subtract_range(range, count, end_pfn, -1ULL);
-
-       memblock_x86_subtract_reserved(range, count);
-       nr_range = clean_sort_range(range, count);
-
-       *rangep = range;
-       return nr_range;
-}
-
-int __init get_free_all_memory_range(struct range **rangep, int nodeid)
-{
-       unsigned long end_pfn = -1UL;
-
-#ifdef CONFIG_X86_32
-       end_pfn = max_low_pfn;
-#endif
-       return __get_free_all_memory_range(rangep, nodeid, 0, end_pfn);
-}
-
-static u64 __init __memblock_x86_memory_in_range(u64 addr, u64 limit, bool get_free)
-{
-       int i, count;
-       struct range *range;
-       int nr_range;
-       u64 final_start, final_end;
-       u64 free_size;
-       struct memblock_region *r;
-
-       count = (memblock.reserved.cnt + memblock.memory.cnt) * 2;
-
-       range = find_range_array(count);
-       nr_range = 0;
-
-       addr = PFN_UP(addr);
-       limit = PFN_DOWN(limit);
-
-       for_each_memblock(memory, r) {
-               final_start = PFN_UP(r->base);
-               final_end = PFN_DOWN(r->base + r->size);
-               if (final_start >= final_end)
-                       continue;
-               if (final_start >= limit || final_end <= addr)
-                       continue;
-
-               nr_range = add_range(range, count, nr_range, final_start, final_end);
-       }
-       subtract_range(range, count, 0, addr);
-       subtract_range(range, count, limit, -1ULL);
-
-       /* Subtract memblock.reserved.region in range ? */
-       if (!get_free)
-               goto sort_and_count_them;
-       for_each_memblock(reserved, r) {
-               final_start = PFN_DOWN(r->base);
-               final_end = PFN_UP(r->base + r->size);
-               if (final_start >= final_end)
-                       continue;
-               if (final_start >= limit || final_end <= addr)
-                       continue;
-
-               subtract_range(range, count, final_start, final_end);
-       }
-
-sort_and_count_them:
-       nr_range = clean_sort_range(range, count);
-
-       free_size = 0;
-       for (i = 0; i < nr_range; i++)
-               free_size += range[i].end - range[i].start;
-
-       return free_size << PAGE_SHIFT;
-}
-
-u64 __init memblock_x86_free_memory_in_range(u64 addr, u64 limit)
-{
-       return __memblock_x86_memory_in_range(addr, limit, true);
-}
-
-u64 __init memblock_x86_memory_in_range(u64 addr, u64 limit)
-{
-       return __memblock_x86_memory_in_range(addr, limit, false);
-}
-
-void __init memblock_x86_reserve_range(u64 start, u64 end, char *name)
-{
-       if (start == end)
-               return;
-
-       if (WARN_ONCE(start > end, "memblock_x86_reserve_range: wrong range [%#llx, %#llx)\n", start, end))
-               return;
-
-       memblock_dbg("    memblock_x86_reserve_range: [%#010llx-%#010llx] %16s\n", start, end - 1, name);
-
-       memblock_reserve(start, end - start);
-}
-
-void __init memblock_x86_free_range(u64 start, u64 end)
-{
-       if (start == end)
-               return;
-
-       if (WARN_ONCE(start > end, "memblock_x86_free_range: wrong range [%#llx, %#llx)\n", start, end))
-               return;
-
-       memblock_dbg("       memblock_x86_free_range: [%#010llx-%#010llx]\n", start, end - 1);
-
-       memblock_free(start, end - start);
-}
-
-/*
- * Need to call this function after memblock_x86_register_active_regions,
- * so early_node_map[] is filled already.
- */
-u64 __init memblock_x86_find_in_range_node(int nid, u64 start, u64 end, u64 size, u64 align)
-{
-       u64 addr;
-       addr = find_memory_core_early(nid, size, align, start, end);
-       if (addr != MEMBLOCK_ERROR)
-               return addr;
-
-       /* Fallback, should already have start end within node range */
-       return memblock_find_in_range(start, end, size, align);
-}
-
-/*
- * Finds an active region in the address range from start_pfn to last_pfn and
- * returns its range in ei_startpfn and ei_endpfn for the memblock entry.
- */
-static int __init memblock_x86_find_active_region(const struct memblock_region *ei,
-                                 unsigned long start_pfn,
-                                 unsigned long last_pfn,
-                                 unsigned long *ei_startpfn,
-                                 unsigned long *ei_endpfn)
-{
-       u64 align = PAGE_SIZE;
-
-       *ei_startpfn = round_up(ei->base, align) >> PAGE_SHIFT;
-       *ei_endpfn = round_down(ei->base + ei->size, align) >> PAGE_SHIFT;
-
-       /* Skip map entries smaller than a page */
-       if (*ei_startpfn >= *ei_endpfn)
-               return 0;
-
-       /* Skip if map is outside the node */
-       if (*ei_endpfn <= start_pfn || *ei_startpfn >= last_pfn)
-               return 0;
-
-       /* Check for overlaps */
-       if (*ei_startpfn < start_pfn)
-               *ei_startpfn = start_pfn;
-       if (*ei_endpfn > last_pfn)
-               *ei_endpfn = last_pfn;
-
-       return 1;
-}
-
-/* Walk the memblock.memory map and register active regions within a node */
-void __init memblock_x86_register_active_regions(int nid, unsigned long start_pfn,
-                                        unsigned long last_pfn)
-{
-       unsigned long ei_startpfn;
-       unsigned long ei_endpfn;
-       struct memblock_region *r;
-
-       for_each_memblock(memory, r)
-               if (memblock_x86_find_active_region(r, start_pfn, last_pfn,
-                                          &ei_startpfn, &ei_endpfn))
-                       add_active_range(nid, ei_startpfn, ei_endpfn);
-}
-
-/*
- * Find the hole size (in bytes) in the memory range.
- * @start: starting address of the memory range to scan
- * @end: ending address of the memory range to scan
- */
-u64 __init memblock_x86_hole_size(u64 start, u64 end)
-{
-       unsigned long start_pfn = start >> PAGE_SHIFT;
-       unsigned long last_pfn = end >> PAGE_SHIFT;
-       unsigned long ei_startpfn, ei_endpfn, ram = 0;
-       struct memblock_region *r;
-
-       for_each_memblock(memory, r)
-               if (memblock_x86_find_active_region(r, start_pfn, last_pfn,
-                                          &ei_startpfn, &ei_endpfn))
-                       ram += ei_endpfn - ei_startpfn;
-
-       return end - start - ((u64)ram << PAGE_SHIFT);
-}
index 92faf3a..c80b9fb 100644 (file)
@@ -34,7 +34,7 @@ static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad)
               (unsigned long long) pattern,
               (unsigned long long) start_bad,
               (unsigned long long) end_bad);
-       memblock_x86_reserve_range(start_bad, end_bad, "BAD RAM");
+       memblock_reserve(start_bad, end_bad - start_bad);
 }
 
 static void __init memtest(u64 pattern, u64 start_phys, u64 size)
@@ -70,24 +70,19 @@ static void __init memtest(u64 pattern, u64 start_phys, u64 size)
 
 static void __init do_one_pass(u64 pattern, u64 start, u64 end)
 {
-       u64 size = 0;
-
-       while (start < end) {
-               start = memblock_x86_find_in_range_size(start, &size, 1);
-
-               /* done ? */
-               if (start >= end)
-                       break;
-               if (start + size > end)
-                       size = end - start;
-
-               printk(KERN_INFO "  %010llx - %010llx pattern %016llx\n",
-                      (unsigned long long) start,
-                      (unsigned long long) start + size,
-                      (unsigned long long) cpu_to_be64(pattern));
-               memtest(pattern, start, size);
-
-               start += size;
+       u64 i;
+       phys_addr_t this_start, this_end;
+
+       for_each_free_mem_range(i, MAX_NUMNODES, &this_start, &this_end, NULL) {
+               this_start = clamp_t(phys_addr_t, this_start, start, end);
+               this_end = clamp_t(phys_addr_t, this_end, start, end);
+               if (this_start < this_end) {
+                       printk(KERN_INFO "  %010llx - %010llx pattern %016llx\n",
+                              (unsigned long long)this_start,
+                              (unsigned long long)this_end,
+                              (unsigned long long)cpu_to_be64(pattern));
+                       memtest(pattern, this_start, this_end - this_start);
+               }
        }
 }
 
index fbeaaf4..496f494 100644 (file)
@@ -192,8 +192,6 @@ int __init numa_add_memblk(int nid, u64 start, u64 end)
 /* Initialize NODE_DATA for a node on the local memory */
 static void __init setup_node_data(int nid, u64 start, u64 end)
 {
-       const u64 nd_low = PFN_PHYS(MAX_DMA_PFN);
-       const u64 nd_high = PFN_PHYS(max_pfn_mapped);
        const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
        bool remapped = false;
        u64 nd_pa;
@@ -224,17 +222,12 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
                nd_pa = __pa(nd);
                remapped = true;
        } else {
-               nd_pa = memblock_x86_find_in_range_node(nid, nd_low, nd_high,
-                                               nd_size, SMP_CACHE_BYTES);
-               if (nd_pa == MEMBLOCK_ERROR)
-                       nd_pa = memblock_find_in_range(nd_low, nd_high,
-                                               nd_size, SMP_CACHE_BYTES);
-               if (nd_pa == MEMBLOCK_ERROR) {
+               nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);
+               if (!nd_pa) {
                        pr_err("Cannot find %zu bytes in node %d\n",
                               nd_size, nid);
                        return;
                }
-               memblock_x86_reserve_range(nd_pa, nd_pa + nd_size, "NODE_DATA");
                nd = __va(nd_pa);
        }
 
@@ -371,8 +364,7 @@ void __init numa_reset_distance(void)
 
        /* numa_distance could be 1LU marking allocation failure, test cnt */
        if (numa_distance_cnt)
-               memblock_x86_free_range(__pa(numa_distance),
-                                       __pa(numa_distance) + size);
+               memblock_free(__pa(numa_distance), size);
        numa_distance_cnt = 0;
        numa_distance = NULL;   /* enable table creation */
 }
@@ -395,13 +387,13 @@ static int __init numa_alloc_distance(void)
 
        phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
                                      size, PAGE_SIZE);
-       if (phys == MEMBLOCK_ERROR) {
+       if (!phys) {
                pr_warning("NUMA: Warning: can't allocate distance table!\n");
                /* don't retry until explicitly reset */
                numa_distance = (void *)1LU;
                return -ENOMEM;
        }
-       memblock_x86_reserve_range(phys, phys + size, "NUMA DIST");
+       memblock_reserve(phys, size);
 
        numa_distance = __va(phys);
        numa_distance_cnt = cnt;
@@ -482,8 +474,8 @@ static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
                        numaram = 0;
        }
 
-       e820ram = max_pfn - (memblock_x86_hole_size(0,
-                                       PFN_PHYS(max_pfn)) >> PAGE_SHIFT);
+       e820ram = max_pfn - absent_pages_in_range(0, max_pfn);
+
        /* We seem to lose 3 pages somewhere. Allow 1M of slack. */
        if ((s64)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) {
                printk(KERN_ERR "NUMA: nodes only cover %LuMB of your %LuMB e820 RAM. Not used.\n",
@@ -505,13 +497,10 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
        if (WARN_ON(nodes_empty(node_possible_map)))
                return -EINVAL;
 
-       for (i = 0; i < mi->nr_blks; i++)
-               memblock_x86_register_active_regions(mi->blk[i].nid,
-                                       mi->blk[i].start >> PAGE_SHIFT,
-                                       mi->blk[i].end >> PAGE_SHIFT);
-
-       /* for out of order entries */
-       sort_node_map();
+       for (i = 0; i < mi->nr_blks; i++) {
+               struct numa_memblk *mb = &mi->blk[i];
+               memblock_set_node(mb->start, mb->end - mb->start, mb->nid);
+       }
 
        /*
         * If sections array is gonna be used for pfn -> nid mapping, check
@@ -545,6 +534,8 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
                        setup_node_data(nid, start, end);
        }
 
+       /* Dump memblock with node info and return. */
+       memblock_dump_all();
        return 0;
 }
 
@@ -582,7 +573,7 @@ static int __init numa_init(int (*init_func)(void))
        nodes_clear(node_possible_map);
        nodes_clear(node_online_map);
        memset(&numa_meminfo, 0, sizeof(numa_meminfo));
-       remove_all_active_ranges();
+       WARN_ON(memblock_set_node(0, ULLONG_MAX, MAX_NUMNODES));
        numa_reset_distance();
 
        ret = init_func();
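
The numa.c conversion above drops early_node_map bookkeeping in favour of recording node ownership directly in memblock: memblock_set_node(base, size, nid) stands in for memblock_x86_register_active_regions() plus sort_node_map(), and per-node allocations go through memblock_alloc_nid(), which now reports failure as zero. A compact sketch with an invented function name, under those assumptions:

#include <linux/cache.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/memblock.h>

/* Hypothetical per-node setup mirroring the converted paths above. */
static int __init register_fake_node(int nid, phys_addr_t start, phys_addr_t end)
{
	phys_addr_t nd_pa;

	/* Attach [start, end) to nid directly in memblock. */
	if (memblock_set_node(start, end - start, nid))
		return -EINVAL;

	/* Node-local allocation for per-node data; zero means failure. */
	nd_pa = memblock_alloc_nid(PAGE_SIZE, SMP_CACHE_BYTES, nid);
	if (!nd_pa)
		return -ENOMEM;

	return 0;
}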
index 3adebe7..534255a 100644 (file)
@@ -199,23 +199,23 @@ void __init init_alloc_remap(int nid, u64 start, u64 end)
 
        /* allocate node memory and the lowmem remap area */
        node_pa = memblock_find_in_range(start, end, size, LARGE_PAGE_BYTES);
-       if (node_pa == MEMBLOCK_ERROR) {
+       if (!node_pa) {
                pr_warning("remap_alloc: failed to allocate %lu bytes for node %d\n",
                           size, nid);
                return;
        }
-       memblock_x86_reserve_range(node_pa, node_pa + size, "KVA RAM");
+       memblock_reserve(node_pa, size);
 
        remap_pa = memblock_find_in_range(min_low_pfn << PAGE_SHIFT,
                                          max_low_pfn << PAGE_SHIFT,
                                          size, LARGE_PAGE_BYTES);
-       if (remap_pa == MEMBLOCK_ERROR) {
+       if (!remap_pa) {
                pr_warning("remap_alloc: failed to allocate %lu bytes remap area for node %d\n",
                           size, nid);
-               memblock_x86_free_range(node_pa, node_pa + size);
+               memblock_free(node_pa, size);
                return;
        }
-       memblock_x86_reserve_range(remap_pa, remap_pa + size, "KVA PG");
+       memblock_reserve(remap_pa, size);
        remap_va = phys_to_virt(remap_pa);
 
        /* perform actual remap */
index dd27f40..92e2711 100644 (file)
@@ -19,7 +19,7 @@ unsigned long __init numa_free_all_bootmem(void)
        for_each_online_node(i)
                pages += free_all_bootmem_node(NODE_DATA(i));
 
-       pages += free_all_memory_core_early(MAX_NUMNODES);
+       pages += free_low_memory_core_early(MAX_NUMNODES);
 
        return pages;
 }
index d0ed086..46db568 100644 (file)
@@ -28,6 +28,16 @@ static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
        return -ENOENT;
 }
 
+static u64 mem_hole_size(u64 start, u64 end)
+{
+       unsigned long start_pfn = PFN_UP(start);
+       unsigned long end_pfn = PFN_DOWN(end);
+
+       if (start_pfn < end_pfn)
+               return PFN_PHYS(absent_pages_in_range(start_pfn, end_pfn));
+       return 0;
+}
+
 /*
  * Sets up nid to range from @start to @end.  The return value is -errno if
  * something went wrong, 0 otherwise.
@@ -89,7 +99,7 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei,
         * Calculate target node size.  x86_32 freaks on __udivdi3() so do
         * the division in ulong number of pages and convert back.
         */
-       size = max_addr - addr - memblock_x86_hole_size(addr, max_addr);
+       size = max_addr - addr - mem_hole_size(addr, max_addr);
        size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes);
 
        /*
@@ -135,8 +145,7 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei,
                         * Continue to add memory to this fake node if its
                         * non-reserved memory is less than the per-node size.
                         */
-                       while (end - start -
-                              memblock_x86_hole_size(start, end) < size) {
+                       while (end - start - mem_hole_size(start, end) < size) {
                                end += FAKE_NODE_MIN_SIZE;
                                if (end > limit) {
                                        end = limit;
@@ -150,7 +159,7 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei,
                         * this one must extend to the boundary.
                         */
                        if (end < dma32_end && dma32_end - end -
-                           memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
+                           mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
                                end = dma32_end;
 
                        /*
@@ -158,8 +167,7 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei,
                         * next node, this one must extend to the end of the
                         * physical node.
                         */
-                       if (limit - end -
-                           memblock_x86_hole_size(end, limit) < size)
+                       if (limit - end - mem_hole_size(end, limit) < size)
                                end = limit;
 
                        ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes,
@@ -180,7 +188,7 @@ static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
 {
        u64 end = start + size;
 
-       while (end - start - memblock_x86_hole_size(start, end) < size) {
+       while (end - start - mem_hole_size(start, end) < size) {
                end += FAKE_NODE_MIN_SIZE;
                if (end > max_addr) {
                        end = max_addr;
@@ -211,8 +219,7 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
         * creates a uniform distribution of node sizes across the entire
         * machine (but not necessarily over physical nodes).
         */
-       min_size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) /
-                                               MAX_NUMNODES;
+       min_size = (max_addr - addr - mem_hole_size(addr, max_addr)) / MAX_NUMNODES;
        min_size = max(min_size, FAKE_NODE_MIN_SIZE);
        if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
                min_size = (min_size + FAKE_NODE_MIN_SIZE) &
@@ -252,7 +259,7 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
                         * this one must extend to the boundary.
                         */
                        if (end < dma32_end && dma32_end - end -
-                           memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
+                           mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
                                end = dma32_end;
 
                        /*
@@ -260,8 +267,7 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
                         * next node, this one must extend to the end of the
                         * physical node.
                         */
-                       if (limit - end -
-                           memblock_x86_hole_size(end, limit) < size)
+                       if (limit - end - mem_hole_size(end, limit) < size)
                                end = limit;
 
                        ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES,
@@ -351,11 +357,11 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
 
                phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
                                              phys_size, PAGE_SIZE);
-               if (phys == MEMBLOCK_ERROR) {
+               if (!phys) {
                        pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
                        goto no_emu;
                }
-               memblock_x86_reserve_range(phys, phys + phys_size, "TMP NUMA DIST");
+               memblock_reserve(phys, phys_size);
                phys_dist = __va(phys);
 
                for (i = 0; i < numa_dist_cnt; i++)
@@ -424,7 +430,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
 
        /* free the copied physical distance table */
        if (phys_dist)
-               memblock_x86_free_range(__pa(phys_dist), __pa(phys_dist) + phys_size);
+               memblock_free(__pa(phys_dist), phys_size);
        return;
 
 no_emu:
index f9e5267..eda2acb 100644 (file)
@@ -998,7 +998,7 @@ out_err:
 }
 EXPORT_SYMBOL(set_memory_uc);
 
-int _set_memory_array(unsigned long *addr, int addrinarray,
+static int _set_memory_array(unsigned long *addr, int addrinarray,
                unsigned long new_type)
 {
        int i, j;
index 81dbfde..fd61b3f 100644 (file)
@@ -69,6 +69,12 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
        if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
                return;
        pxm = pa->proximity_domain;
+       apic_id = pa->apic_id;
+       if (!cpu_has_x2apic && (apic_id >= 0xff)) {
+               printk(KERN_INFO "SRAT: PXM %u -> X2APIC 0x%04x ignored\n",
+                        pxm, apic_id);
+               return;
+       }
        node = setup_node(pxm);
        if (node < 0) {
                printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
@@ -76,7 +82,6 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
                return;
        }
 
-       apic_id = pa->apic_id;
        if (apic_id >= MAX_LOCAL_APIC) {
                printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
                return;
index 446902b..1599f56 100644 (file)
@@ -4,9 +4,8 @@ DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \
                oprof.o cpu_buffer.o buffer_sync.o \
                event_buffer.o oprofile_files.o \
                oprofilefs.o oprofile_stats.o  \
-               timer_int.o )
+               timer_int.o nmi_timer_int.o )
 
 oprofile-y                             := $(DRIVER_OBJS) init.o backtrace.o
 oprofile-$(CONFIG_X86_LOCAL_APIC)      += nmi_int.o op_model_amd.o \
                                           op_model_ppro.o op_model_p4.o
-oprofile-$(CONFIG_X86_IO_APIC)         += nmi_timer_int.o
index f148cf6..9e138d0 100644 (file)
  * with the NMI mode driver.
  */
 
+#ifdef CONFIG_X86_LOCAL_APIC
 extern int op_nmi_init(struct oprofile_operations *ops);
-extern int op_nmi_timer_init(struct oprofile_operations *ops);
 extern void op_nmi_exit(void);
-extern void x86_backtrace(struct pt_regs * const regs, unsigned int depth);
+#else
+static int op_nmi_init(struct oprofile_operations *ops) { return -ENODEV; }
+static void op_nmi_exit(void) { }
+#endif
 
-static int nmi_timer;
+extern void x86_backtrace(struct pt_regs * const regs, unsigned int depth);
 
 int __init oprofile_arch_init(struct oprofile_operations *ops)
 {
-       int ret;
-
-       ret = -ENODEV;
-
-#ifdef CONFIG_X86_LOCAL_APIC
-       ret = op_nmi_init(ops);
-#endif
-       nmi_timer = (ret != 0);
-#ifdef CONFIG_X86_IO_APIC
-       if (nmi_timer)
-               ret = op_nmi_timer_init(ops);
-#endif
        ops->backtrace = x86_backtrace;
-
-       return ret;
+       return op_nmi_init(ops);
 }
 
-
 void oprofile_arch_exit(void)
 {
-#ifdef CONFIG_X86_LOCAL_APIC
-       if (!nmi_timer)
-               op_nmi_exit();
-#endif
+       op_nmi_exit();
 }
index 75f9528..26b8a85 100644 (file)
@@ -595,24 +595,36 @@ static int __init p4_init(char **cpu_type)
        return 0;
 }
 
-static int force_arch_perfmon;
-static int force_cpu_type(const char *str, struct kernel_param *kp)
+enum __force_cpu_type {
+       reserved = 0,           /* do not force */
+       timer,
+       arch_perfmon,
+};
+
+static int force_cpu_type;
+
+static int set_cpu_type(const char *str, struct kernel_param *kp)
 {
-       if (!strcmp(str, "arch_perfmon")) {
-               force_arch_perfmon = 1;
+       if (!strcmp(str, "timer")) {
+               force_cpu_type = timer;
+               printk(KERN_INFO "oprofile: forcing NMI timer mode\n");
+       } else if (!strcmp(str, "arch_perfmon")) {
+               force_cpu_type = arch_perfmon;
                printk(KERN_INFO "oprofile: forcing architectural perfmon\n");
+       } else {
+               force_cpu_type = 0;
        }
 
        return 0;
 }
-module_param_call(cpu_type, force_cpu_type, NULL, NULL, 0);
+module_param_call(cpu_type, set_cpu_type, NULL, NULL, 0);
 
 static int __init ppro_init(char **cpu_type)
 {
        __u8 cpu_model = boot_cpu_data.x86_model;
        struct op_x86_model_spec *spec = &op_ppro_spec; /* default */
 
-       if (force_arch_perfmon && cpu_has_arch_perfmon)
+       if (force_cpu_type == arch_perfmon && cpu_has_arch_perfmon)
                return 0;
 
        /*
@@ -679,6 +691,9 @@ int __init op_nmi_init(struct oprofile_operations *ops)
        if (!cpu_has_apic)
                return -ENODEV;
 
+       if (force_cpu_type == timer)
+               return -ENODEV;
+
        switch (vendor) {
        case X86_VENDOR_AMD:
                /* Needs to be at least an Athlon (or hammer in 32bit mode) */
diff --git a/arch/x86/oprofile/nmi_timer_int.c b/arch/x86/oprofile/nmi_timer_int.c
deleted file mode 100644 (file)
index 7f8052c..0000000
+++ /dev/null
@@ -1,50 +0,0 @@
-/**
- * @file nmi_timer_int.c
- *
- * @remark Copyright 2003 OProfile authors
- * @remark Read the file COPYING
- *
- * @author Zwane Mwaikambo <zwane@linuxpower.ca>
- */
-
-#include <linux/init.h>
-#include <linux/smp.h>
-#include <linux/errno.h>
-#include <linux/oprofile.h>
-#include <linux/rcupdate.h>
-#include <linux/kdebug.h>
-
-#include <asm/nmi.h>
-#include <asm/apic.h>
-#include <asm/ptrace.h>
-
-static int profile_timer_exceptions_notify(unsigned int val, struct pt_regs *regs)
-{
-       oprofile_add_sample(regs, 0);
-       return NMI_HANDLED;
-}
-
-static int timer_start(void)
-{
-       if (register_nmi_handler(NMI_LOCAL, profile_timer_exceptions_notify,
-                                       0, "oprofile-timer"))
-               return 1;
-       return 0;
-}
-
-
-static void timer_stop(void)
-{
-       unregister_nmi_handler(NMI_LOCAL, "oprofile-timer");
-       synchronize_sched();  /* Allow already-started NMIs to complete. */
-}
-
-
-int __init op_nmi_timer_init(struct oprofile_operations *ops)
-{
-       ops->start = timer_start;
-       ops->stop = timer_stop;
-       ops->cpu_type = "timer";
-       printk(KERN_INFO "oprofile: using NMI timer interrupt.\n");
-       return 0;
-}
index 37718f0..4cf9bd0 100644 (file)
@@ -238,7 +238,8 @@ static efi_status_t __init phys_efi_get_time(efi_time_t *tm,
 
        spin_lock_irqsave(&rtc_lock, flags);
        efi_call_phys_prelog();
-       status = efi_call_phys2(efi_phys.get_time, tm, tc);
+       status = efi_call_phys2(efi_phys.get_time, virt_to_phys(tm),
+                               virt_to_phys(tc));
        efi_call_phys_epilog();
        spin_unlock_irqrestore(&rtc_lock, flags);
        return status;
@@ -352,8 +353,7 @@ void __init efi_memblock_x86_reserve_range(void)
                boot_params.efi_info.efi_memdesc_size;
        memmap.desc_version = boot_params.efi_info.efi_memdesc_version;
        memmap.desc_size = boot_params.efi_info.efi_memdesc_size;
-       memblock_x86_reserve_range(pmap, pmap + memmap.nr_map * memmap.desc_size,
-                     "EFI memmap");
+       memblock_reserve(pmap, memmap.nr_map * memmap.desc_size);
 }
 
 #if EFI_DEBUG
@@ -397,16 +397,14 @@ void __init efi_reserve_boot_services(void)
                if ((start+size >= virt_to_phys(_text)
                                && start <= virt_to_phys(_end)) ||
                        !e820_all_mapped(start, start+size, E820_RAM) ||
-                       memblock_x86_check_reserved_size(&start, &size,
-                                                       1<<EFI_PAGE_SHIFT)) {
+                       memblock_is_region_reserved(start, size)) {
                        /* Could not reserve, skip it */
                        md->num_pages = 0;
                        memblock_dbg(PFX "Could not reserve boot range "
                                        "[0x%010llx-0x%010llx]\n",
                                                start, start+size-1);
                } else
-                       memblock_x86_reserve_range(start, start+size,
-                                                       "EFI Boot");
+                       memblock_reserve(start, size);
        }
 }
 
index f820826..d511aa9 100644 (file)
@@ -18,14 +18,21 @@ chkobjdump = $(srctree)/arch/x86/tools/chkobjdump.awk
 quiet_cmd_posttest = TEST    $@
       cmd_posttest = ($(OBJDUMP) -v | $(AWK) -f $(chkobjdump)) || $(OBJDUMP) -d -j .text $(objtree)/vmlinux | $(AWK) -f $(distill_awk) | $(obj)/test_get_len $(posttest_64bit) $(posttest_verbose)
 
-posttest: $(obj)/test_get_len vmlinux
+quiet_cmd_sanitytest = TEST    $@
+      cmd_sanitytest = $(obj)/insn_sanity $(posttest_64bit) -m 1000000
+
+posttest: $(obj)/test_get_len vmlinux $(obj)/insn_sanity
        $(call cmd,posttest)
+       $(call cmd,sanitytest)
 
-hostprogs-y    := test_get_len
+hostprogs-y    += test_get_len insn_sanity
 
 # -I needed for generated C source and C source which in the kernel tree.
 HOSTCFLAGS_test_get_len.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include/ -I$(srctree)/arch/x86/lib/ -I$(srctree)/include/
 
+HOSTCFLAGS_insn_sanity.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include/ -I$(srctree)/arch/x86/lib/ -I$(srctree)/include/
+
 # Dependencies are also needed.
 $(obj)/test_get_len.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c
 
+$(obj)/insn_sanity.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c
index eaf11f5..5f6a5b6 100644 (file)
@@ -47,7 +47,7 @@ BEGIN {
        sep_expr = "^\\|$"
        group_expr = "^Grp[0-9A-Za-z]+"
 
-       imm_expr = "^[IJAO][a-z]"
+       imm_expr = "^[IJAOL][a-z]"
        imm_flag["Ib"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)"
        imm_flag["Jb"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)"
        imm_flag["Iw"] = "INAT_MAKE_IMM(INAT_IMM_WORD)"
@@ -59,6 +59,7 @@ BEGIN {
        imm_flag["Iv"] = "INAT_MAKE_IMM(INAT_IMM_VWORD)"
        imm_flag["Ob"] = "INAT_MOFFSET"
        imm_flag["Ov"] = "INAT_MOFFSET"
+       imm_flag["Lx"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)"
 
        modrm_expr = "^([CDEGMNPQRSUVW/][a-z]+|NTA|T[012])"
        force64_expr = "\\([df]64\\)"
@@ -70,8 +71,12 @@ BEGIN {
        lprefix3_expr = "\\(F2\\)"
        max_lprefix = 4
 
-       vexok_expr = "\\(VEX\\)"
-       vexonly_expr = "\\(oVEX\\)"
+       # All opcodes starting with lower-case 'v' or with (v1) superscript
+       # accept VEX prefix
+       vexok_opcode_expr = "^v.*"
+       vexok_expr = "\\(v1\\)"
+       # All opcodes with (v) superscript support *only* VEX prefix
+       vexonly_expr = "\\(v\\)"
 
        prefix_expr = "\\(Prefix\\)"
        prefix_num["Operand-Size"] = "INAT_PFX_OPNDSZ"
@@ -85,8 +90,8 @@ BEGIN {
        prefix_num["SEG=GS"] = "INAT_PFX_GS"
        prefix_num["SEG=SS"] = "INAT_PFX_SS"
        prefix_num["Address-Size"] = "INAT_PFX_ADDRSZ"
-       prefix_num["2bytes-VEX"] = "INAT_PFX_VEX2"
-       prefix_num["3bytes-VEX"] = "INAT_PFX_VEX3"
+       prefix_num["VEX+1byte"] = "INAT_PFX_VEX2"
+       prefix_num["VEX+2byte"] = "INAT_PFX_VEX3"
 
        clear_vars()
 }
@@ -310,12 +315,10 @@ function convert_operands(count,opnd,       i,j,imm,mod)
                if (match(opcode, fpu_expr))
                        flags = add_flags(flags, "INAT_MODRM")
 
-               # check VEX only code
+               # check VEX codes
                if (match(ext, vexonly_expr))
                        flags = add_flags(flags, "INAT_VEXOK | INAT_VEXONLY")
-
-               # check VEX only code
-               if (match(ext, vexok_expr))
+               else if (match(ext, vexok_expr) || match(opcode, vexok_opcode_expr))
                        flags = add_flags(flags, "INAT_VEXOK")
 
                # check prefixes
diff --git a/arch/x86/tools/insn_sanity.c b/arch/x86/tools/insn_sanity.c
new file mode 100644 (file)
index 0000000..cc2f8c1
--- /dev/null
@@ -0,0 +1,275 @@
+/*
+ * x86 decoder sanity test - based on test_get_insn.c
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2009
+ * Copyright (C) Hitachi, Ltd., 2011
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#define unlikely(cond) (cond)
+#define ARRAY_SIZE(a)  (sizeof(a)/sizeof(a[0]))
+
+#include <asm/insn.h>
+#include <inat.c>
+#include <insn.c>
+
+/*
+ * Test of instruction analysis against tampering.
+ * Feed random binary to instruction decoder and ensure not to
+ * access out-of-instruction-buffer.
+ */
+
+#define DEFAULT_MAX_ITER       10000
+#define INSN_NOP 0x90
+
+static const char      *prog;          /* Program name */
+static int             verbose;        /* Verbosity */
+static int             x86_64;         /* x86-64 bit mode flag */
+static unsigned int    seed;           /* Random seed */
+static unsigned long   iter_start;     /* Start of iteration number */
+static unsigned long   iter_end = DEFAULT_MAX_ITER;    /* End of iteration number */
+static FILE            *input_file;    /* Input file name */
+
+static void usage(const char *err)
+{
+       if (err)
+               fprintf(stderr, "Error: %s\n\n", err);
+       fprintf(stderr, "Usage: %s [-y|-n|-v] [-s seed[,no]] [-m max] [-i input]\n", prog);
+       fprintf(stderr, "\t-y   64bit mode\n");
+       fprintf(stderr, "\t-n   32bit mode\n");
+       fprintf(stderr, "\t-v   Verbosity(-vv dumps any decoded result)\n");
+       fprintf(stderr, "\t-s   Give a random seed (and iteration number)\n");
+       fprintf(stderr, "\t-m   Give a maximum iteration number\n");
+       fprintf(stderr, "\t-i   Give an input file with decoded binary\n");
+       exit(1);
+}
+
+static void dump_field(FILE *fp, const char *name, const char *indent,
+                      struct insn_field *field)
+{
+       fprintf(fp, "%s.%s = {\n", indent, name);
+       fprintf(fp, "%s\t.value = %d, bytes[] = {%x, %x, %x, %x},\n",
+               indent, field->value, field->bytes[0], field->bytes[1],
+               field->bytes[2], field->bytes[3]);
+       fprintf(fp, "%s\t.got = %d, .nbytes = %d},\n", indent,
+               field->got, field->nbytes);
+}
+
+static void dump_insn(FILE *fp, struct insn *insn)
+{
+       fprintf(fp, "Instruction = {\n");
+       dump_field(fp, "prefixes", "\t",        &insn->prefixes);
+       dump_field(fp, "rex_prefix", "\t",      &insn->rex_prefix);
+       dump_field(fp, "vex_prefix", "\t",      &insn->vex_prefix);
+       dump_field(fp, "opcode", "\t",          &insn->opcode);
+       dump_field(fp, "modrm", "\t",           &insn->modrm);
+       dump_field(fp, "sib", "\t",             &insn->sib);
+       dump_field(fp, "displacement", "\t",    &insn->displacement);
+       dump_field(fp, "immediate1", "\t",      &insn->immediate1);
+       dump_field(fp, "immediate2", "\t",      &insn->immediate2);
+       fprintf(fp, "\t.attr = %x, .opnd_bytes = %d, .addr_bytes = %d,\n",
+               insn->attr, insn->opnd_bytes, insn->addr_bytes);
+       fprintf(fp, "\t.length = %d, .x86_64 = %d, .kaddr = %p}\n",
+               insn->length, insn->x86_64, insn->kaddr);
+}
+
+static void dump_stream(FILE *fp, const char *msg, unsigned long nr_iter,
+                       unsigned char *insn_buf, struct insn *insn)
+{
+       int i;
+
+       fprintf(fp, "%s:\n", msg);
+
+       dump_insn(fp, insn);
+
+       fprintf(fp, "You can reproduce this with below command(s);\n");
+
+       /* Input a decoded instruction sequence directly */
+       fprintf(fp, " $ echo ");
+       for (i = 0; i < MAX_INSN_SIZE; i++)
+               fprintf(fp, " %02x", insn_buf[i]);
+       fprintf(fp, " | %s -i -\n", prog);
+
+       if (!input_file) {
+               fprintf(fp, "Or \n");
+               /* Give a seed and iteration number */
+               fprintf(fp, " $ %s -s 0x%x,%lu\n", prog, seed, nr_iter);
+       }
+}
+
+static void init_random_seed(void)
+{
+       int fd;
+
+       fd = open("/dev/urandom", O_RDONLY);
+       if (fd < 0)
+               goto fail;
+
+       if (read(fd, &seed, sizeof(seed)) != sizeof(seed))
+               goto fail;
+
+       close(fd);
+       return;
+fail:
+       usage("Failed to open /dev/urandom");
+}
+
+/* Read given instruction sequence from the input file */
+static int read_next_insn(unsigned char *insn_buf)
+{
+       char buf[256]  = "", *tmp;
+       int i;
+
+       tmp = fgets(buf, ARRAY_SIZE(buf), input_file);
+       if (tmp == NULL || feof(input_file))
+               return 0;
+
+       for (i = 0; i < MAX_INSN_SIZE; i++) {
+               insn_buf[i] = (unsigned char)strtoul(tmp, &tmp, 16);
+               if (*tmp != ' ')
+                       break;
+       }
+
+       return i;
+}
+
+static int generate_insn(unsigned char *insn_buf)
+{
+       int i;
+
+       if (input_file)
+               return read_next_insn(insn_buf);
+
+       /* Fills buffer with random binary up to MAX_INSN_SIZE */
+       for (i = 0; i < MAX_INSN_SIZE - 1; i += 2)
+               *(unsigned short *)(&insn_buf[i]) = random() & 0xffff;
+
+       while (i < MAX_INSN_SIZE)
+               insn_buf[i++] = random() & 0xff;
+
+       return i;
+}
+
+static void parse_args(int argc, char **argv)
+{
+       int c;
+       char *tmp = NULL;
+       int set_seed = 0;
+
+       prog = argv[0];
+       while ((c = getopt(argc, argv, "ynvs:m:i:")) != -1) {
+               switch (c) {
+               case 'y':
+                       x86_64 = 1;
+                       break;
+               case 'n':
+                       x86_64 = 0;
+                       break;
+               case 'v':
+                       verbose++;
+                       break;
+               case 'i':
+                       if (strcmp("-", optarg) == 0)
+                               input_file = stdin;
+                       else
+                               input_file = fopen(optarg, "r");
+                       if (!input_file)
+                               usage("Failed to open input file");
+                       break;
+               case 's':
+                       seed = (unsigned int)strtoul(optarg, &tmp, 0);
+                       if (*tmp == ',') {
+                               optarg = tmp + 1;
+                               iter_start = strtoul(optarg, &tmp, 0);
+                       }
+                       if (*tmp != '\0' || tmp == optarg)
+                               usage("Failed to parse seed");
+                       set_seed = 1;
+                       break;
+               case 'm':
+                       iter_end = strtoul(optarg, &tmp, 0);
+                       if (*tmp != '\0' || tmp == optarg)
+                               usage("Failed to parse max_iter");
+                       break;
+               default:
+                       usage(NULL);
+               }
+       }
+
+       /* Check errors */
+       if (iter_end < iter_start)
+               usage("Max iteration number must be bigger than iter-num");
+
+       if (set_seed && input_file)
+               usage("Don't use input file (-i) with random seed (-s)");
+
+       /* Initialize random seed */
+       if (!input_file) {
+               if (!set_seed)  /* No seed is given */
+                       init_random_seed();
+               srand(seed);
+       }
+}
+
+int main(int argc, char **argv)
+{
+       struct insn insn;
+       int insns = 0;
+       int errors = 0;
+       unsigned long i;
+       unsigned char insn_buf[MAX_INSN_SIZE * 2];
+
+       parse_args(argc, argv);
+
+       /* Prepare stop bytes with NOPs */
+       memset(insn_buf + MAX_INSN_SIZE, INSN_NOP, MAX_INSN_SIZE);
+
+       for (i = 0; i < iter_end; i++) {
+               if (generate_insn(insn_buf) <= 0)
+                       break;
+
+               if (i < iter_start)     /* Skip to given iteration number */
+                       continue;
+
+               /* Decode an instruction */
+               insn_init(&insn, insn_buf, x86_64);
+               insn_get_length(&insn);
+
+               if (insn.next_byte <= insn.kaddr ||
+                   insn.kaddr + MAX_INSN_SIZE < insn.next_byte) {
+                       /* Access out-of-range memory */
+                       dump_stream(stderr, "Error: Found an access violation", i, insn_buf, &insn);
+                       errors++;
+               } else if (verbose && !insn_complete(&insn))
+                       dump_stream(stdout, "Info: Found an undecodable input", i, insn_buf, &insn);
+               else if (verbose >= 2)
+                       dump_insn(stdout, &insn);
+               insns++;
+       }
+
+       fprintf(stdout, "%s: decoded and checked %d %s instructions with %d errors (seed:0x%x)\n", (errors) ? "Failure" : "Success", insns, (input_file) ? "given" : "random", errors, seed);
+
+       return errors ? 1 : 0;
+}
index 1f92865..12eb07b 100644 (file)
@@ -1215,8 +1215,6 @@ asmlinkage void __init xen_start_kernel(void)
        local_irq_disable();
        early_boot_irqs_disabled = true;
 
-       memblock_init();
-
        xen_raw_console_write("mapping kernel into physical memory\n");
        pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages);
        xen_ident_map_ISA();
index 87f6673..f4bf8aa 100644 (file)
@@ -1774,10 +1774,8 @@ pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd,
        __xen_write_cr3(true, __pa(pgd));
        xen_mc_issue(PARAVIRT_LAZY_CPU);
 
-       memblock_x86_reserve_range(__pa(xen_start_info->pt_base),
-                     __pa(xen_start_info->pt_base +
-                          xen_start_info->nr_pt_frames * PAGE_SIZE),
-                     "XEN PAGETABLES");
+       memblock_reserve(__pa(xen_start_info->pt_base),
+                        xen_start_info->nr_pt_frames * PAGE_SIZE);
 
        return pgd;
 }
@@ -1853,10 +1851,8 @@ pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd,
                          PFN_DOWN(__pa(initial_page_table)));
        xen_write_cr3(__pa(initial_page_table));
 
-       memblock_x86_reserve_range(__pa(xen_start_info->pt_base),
-                     __pa(xen_start_info->pt_base +
-                          xen_start_info->nr_pt_frames * PAGE_SIZE),
-                     "XEN PAGETABLES");
+       memblock_reserve(__pa(xen_start_info->pt_base),
+                        xen_start_info->nr_pt_frames * PAGE_SIZE);
 
        return initial_page_table;
 }
index b2c7179..e03c636 100644 (file)
@@ -75,7 +75,7 @@ static void __init xen_add_extra_mem(u64 start, u64 size)
        if (i == XEN_EXTRA_MEM_MAX_REGIONS)
                printk(KERN_WARNING "Warning: not enough extra memory regions\n");
 
-       memblock_x86_reserve_range(start, start + size, "XEN EXTRA");
+       memblock_reserve(start, size);
 
        xen_max_p2m_pfn = PFN_DOWN(start + size);
 
@@ -311,9 +311,8 @@ char * __init xen_memory_setup(void)
         *  - xen_start_info
         * See comment above "struct start_info" in <xen/interface/xen.h>
         */
-       memblock_x86_reserve_range(__pa(xen_start_info->mfn_list),
-                     __pa(xen_start_info->pt_base),
-                       "XEN START INFO");
+       memblock_reserve(__pa(xen_start_info->mfn_list),
+                        xen_start_info->pt_base - xen_start_info->mfn_list);
 
        sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
 
index f3e5eb4..ac62f9c 100644 (file)
@@ -41,14 +41,6 @@ static struct clocksource ccount_clocksource = {
        .rating = 200,
        .read = ccount_read,
        .mask = CLOCKSOURCE_MASK(32),
-       /*
-        * With a shift of 22 the lower limit of the cpu clock is
-        * 1MHz, where NSEC_PER_CCOUNT is 1000 or a bit less than
-        * 2^10: Since we have 32 bits and the multiplicator can
-        * already take up as much as 10 bits, this leaves us with
-        * remaining upper 22 bits.
-        */
-       .shift = 22,
 };
 
 static irqreturn_t timer_interrupt(int irq, void *dev_id);
@@ -66,10 +58,7 @@ void __init time_init(void)
        printk("%d.%02d MHz\n", (int)ccount_per_jiffy/(1000000/HZ),
                        (int)(ccount_per_jiffy/(10000/HZ))%100);
 #endif
-       ccount_clocksource.mult =
-               clocksource_hz2mult(CCOUNT_PER_JIFFY * HZ,
-                               ccount_clocksource.shift);
-       clocksource_register(&ccount_clocksource);
+       clocksource_register_hz(&ccount_clocksource, CCOUNT_PER_JIFFY * HZ);
 
        /* Initialize the linux timer interrupt. */
 
index ca939fc..d510c2a 100644 (file)
@@ -179,6 +179,26 @@ int __blkdev_driver_ioctl(struct block_device *bdev, fmode_t mode,
  */
 EXPORT_SYMBOL_GPL(__blkdev_driver_ioctl);
 
+/*
+ * Is it an unrecognized ioctl? The correct returns are either
+ * ENOTTY (final) or ENOIOCTLCMD ("I don't know this one, try a
+ * fallback"). ENOIOCTLCMD gets turned into ENOTTY by the ioctl
+ * code before returning.
+ *
+ * Confused drivers sometimes return EINVAL, which is wrong. It
+ * means "I understood the ioctl command, but the parameters to
+ * it were wrong".
+ *
+ * We should aim to just fix the broken drivers, the EINVAL case
+ * should go away.
+ */
+static inline int is_unrecognized_ioctl(int ret)
+{
+       return  ret == -EINVAL ||
+               ret == -ENOTTY ||
+               ret == -ENOIOCTLCMD;
+}
+
 /*
  * always keep this in sync with compat_blkdev_ioctl()
  */
@@ -196,8 +216,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
                        return -EACCES;
 
                ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
-               /* -EINVAL to handle old uncorrected drivers */
-               if (ret != -EINVAL && ret != -ENOTTY)
+               if (!is_unrecognized_ioctl(ret))
                        return ret;
 
                fsync_bdev(bdev);
@@ -206,8 +225,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 
        case BLKROSET:
                ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
-               /* -EINVAL to handle old uncorrected drivers */
-               if (ret != -EINVAL && ret != -ENOTTY)
+               if (!is_unrecognized_ioctl(ret))
                        return ret;
                if (!capable(CAP_SYS_ADMIN))
                        return -EACCES;
index 251acea..3991502 100644 (file)
@@ -247,6 +247,13 @@ struct sys_device *get_cpu_sysdev(unsigned cpu)
 }
 EXPORT_SYMBOL_GPL(get_cpu_sysdev);
 
+bool cpu_is_hotpluggable(unsigned cpu)
+{
+       struct sys_device *dev = get_cpu_sysdev(cpu);
+       return dev && container_of(dev, struct cpu, sysdev)->hotpluggable;
+}
+EXPORT_SYMBOL_GPL(cpu_is_hotpluggable);
+
 int __init cpu_dev_init(void)
 {
        int err;
index 6035ab8..85da874 100644 (file)
@@ -624,8 +624,8 @@ static struct timer_rand_state input_timer_state;
 static void add_timer_randomness(struct timer_rand_state *state, unsigned num)
 {
        struct {
-               cycles_t cycles;
                long jiffies;
+               unsigned cycles;
                unsigned num;
        } sample;
        long delta, delta2, delta3;
@@ -637,7 +637,11 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num)
                goto out;
 
        sample.jiffies = jiffies;
-       sample.cycles = get_cycles();
+
+       /* Use arch random value, fall back to cycles */
+       if (!arch_get_random_int(&sample.cycles))
+               sample.cycles = get_cycles();
+
        sample.num = num;
        mix_pool_bytes(&input_pool, &sample, sizeof(sample));
 
index effe797..6b5cf02 100644 (file)
@@ -143,7 +143,7 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_SERVERWORKS, PCI_DEVICE_ID_SERVERWORKS_LE,
 #ifndef CONFIG_X86_64
 #include <asm/mach_timer.h>
 #define PMTMR_EXPECTED_RATE \
-  ((CALIBRATE_LATCH * (PMTMR_TICKS_PER_SEC >> 10)) / (CLOCK_TICK_RATE>>10))
+  ((CALIBRATE_LATCH * (PMTMR_TICKS_PER_SEC >> 10)) / (PIT_TICK_RATE>>10))
 /*
  * Some boards have the PMTMR running way too fast. We check
  * the PMTMR rate against PIT channel 2 to catch these cases.
index 27c49e6..e7cab2d 100644 (file)
@@ -53,7 +53,7 @@ static cycle_t i8253_read(struct clocksource *cs)
        count |= inb_p(PIT_CH0) << 8;
 
        /* VIA686a test code... reset the latch if count > max + 1 */
-       if (count > LATCH) {
+       if (count > PIT_LATCH) {
                outb_p(0x34, PIT_MODE);
                outb_p(PIT_LATCH & 0xff, PIT_CH0);
                outb_p(PIT_LATCH >> 8, PIT_CH0);
@@ -114,8 +114,8 @@ static void init_pit_timer(enum clock_event_mode mode,
        case CLOCK_EVT_MODE_PERIODIC:
                /* binary, mode 2, LSB/MSB, ch 0 */
                outb_p(0x34, PIT_MODE);
-               outb_p(LATCH & 0xff , PIT_CH0); /* LSB */
-               outb_p(LATCH >> 8 , PIT_CH0);           /* MSB */
+               outb_p(PIT_LATCH & 0xff , PIT_CH0);     /* LSB */
+               outb_p(PIT_LATCH >> 8 , PIT_CH0);               /* MSB */
                break;
 
        case CLOCK_EVT_MODE_SHUTDOWN:
index 79c47e8..55d0f95 100644 (file)
@@ -59,7 +59,6 @@ static struct clocksource clksrc = {
        .rating         = 200,
        .read           = tc_get_cycles,
        .mask           = CLOCKSOURCE_MASK(32),
-       .shift          = 18,
        .flags          = CLOCK_SOURCE_IS_CONTINUOUS,
 };
 
@@ -256,7 +255,6 @@ static int __init tcb_clksrc_init(void)
                best_divisor_idx = i;
        }
 
-       clksrc.mult = clocksource_hz2mult(divided_rate, clksrc.shift);
 
        printk(bootinfo, clksrc.name, CONFIG_ATMEL_TCB_CLKSRC_BLOCK,
                        divided_rate / 1000000,
@@ -292,7 +290,7 @@ static int __init tcb_clksrc_init(void)
        __raw_writel(ATMEL_TC_SYNC, tcaddr + ATMEL_TC_BCR);
 
        /* and away we go! */
-       clocksource_register(&clksrc);
+       clocksource_register_hz(&clksrc, divided_rate);
 
        /* channel 2:  periodic and oneshot timer support */
        setup_clkevents(tc, clk32k_divisor_idx);
index c97b468..235a340 100644 (file)
@@ -95,27 +95,26 @@ static struct dbs_tuners {
        .freq_step = 5,
 };
 
-static inline cputime64_t get_cpu_idle_time_jiffy(unsigned int cpu,
-                                                       cputime64_t *wall)
+static inline u64 get_cpu_idle_time_jiffy(unsigned int cpu, u64 *wall)
 {
-       cputime64_t idle_time;
-       cputime64_t cur_wall_time;
-       cputime64_t busy_time;
+       u64 idle_time;
+       u64 cur_wall_time;
+       u64 busy_time;
 
        cur_wall_time = jiffies64_to_cputime64(get_jiffies_64());
-       busy_time = cputime64_add(kstat_cpu(cpu).cpustat.user,
-                       kstat_cpu(cpu).cpustat.system);
 
-       busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.irq);
-       busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.softirq);
-       busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.steal);
-       busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.nice);
+       busy_time  = kcpustat_cpu(cpu).cpustat[CPUTIME_USER];
+       busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SYSTEM];
+       busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_IRQ];
+       busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SOFTIRQ];
+       busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL];
+       busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_NICE];
 
-       idle_time = cputime64_sub(cur_wall_time, busy_time);
+       idle_time = cur_wall_time - busy_time;
        if (wall)
-               *wall = (cputime64_t)jiffies_to_usecs(cur_wall_time);
+               *wall = jiffies_to_usecs(cur_wall_time);
 
-       return (cputime64_t)jiffies_to_usecs(idle_time);
+       return jiffies_to_usecs(idle_time);
 }
 
 static inline cputime64_t get_cpu_idle_time(unsigned int cpu, cputime64_t *wall)
@@ -272,7 +271,7 @@ static ssize_t store_ignore_nice_load(struct kobject *a, struct attribute *b,
                dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
                                                &dbs_info->prev_cpu_wall);
                if (dbs_tuners_ins.ignore_nice)
-                       dbs_info->prev_cpu_nice = kstat_cpu(j).cpustat.nice;
+                       dbs_info->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE];
        }
        return count;
 }
@@ -353,20 +352,20 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
 
                cur_idle_time = get_cpu_idle_time(j, &cur_wall_time);
 
-               wall_time = (unsigned int) cputime64_sub(cur_wall_time,
-                               j_dbs_info->prev_cpu_wall);
+               wall_time = (unsigned int)
+                       (cur_wall_time - j_dbs_info->prev_cpu_wall);
                j_dbs_info->prev_cpu_wall = cur_wall_time;
 
-               idle_time = (unsigned int) cputime64_sub(cur_idle_time,
-                               j_dbs_info->prev_cpu_idle);
+               idle_time = (unsigned int)
+                       (cur_idle_time - j_dbs_info->prev_cpu_idle);
                j_dbs_info->prev_cpu_idle = cur_idle_time;
 
                if (dbs_tuners_ins.ignore_nice) {
-                       cputime64_t cur_nice;
+                       u64 cur_nice;
                        unsigned long cur_nice_jiffies;
 
-                       cur_nice = cputime64_sub(kstat_cpu(j).cpustat.nice,
-                                        j_dbs_info->prev_cpu_nice);
+                       cur_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE] -
+                                        j_dbs_info->prev_cpu_nice;
                        /*
                         * Assumption: nice time between sampling periods will
                         * be less than 2^32 jiffies for 32 bit sys
@@ -374,7 +373,7 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
                        cur_nice_jiffies = (unsigned long)
                                        cputime64_to_jiffies64(cur_nice);
 
-                       j_dbs_info->prev_cpu_nice = kstat_cpu(j).cpustat.nice;
+                       j_dbs_info->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE];
                        idle_time += jiffies_to_usecs(cur_nice_jiffies);
                }
 
@@ -501,10 +500,9 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
 
                        j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
                                                &j_dbs_info->prev_cpu_wall);
-                       if (dbs_tuners_ins.ignore_nice) {
+                       if (dbs_tuners_ins.ignore_nice)
                                j_dbs_info->prev_cpu_nice =
-                                               kstat_cpu(j).cpustat.nice;
-                       }
+                                               kcpustat_cpu(j).cpustat[CPUTIME_NICE];
                }
                this_dbs_info->down_skip = 0;
                this_dbs_info->requested_freq = policy->cur;
index fa8af4e..3d679ee 100644 (file)
@@ -119,27 +119,26 @@ static struct dbs_tuners {
        .powersave_bias = 0,
 };
 
-static inline cputime64_t get_cpu_idle_time_jiffy(unsigned int cpu,
-                                                       cputime64_t *wall)
+static inline u64 get_cpu_idle_time_jiffy(unsigned int cpu, u64 *wall)
 {
-       cputime64_t idle_time;
-       cputime64_t cur_wall_time;
-       cputime64_t busy_time;
+       u64 idle_time;
+       u64 cur_wall_time;
+       u64 busy_time;
 
        cur_wall_time = jiffies64_to_cputime64(get_jiffies_64());
-       busy_time = cputime64_add(kstat_cpu(cpu).cpustat.user,
-                       kstat_cpu(cpu).cpustat.system);
 
-       busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.irq);
-       busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.softirq);
-       busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.steal);
-       busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.nice);
+       busy_time  = kcpustat_cpu(cpu).cpustat[CPUTIME_USER];
+       busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SYSTEM];
+       busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_IRQ];
+       busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SOFTIRQ];
+       busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL];
+       busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_NICE];
 
-       idle_time = cputime64_sub(cur_wall_time, busy_time);
+       idle_time = cur_wall_time - busy_time;
        if (wall)
-               *wall = (cputime64_t)jiffies_to_usecs(cur_wall_time);
+               *wall = jiffies_to_usecs(cur_wall_time);
 
-       return (cputime64_t)jiffies_to_usecs(idle_time);
+       return jiffies_to_usecs(idle_time);
 }
 
 static inline cputime64_t get_cpu_idle_time(unsigned int cpu, cputime64_t *wall)
@@ -345,7 +344,7 @@ static ssize_t store_ignore_nice_load(struct kobject *a, struct attribute *b,
                dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
                                                &dbs_info->prev_cpu_wall);
                if (dbs_tuners_ins.ignore_nice)
-                       dbs_info->prev_cpu_nice = kstat_cpu(j).cpustat.nice;
+                       dbs_info->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE];
 
        }
        return count;
@@ -442,24 +441,24 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
                cur_idle_time = get_cpu_idle_time(j, &cur_wall_time);
                cur_iowait_time = get_cpu_iowait_time(j, &cur_wall_time);
 
-               wall_time = (unsigned int) cputime64_sub(cur_wall_time,
-                               j_dbs_info->prev_cpu_wall);
+               wall_time = (unsigned int)
+                       (cur_wall_time - j_dbs_info->prev_cpu_wall);
                j_dbs_info->prev_cpu_wall = cur_wall_time;
 
-               idle_time = (unsigned int) cputime64_sub(cur_idle_time,
-                               j_dbs_info->prev_cpu_idle);
+               idle_time = (unsigned int)
+                       (cur_idle_time - j_dbs_info->prev_cpu_idle);
                j_dbs_info->prev_cpu_idle = cur_idle_time;
 
-               iowait_time = (unsigned int) cputime64_sub(cur_iowait_time,
-                               j_dbs_info->prev_cpu_iowait);
+               iowait_time = (unsigned int)
+                       (cur_iowait_time - j_dbs_info->prev_cpu_iowait);
                j_dbs_info->prev_cpu_iowait = cur_iowait_time;
 
                if (dbs_tuners_ins.ignore_nice) {
-                       cputime64_t cur_nice;
+                       u64 cur_nice;
                        unsigned long cur_nice_jiffies;
 
-                       cur_nice = cputime64_sub(kstat_cpu(j).cpustat.nice,
-                                        j_dbs_info->prev_cpu_nice);
+                       cur_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE] -
+                                        j_dbs_info->prev_cpu_nice;
                        /*
                         * Assumption: nice time between sampling periods will
                         * be less than 2^32 jiffies for 32 bit sys
@@ -467,7 +466,7 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
                        cur_nice_jiffies = (unsigned long)
                                        cputime64_to_jiffies64(cur_nice);
 
-                       j_dbs_info->prev_cpu_nice = kstat_cpu(j).cpustat.nice;
+                       j_dbs_info->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE];
                        idle_time += jiffies_to_usecs(cur_nice_jiffies);
                }
 
@@ -646,10 +645,9 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
 
                        j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
                                                &j_dbs_info->prev_cpu_wall);
-                       if (dbs_tuners_ins.ignore_nice) {
+                       if (dbs_tuners_ins.ignore_nice)
                                j_dbs_info->prev_cpu_nice =
-                                               kstat_cpu(j).cpustat.nice;
-                       }
+                                               kcpustat_cpu(j).cpustat[CPUTIME_NICE];
                }
                this_dbs_info->cpu = cpu;
                this_dbs_info->rate_mult = 1;
index c5072a9..2a508ed 100644 (file)
@@ -61,9 +61,8 @@ static int cpufreq_stats_update(unsigned int cpu)
        spin_lock(&cpufreq_stats_lock);
        stat = per_cpu(cpufreq_stats_table, cpu);
        if (stat->time_in_state)
-               stat->time_in_state[stat->last_index] =
-                       cputime64_add(stat->time_in_state[stat->last_index],
-                                     cputime_sub(cur_time, stat->last_time));
+               stat->time_in_state[stat->last_index] +=
+                       cur_time - stat->last_time;
        stat->last_time = cur_time;
        spin_unlock(&cpufreq_stats_lock);
        return 0;
index 70ad892..8568d9b 100644 (file)
@@ -2234,7 +2234,7 @@ static void i7core_unregister_mci(struct i7core_dev *i7core_dev)
        if (pvt->enable_scrub)
                disable_sdram_scrub_setting(mci);
 
-       atomic_notifier_chain_unregister(&x86_mce_decoder_chain, &i7_mce_dec);
+       mce_unregister_decode_chain(&i7_mce_dec);
 
        /* Disable EDAC polling */
        i7core_pci_ctl_release(pvt);
@@ -2336,7 +2336,7 @@ static int i7core_register_mci(struct i7core_dev *i7core_dev)
        /* DCLK for scrub rate setting */
        pvt->dclk_freq = get_dclk_freq();
 
-       atomic_notifier_chain_register(&x86_mce_decoder_chain, &i7_mce_dec);
+       mce_register_decode_chain(&i7_mce_dec);
 
        return 0;
 
index d0864d9..bd926ea 100644 (file)
@@ -884,7 +884,7 @@ static int __init mce_amd_init(void)
 
        pr_info("MCE: In-kernel MCE decoding enabled.\n");
 
-       atomic_notifier_chain_register(&x86_mce_decoder_chain, &amd_mce_dec_nb);
+       mce_register_decode_chain(&amd_mce_dec_nb);
 
        return 0;
 }
@@ -893,7 +893,7 @@ early_initcall(mce_amd_init);
 #ifdef MODULE
 static void __exit mce_amd_exit(void)
 {
-       atomic_notifier_chain_unregister(&x86_mce_decoder_chain, &amd_mce_dec_nb);
+       mce_unregister_decode_chain(&amd_mce_dec_nb);
        kfree(fam_ops);
 }
 
index 7a402bf..1dc118d 100644 (file)
@@ -1609,11 +1609,9 @@ static int sbridge_mce_check_error(struct notifier_block *nb, unsigned long val,
                mce->cpuvendor, mce->cpuid, mce->time,
                mce->socketid, mce->apicid);
 
-#ifdef CONFIG_SMP
        /* Only handle if it is the right mc controller */
        if (cpu_data(mce->cpu).phys_proc_id != pvt->sbridge_dev->mc)
                return NOTIFY_DONE;
-#endif
 
        smp_rmb();
        if ((pvt->mce_out + 1) % MCE_LOG_LEN == pvt->mce_in) {
@@ -1661,8 +1659,7 @@ static void sbridge_unregister_mci(struct sbridge_dev *sbridge_dev)
        debugf0("MC: " __FILE__ ": %s(): mci = %p, dev = %p\n",
                __func__, mci, &sbridge_dev->pdev[0]->dev);
 
-       atomic_notifier_chain_unregister(&x86_mce_decoder_chain,
-                                        &sbridge_mce_dec);
+       mce_unregister_decode_chain(&sbridge_mce_dec);
 
        /* Remove MC sysfs nodes */
        edac_mc_del_mc(mci->dev);
@@ -1731,8 +1728,7 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev)
                goto fail0;
        }
 
-       atomic_notifier_chain_register(&x86_mce_decoder_chain,
-                                      &sbridge_mce_dec);
+       mce_register_decode_chain(&sbridge_mce_dec);
        return 0;
 
 fail0:
index 104b376..1fdef88 100644 (file)
@@ -57,16 +57,15 @@ MODULE_PARM_DESC(tjmax, "TjMax value in degrees Celsius");
 #define TOTAL_ATTRS            (MAX_CORE_ATTRS + 1)
 #define MAX_CORE_DATA          (NUM_REAL_CORES + BASE_SYSFS_ATTR_NO)
 
-#ifdef CONFIG_SMP
 #define TO_PHYS_ID(cpu)                cpu_data(cpu).phys_proc_id
 #define TO_CORE_ID(cpu)                cpu_data(cpu).cpu_core_id
+#define TO_ATTR_NO(cpu)                (TO_CORE_ID(cpu) + BASE_SYSFS_ATTR_NO)
+
+#ifdef CONFIG_SMP
 #define for_each_sibling(i, cpu)       for_each_cpu(i, cpu_sibling_mask(cpu))
 #else
-#define TO_PHYS_ID(cpu)                (cpu)
-#define TO_CORE_ID(cpu)                (cpu)
 #define for_each_sibling(i, cpu)       for (i = 0; false; )
 #endif
-#define TO_ATTR_NO(cpu)                (TO_CORE_ID(cpu) + BASE_SYSFS_ATTR_NO)
 
 /*
  * Per-Core Temperature Data
index bdc447f..31053a9 100644 (file)
@@ -41,6 +41,7 @@
 #include <linux/tboot.h>
 #include <linux/dmi.h>
 #include <linux/pci-ats.h>
+#include <linux/memblock.h>
 #include <asm/cacheflush.h>
 #include <asm/iommu.h>
 
@@ -2188,18 +2189,6 @@ static inline void iommu_prepare_isa(void)
 
 static int md_domain_init(struct dmar_domain *domain, int guest_width);
 
-static int __init si_domain_work_fn(unsigned long start_pfn,
-                                   unsigned long end_pfn, void *datax)
-{
-       int *ret = datax;
-
-       *ret = iommu_domain_identity_map(si_domain,
-                                        (uint64_t)start_pfn << PAGE_SHIFT,
-                                        (uint64_t)end_pfn << PAGE_SHIFT);
-       return *ret;
-
-}
-
 static int __init si_domain_init(int hw)
 {
        struct dmar_drhd_unit *drhd;
@@ -2231,9 +2220,15 @@ static int __init si_domain_init(int hw)
                return 0;
 
        for_each_online_node(nid) {
-               work_with_active_regions(nid, si_domain_work_fn, &ret);
-               if (ret)
-                       return ret;
+               unsigned long start_pfn, end_pfn;
+               int i;
+
+               for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
+                       ret = iommu_domain_identity_map(si_domain,
+                                       PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
+                       if (ret)
+                               return ret;
+               }
        }
 
        return 0;
index 65af42f..3980903 100644 (file)
@@ -697,7 +697,7 @@ void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start)
         * interrupts are enabled.  We always leave interrupts enabled while
         * running the Guest.
         */
-       regs->eflags = X86_EFLAGS_IF | 0x2;
+       regs->eflags = X86_EFLAGS_IF | X86_EFLAGS_BIT1;
 
        /*
         * The "Extended Instruction Pointer" register says where the Guest is
index 2637c13..6dc26b6 100644 (file)
@@ -81,13 +81,13 @@ static int rackmeter_ignore_nice;
  */
 static inline cputime64_t get_cpu_idle_time(unsigned int cpu)
 {
-       cputime64_t retval;
+       u64 retval;
 
-       retval = cputime64_add(kstat_cpu(cpu).cpustat.idle,
-                       kstat_cpu(cpu).cpustat.iowait);
+       retval = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE] +
+                kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT];
 
        if (rackmeter_ignore_nice)
-               retval = cputime64_add(retval, kstat_cpu(cpu).cpustat.nice);
+               retval += kcpustat_cpu(cpu).cpustat[CPUTIME_NICE];
 
        return retval;
 }
@@ -220,13 +220,11 @@ static void rackmeter_do_timer(struct work_struct *work)
        int i, offset, load, cumm, pause;
 
        cur_jiffies = jiffies64_to_cputime64(get_jiffies_64());
-       total_ticks = (unsigned int)cputime64_sub(cur_jiffies,
-                                                 rcpu->prev_wall);
+       total_ticks = (unsigned int) (cur_jiffies - rcpu->prev_wall);
        rcpu->prev_wall = cur_jiffies;
 
        total_idle_ticks = get_cpu_idle_time(cpu);
-       idle_ticks = (unsigned int) cputime64_sub(total_idle_ticks,
-                               rcpu->prev_idle);
+       idle_ticks = (unsigned int) (total_idle_ticks - rcpu->prev_idle);
        rcpu->prev_idle = total_idle_ticks;
 
        /* We do a very dumb calculation to update the LEDs for now,
diff --git a/drivers/oprofile/nmi_timer_int.c b/drivers/oprofile/nmi_timer_int.c
new file mode 100644 (file)
index 0000000..76f1c93
--- /dev/null
@@ -0,0 +1,173 @@
+/**
+ * @file nmi_timer_int.c
+ *
+ * @remark Copyright 2011 Advanced Micro Devices, Inc.
+ *
+ * @author Robert Richter <robert.richter@amd.com>
+ */
+
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <linux/errno.h>
+#include <linux/oprofile.h>
+#include <linux/perf_event.h>
+
+#ifdef CONFIG_OPROFILE_NMI_TIMER
+
+static DEFINE_PER_CPU(struct perf_event *, nmi_timer_events);
+static int ctr_running;
+
+static struct perf_event_attr nmi_timer_attr = {
+       .type           = PERF_TYPE_HARDWARE,
+       .config         = PERF_COUNT_HW_CPU_CYCLES,
+       .size           = sizeof(struct perf_event_attr),
+       .pinned         = 1,
+       .disabled       = 1,
+};
+
+static void nmi_timer_callback(struct perf_event *event,
+                              struct perf_sample_data *data,
+                              struct pt_regs *regs)
+{
+       event->hw.interrupts = 0;       /* don't throttle interrupts */
+       oprofile_add_sample(regs, 0);
+}
+
+static int nmi_timer_start_cpu(int cpu)
+{
+       struct perf_event *event = per_cpu(nmi_timer_events, cpu);
+
+       if (!event) {
+               event = perf_event_create_kernel_counter(&nmi_timer_attr, cpu, NULL,
+                                                        nmi_timer_callback, NULL);
+               if (IS_ERR(event))
+                       return PTR_ERR(event);
+               per_cpu(nmi_timer_events, cpu) = event;
+       }
+
+       if (event && ctr_running)
+               perf_event_enable(event);
+
+       return 0;
+}
+
+static void nmi_timer_stop_cpu(int cpu)
+{
+       struct perf_event *event = per_cpu(nmi_timer_events, cpu);
+
+       if (event && ctr_running)
+               perf_event_disable(event);
+}
+
+static int nmi_timer_cpu_notifier(struct notifier_block *b, unsigned long action,
+                                 void *data)
+{
+       int cpu = (unsigned long)data;
+       switch (action) {
+       case CPU_DOWN_FAILED:
+       case CPU_ONLINE:
+               nmi_timer_start_cpu(cpu);
+               break;
+       case CPU_DOWN_PREPARE:
+               nmi_timer_stop_cpu(cpu);
+               break;
+       }
+       return NOTIFY_DONE;
+}
+
+static struct notifier_block nmi_timer_cpu_nb = {
+       .notifier_call = nmi_timer_cpu_notifier
+};
+
+static int nmi_timer_start(void)
+{
+       int cpu;
+
+       get_online_cpus();
+       ctr_running = 1;
+       for_each_online_cpu(cpu)
+               nmi_timer_start_cpu(cpu);
+       put_online_cpus();
+
+       return 0;
+}
+
+static void nmi_timer_stop(void)
+{
+       int cpu;
+
+       get_online_cpus();
+       for_each_online_cpu(cpu)
+               nmi_timer_stop_cpu(cpu);
+       ctr_running = 0;
+       put_online_cpus();
+}
+
+static void nmi_timer_shutdown(void)
+{
+       struct perf_event *event;
+       int cpu;
+
+       get_online_cpus();
+       unregister_cpu_notifier(&nmi_timer_cpu_nb);
+       for_each_possible_cpu(cpu) {
+               event = per_cpu(nmi_timer_events, cpu);
+               if (!event)
+                       continue;
+               perf_event_disable(event);
+               per_cpu(nmi_timer_events, cpu) = NULL;
+               perf_event_release_kernel(event);
+       }
+
+       put_online_cpus();
+}
+
+static int nmi_timer_setup(void)
+{
+       int cpu, err;
+       u64 period;
+
+       /* clock cycles per tick: */
+       period = (u64)cpu_khz * 1000;
+       do_div(period, HZ);
+       nmi_timer_attr.sample_period = period;
+
+       get_online_cpus();
+       err = register_cpu_notifier(&nmi_timer_cpu_nb);
+       if (err)
+               goto out;
+       /* can't attach events to offline cpus: */
+       for_each_online_cpu(cpu) {
+               err = nmi_timer_start_cpu(cpu);
+               if (err)
+                       break;
+       }
+       if (err)
+               nmi_timer_shutdown();
+out:
+       put_online_cpus();
+       return err;
+}
+
+int __init op_nmi_timer_init(struct oprofile_operations *ops)
+{
+       int err = 0;
+
+       err = nmi_timer_setup();
+       if (err)
+               return err;
+       nmi_timer_shutdown();           /* only check, don't alloc */
+
+       ops->create_files       = NULL;
+       ops->setup              = nmi_timer_setup;
+       ops->shutdown           = nmi_timer_shutdown;
+       ops->start              = nmi_timer_start;
+       ops->stop               = nmi_timer_stop;
+       ops->cpu_type           = "timer";
+
+       printk(KERN_INFO "oprofile: using NMI timer interrupt.\n");
+
+       return 0;
+}
+
+#endif
index f8c752e..ed2c3ec 100644 (file)
@@ -246,37 +246,31 @@ static int __init oprofile_init(void)
        int err;
 
        /* always init architecture to setup backtrace support */
+       timer_mode = 0;
        err = oprofile_arch_init(&oprofile_ops);
+       if (!err) {
+               if (!timer && !oprofilefs_register())
+                       return 0;
+               oprofile_arch_exit();
+       }
 
-       timer_mode = err || timer;      /* fall back to timer mode on errors */
-       if (timer_mode) {
-               if (!err)
-                       oprofile_arch_exit();
+       /* setup timer mode: */
+       timer_mode = 1;
+       /* no nmi timer mode if oprofile.timer is set */
+       if (timer || op_nmi_timer_init(&oprofile_ops)) {
                err = oprofile_timer_init(&oprofile_ops);
                if (err)
                        return err;
        }
 
-       err = oprofilefs_register();
-       if (!err)
-               return 0;
-
-       /* failed */
-       if (timer_mode)
-               oprofile_timer_exit();
-       else
-               oprofile_arch_exit();
-
-       return err;
+       return oprofilefs_register();
 }
 
 
 static void __exit oprofile_exit(void)
 {
        oprofilefs_unregister();
-       if (timer_mode)
-               oprofile_timer_exit();
-       else
+       if (!timer_mode)
                oprofile_arch_exit();
 }
 
index 177b73d..d32ef81 100644 (file)
@@ -35,7 +35,15 @@ struct dentry;
 
 void oprofile_create_files(struct super_block *sb, struct dentry *root);
 int oprofile_timer_init(struct oprofile_operations *ops);
-void oprofile_timer_exit(void);
+#ifdef CONFIG_OPROFILE_NMI_TIMER
+int op_nmi_timer_init(struct oprofile_operations *ops);
+#else
+static inline int op_nmi_timer_init(struct oprofile_operations *ops)
+{
+       return -ENODEV;
+}
+#endif
+
 
 int oprofile_set_ulong(unsigned long *addr, unsigned long val);
 int oprofile_set_timeout(unsigned long time);
index 878fba1..93404f7 100644 (file)
@@ -97,24 +97,24 @@ static struct notifier_block __refdata oprofile_cpu_notifier = {
        .notifier_call = oprofile_cpu_notify,
 };
 
-int oprofile_timer_init(struct oprofile_operations *ops)
+static int oprofile_hrtimer_setup(void)
 {
-       int rc;
-
-       rc = register_hotcpu_notifier(&oprofile_cpu_notifier);
-       if (rc)
-               return rc;
-       ops->create_files = NULL;
-       ops->setup = NULL;
-       ops->shutdown = NULL;
-       ops->start = oprofile_hrtimer_start;
-       ops->stop = oprofile_hrtimer_stop;
-       ops->cpu_type = "timer";
-       printk(KERN_INFO "oprofile: using timer interrupt.\n");
-       return 0;
+       return register_hotcpu_notifier(&oprofile_cpu_notifier);
 }
 
-void oprofile_timer_exit(void)
+static void oprofile_hrtimer_shutdown(void)
 {
        unregister_hotcpu_notifier(&oprofile_cpu_notifier);
 }
+
+int oprofile_timer_init(struct oprofile_operations *ops)
+{
+       ops->create_files       = NULL;
+       ops->setup              = oprofile_hrtimer_setup;
+       ops->shutdown           = oprofile_hrtimer_shutdown;
+       ops->start              = oprofile_hrtimer_start;
+       ops->stop               = oprofile_hrtimer_stop;
+       ops->cpu_type           = "timer";
+       printk(KERN_INFO "oprofile: using timer interrupt.\n");
+       return 0;
+}
index f02b523..37856f7 100644 (file)
@@ -98,11 +98,11 @@ config PCI_PASID
          If unsure, say N.
 
 config PCI_IOAPIC
-       bool
+       tristate "PCI IO-APIC hotplug support" if X86
        depends on PCI
        depends on ACPI
        depends on HOTPLUG
-       default y
+       default !X86
 
 config PCI_LABEL
        def_bool y if (DMI || ACPI)
index 5775638..205af8d 100644 (file)
@@ -17,7 +17,7 @@
  */
 
 #include <linux/pci.h>
-#include <linux/export.h>
+#include <linux/module.h>
 #include <linux/acpi.h>
 #include <linux/slab.h>
 #include <acpi/acpi_bus.h>
@@ -27,7 +27,7 @@ struct ioapic {
        u32             gsi_base;
 };
 
-static int ioapic_probe(struct pci_dev *dev, const struct pci_device_id *ent)
+static int __devinit ioapic_probe(struct pci_dev *dev, const struct pci_device_id *ent)
 {
        acpi_handle handle;
        acpi_status status;
@@ -88,7 +88,7 @@ exit_free:
        return -ENODEV;
 }
 
-static void ioapic_remove(struct pci_dev *dev)
+static void __devexit ioapic_remove(struct pci_dev *dev)
 {
        struct ioapic *ioapic = pci_get_drvdata(dev);
 
@@ -99,13 +99,12 @@ static void ioapic_remove(struct pci_dev *dev)
 }
 
 
-static struct pci_device_id ioapic_devices[] = {
-       { PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID,
-         PCI_CLASS_SYSTEM_PIC_IOAPIC << 8, 0xffff00, },
-       { PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID,
-         PCI_CLASS_SYSTEM_PIC_IOXAPIC << 8, 0xffff00, },
+static DEFINE_PCI_DEVICE_TABLE(ioapic_devices) = {
+       { PCI_DEVICE_CLASS(PCI_CLASS_SYSTEM_PIC_IOAPIC, ~0) },
+       { PCI_DEVICE_CLASS(PCI_CLASS_SYSTEM_PIC_IOXAPIC, ~0) },
        { }
 };
+MODULE_DEVICE_TABLE(pci, ioapic_devices);
 
 static struct pci_driver ioapic_driver = {
        .name           = "ioapic",
index 51352de..a10e428 100644 (file)
@@ -1506,35 +1506,6 @@ static long do_ioctl_trans(int fd, unsigned int cmd,
        return -ENOIOCTLCMD;
 }
 
-static void compat_ioctl_error(struct file *filp, unsigned int fd,
-               unsigned int cmd, unsigned long arg)
-{
-       char buf[10];
-       char *fn = "?";
-       char *path;
-
-       /* find the name of the device. */
-       path = (char *)__get_free_page(GFP_KERNEL);
-       if (path) {
-               fn = d_path(&filp->f_path, path, PAGE_SIZE);
-               if (IS_ERR(fn))
-                       fn = "?";
-       }
-
-        sprintf(buf,"'%c'", (cmd>>_IOC_TYPESHIFT) & _IOC_TYPEMASK);
-       if (!isprint(buf[1]))
-               sprintf(buf, "%02x", buf[1]);
-       compat_printk("ioctl32(%s:%d): Unknown cmd fd(%d) "
-                       "cmd(%08x){t:%s;sz:%u} arg(%08x) on %s\n",
-                       current->comm, current->pid,
-                       (int)fd, (unsigned int)cmd, buf,
-                       (cmd >> _IOC_SIZESHIFT) & _IOC_SIZEMASK,
-                       (unsigned int)arg, fn);
-
-       if (path)
-               free_page((unsigned long)path);
-}
-
 static int compat_ioctl_check_table(unsigned int xcmd)
 {
        int i;
@@ -1621,13 +1592,8 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
                goto found_handler;
 
        error = do_ioctl_trans(fd, cmd, arg, filp);
-       if (error == -ENOIOCTLCMD) {
-               static int count;
-
-               if (++count <= 50)
-                       compat_ioctl_error(filp, fd, cmd, arg);
-               error = -EINVAL;
-       }
+       if (error == -ENOIOCTLCMD)
+               error = -ENOTTY;
 
        goto out_fput;
 
index 1d9b9fc..066836e 100644 (file)
@@ -42,7 +42,7 @@ static long vfs_ioctl(struct file *filp, unsigned int cmd,
 
        error = filp->f_op->unlocked_ioctl(filp, cmd, arg);
        if (error == -ENOIOCTLCMD)
-               error = -EINVAL;
+               error = -ENOTTY;
  out:
        return error;
 }
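
Illustrative note, not part of the patch above: with both the compat path and vfs_ioctl() mapping -ENOIOCTLCMD to -ENOTTY (and the rate-limited "Unknown cmd" logging removed), userspace now sees ENOTTY rather than EINVAL for requests a driver does not implement. A small userspace sketch; the request number is made up for the example:

    #include <errno.h>
    #include <stdio.h>
    #include <sys/ioctl.h>

    /* Hypothetical request number, purely for illustration. */
    #define SOME_UNSUPPORTED_IOCTL 0x7fff

    static void probe(int fd)
    {
            if (ioctl(fd, SOME_UNSUPPORTED_IOCTL, 0) < 0) {
                    if (errno == ENOTTY)    /* was EINVAL before this change */
                            fprintf(stderr, "ioctl not supported by driver\n");
                    else
                            perror("ioctl");
            }
    }
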
index 3a1dafd..8c344f0 100644 (file)
@@ -394,8 +394,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 
        sigemptyset(&sigign);
        sigemptyset(&sigcatch);
-       cutime = cstime = utime = stime = cputime_zero;
-       cgtime = gtime = cputime_zero;
+       cutime = cstime = utime = stime = 0;
+       cgtime = gtime = 0;
 
        if (lock_task_sighand(task, &flags)) {
                struct signal_struct *sig = task->signal;
@@ -423,14 +423,14 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
                        do {
                                min_flt += t->min_flt;
                                maj_flt += t->maj_flt;
-                               gtime = cputime_add(gtime, t->gtime);
+                               gtime += t->gtime;
                                t = next_thread(t);
                        } while (t != task);
 
                        min_flt += sig->min_flt;
                        maj_flt += sig->maj_flt;
                        thread_group_times(task, &utime, &stime);
-                       gtime = cputime_add(gtime, sig->gtime);
+                       gtime += sig->gtime;
                }
 
                sid = task_session_nr_ns(task, ns);
index 0855e6f..d76ca6a 100644 (file)
 #define arch_idle_time(cpu) 0
 #endif
 
-static cputime64_t get_idle_time(int cpu)
+static u64 get_idle_time(int cpu)
 {
-       u64 idle_time = get_cpu_idle_time_us(cpu, NULL);
-       cputime64_t idle;
+       u64 idle, idle_time = get_cpu_idle_time_us(cpu, NULL);
 
        if (idle_time == -1ULL) {
                /* !NO_HZ so we can rely on cpustat.idle */
-               idle = kstat_cpu(cpu).cpustat.idle;
-               idle = cputime64_add(idle, arch_idle_time(cpu));
+               idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE];
+               idle += arch_idle_time(cpu);
        } else
                idle = usecs_to_cputime64(idle_time);
 
        return idle;
 }
 
-static cputime64_t get_iowait_time(int cpu)
+static u64 get_iowait_time(int cpu)
 {
-       u64 iowait_time = get_cpu_iowait_time_us(cpu, NULL);
-       cputime64_t iowait;
+       u64 iowait, iowait_time = get_cpu_iowait_time_us(cpu, NULL);
 
        if (iowait_time == -1ULL)
                /* !NO_HZ so we can rely on cpustat.iowait */
-               iowait = kstat_cpu(cpu).cpustat.iowait;
+               iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT];
        else
                iowait = usecs_to_cputime64(iowait_time);
 
@@ -55,33 +53,30 @@ static int show_stat(struct seq_file *p, void *v)
 {
        int i, j;
        unsigned long jif;
-       cputime64_t user, nice, system, idle, iowait, irq, softirq, steal;
-       cputime64_t guest, guest_nice;
+       u64 user, nice, system, idle, iowait, irq, softirq, steal;
+       u64 guest, guest_nice;
        u64 sum = 0;
        u64 sum_softirq = 0;
        unsigned int per_softirq_sums[NR_SOFTIRQS] = {0};
        struct timespec boottime;
 
        user = nice = system = idle = iowait =
-               irq = softirq = steal = cputime64_zero;
-       guest = guest_nice = cputime64_zero;
+               irq = softirq = steal = 0;
+       guest = guest_nice = 0;
        getboottime(&boottime);
        jif = boottime.tv_sec;
 
        for_each_possible_cpu(i) {
-               user = cputime64_add(user, kstat_cpu(i).cpustat.user);
-               nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice);
-               system = cputime64_add(system, kstat_cpu(i).cpustat.system);
-               idle = cputime64_add(idle, get_idle_time(i));
-               iowait = cputime64_add(iowait, get_iowait_time(i));
-               irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq);
-               softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
-               steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
-               guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
-               guest_nice = cputime64_add(guest_nice,
-                       kstat_cpu(i).cpustat.guest_nice);
-               sum += kstat_cpu_irqs_sum(i);
-               sum += arch_irq_stat_cpu(i);
+               user += kcpustat_cpu(i).cpustat[CPUTIME_USER];
+               nice += kcpustat_cpu(i).cpustat[CPUTIME_NICE];
+               system += kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM];
+               idle += get_idle_time(i);
+               iowait += get_iowait_time(i);
+               irq += kcpustat_cpu(i).cpustat[CPUTIME_IRQ];
+               softirq += kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ];
+               steal += kcpustat_cpu(i).cpustat[CPUTIME_STEAL];
+               guest += kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
+               guest_nice += kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
 
                for (j = 0; j < NR_SOFTIRQS; j++) {
                        unsigned int softirq_stat = kstat_softirqs_cpu(j, i);
@@ -106,16 +101,16 @@ static int show_stat(struct seq_file *p, void *v)
                (unsigned long long)cputime64_to_clock_t(guest_nice));
        for_each_online_cpu(i) {
                /* Copy values here to work around gcc-2.95.3, gcc-2.96 */
-               user = kstat_cpu(i).cpustat.user;
-               nice = kstat_cpu(i).cpustat.nice;
-               system = kstat_cpu(i).cpustat.system;
+               user = kcpustat_cpu(i).cpustat[CPUTIME_USER];
+               nice = kcpustat_cpu(i).cpustat[CPUTIME_NICE];
+               system = kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM];
                idle = get_idle_time(i);
                iowait = get_iowait_time(i);
-               irq = kstat_cpu(i).cpustat.irq;
-               softirq = kstat_cpu(i).cpustat.softirq;
-               steal = kstat_cpu(i).cpustat.steal;
-               guest = kstat_cpu(i).cpustat.guest;
-               guest_nice = kstat_cpu(i).cpustat.guest_nice;
+               irq = kcpustat_cpu(i).cpustat[CPUTIME_IRQ];
+               softirq = kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ];
+               steal = kcpustat_cpu(i).cpustat[CPUTIME_STEAL];
+               guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
+               guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
                seq_printf(p,
                        "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu "
                        "%llu\n",
index 766b1d4..9610ac7 100644 (file)
@@ -11,15 +11,20 @@ static int uptime_proc_show(struct seq_file *m, void *v)
 {
        struct timespec uptime;
        struct timespec idle;
+       u64 idletime;
+       u64 nsec;
+       u32 rem;
        int i;
-       cputime_t idletime = cputime_zero;
 
+       idletime = 0;
        for_each_possible_cpu(i)
-               idletime = cputime64_add(idletime, kstat_cpu(i).cpustat.idle);
+               idletime += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE];
 
        do_posix_clock_monotonic_gettime(&uptime);
        monotonic_to_bootbased(&uptime);
-       cputime_to_timespec(idletime, &idle);
+       nsec = cputime64_to_jiffies64(idletime) * TICK_NSEC;
+       idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem);
+       idle.tv_nsec = rem;
        seq_printf(m, "%lu.%02lu %lu.%02lu\n",
                        (unsigned long) uptime.tv_sec,
                        (uptime.tv_nsec / (NSEC_PER_SEC / 100)),
index 12a1764..9a62937 100644 (file)
@@ -4,71 +4,66 @@
 #include <linux/time.h>
 #include <linux/jiffies.h>
 
-typedef unsigned long cputime_t;
+typedef unsigned long __nocast cputime_t;
 
-#define cputime_zero                   (0UL)
 #define cputime_one_jiffy              jiffies_to_cputime(1)
-#define cputime_max                    ((~0UL >> 1) - 1)
-#define cputime_add(__a, __b)          ((__a) +  (__b))
-#define cputime_sub(__a, __b)          ((__a) -  (__b))
-#define cputime_div(__a, __n)          ((__a) /  (__n))
-#define cputime_halve(__a)             ((__a) >> 1)
-#define cputime_eq(__a, __b)           ((__a) == (__b))
-#define cputime_gt(__a, __b)           ((__a) >  (__b))
-#define cputime_ge(__a, __b)           ((__a) >= (__b))
-#define cputime_lt(__a, __b)           ((__a) <  (__b))
-#define cputime_le(__a, __b)           ((__a) <= (__b))
-#define cputime_to_jiffies(__ct)       (__ct)
+#define cputime_to_jiffies(__ct)       (__force unsigned long)(__ct)
 #define cputime_to_scaled(__ct)                (__ct)
-#define jiffies_to_cputime(__hz)       (__hz)
+#define jiffies_to_cputime(__hz)       (__force cputime_t)(__hz)
 
-typedef u64 cputime64_t;
+typedef u64 __nocast cputime64_t;
 
-#define cputime64_zero (0ULL)
-#define cputime64_add(__a, __b)                ((__a) + (__b))
-#define cputime64_sub(__a, __b)                ((__a) - (__b))
-#define cputime64_to_jiffies64(__ct)   (__ct)
-#define jiffies64_to_cputime64(__jif)  (__jif)
-#define cputime_to_cputime64(__ct)     ((u64) __ct)
-#define cputime64_gt(__a, __b)         ((__a) >  (__b))
+#define cputime64_to_jiffies64(__ct)   (__force u64)(__ct)
+#define jiffies64_to_cputime64(__jif)  (__force cputime64_t)(__jif)
 
-#define nsecs_to_cputime64(__ct)       nsecs_to_jiffies64(__ct)
+#define nsecs_to_cputime64(__ct)       \
+       jiffies64_to_cputime64(nsecs_to_jiffies64(__ct))
 
 
 /*
  * Convert cputime to microseconds and back.
  */
-#define cputime_to_usecs(__ct)         jiffies_to_usecs(__ct)
-#define usecs_to_cputime(__msecs)      usecs_to_jiffies(__msecs)
-#define usecs_to_cputime64(__msecs)    nsecs_to_jiffies64((__msecs) * 1000)
+#define cputime_to_usecs(__ct)         \
+       jiffies_to_usecs(cputime_to_jiffies(__ct))
+#define usecs_to_cputime(__usec)       \
+       jiffies_to_cputime(usecs_to_jiffies(__usec))
+#define usecs_to_cputime64(__usec)     \
+       jiffies64_to_cputime64(nsecs_to_jiffies64((__usec) * 1000))
 
 /*
  * Convert cputime to seconds and back.
  */
-#define cputime_to_secs(jif)           ((jif) / HZ)
-#define secs_to_cputime(sec)           ((sec) * HZ)
+#define cputime_to_secs(jif)           (cputime_to_jiffies(jif) / HZ)
+#define secs_to_cputime(sec)           jiffies_to_cputime((sec) * HZ)
 
 /*
  * Convert cputime to timespec and back.
  */
-#define timespec_to_cputime(__val)     timespec_to_jiffies(__val)
-#define cputime_to_timespec(__ct,__val)        jiffies_to_timespec(__ct,__val)
+#define timespec_to_cputime(__val)     \
+       jiffies_to_cputime(timespec_to_jiffies(__val))
+#define cputime_to_timespec(__ct,__val)        \
+       jiffies_to_timespec(cputime_to_jiffies(__ct),__val)
 
 /*
  * Convert cputime to timeval and back.
  */
-#define timeval_to_cputime(__val)      timeval_to_jiffies(__val)
-#define cputime_to_timeval(__ct,__val) jiffies_to_timeval(__ct,__val)
+#define timeval_to_cputime(__val)      \
+       jiffies_to_cputime(timeval_to_jiffies(__val))
+#define cputime_to_timeval(__ct,__val) \
+       jiffies_to_timeval(cputime_to_jiffies(__ct),__val)
 
 /*
  * Convert cputime to clock and back.
  */
-#define cputime_to_clock_t(__ct)       jiffies_to_clock_t(__ct)
-#define clock_t_to_cputime(__x)                clock_t_to_jiffies(__x)
+#define cputime_to_clock_t(__ct)       \
+       jiffies_to_clock_t(cputime_to_jiffies(__ct))
+#define clock_t_to_cputime(__x)                \
+       jiffies_to_cputime(clock_t_to_jiffies(__x))
 
 /*
  * Convert cputime64 to clock.
  */
-#define cputime64_to_clock_t(__ct)     jiffies_64_to_clock_t(__ct)
+#define cputime64_to_clock_t(__ct)     \
+       jiffies_64_to_clock_t(cputime64_to_jiffies64(__ct))
 
 #endif
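
Illustrative note, not part of the patch above: cputime_t and cputime64_t are now __nocast types, the cputime_zero/cputime_add/cputime_eq helper zoo is gone, and plain C operators are used instead, while conversions to and from other units still go through the jiffies_to_cputime()/cputime_to_*() helpers (which hide the __force casts). A minimal sketch of the new idiom, assuming the jiffies-based generic header shown here:

    #include <asm/cputime.h>

    static void account_example(cputime_t ut, cputime_t st)
    {
            cputime_t total = 0;                    /* cputime_zero is gone; plain 0 is used */
            u64 clock;

            total += ut;                            /* cputime_add() is gone; plain += is used */
            total += st;

            clock = cputime_to_clock_t(total);      /* unit conversions still use the helpers */
            (void)clock;
    }
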
index a3ef66a..3c1063a 100644 (file)
@@ -22,8 +22,14 @@ extern unsigned long __sw_hweight64(__u64 w);
 #include <asm/bitops.h>
 
 #define for_each_set_bit(bit, addr, size) \
-       for ((bit) = find_first_bit((addr), (size)); \
-            (bit) < (size); \
+       for ((bit) = find_first_bit((addr), (size));            \
+            (bit) < (size);                                    \
+            (bit) = find_next_bit((addr), (size), (bit) + 1))
+
+/* same as for_each_set_bit() but use bit as value to start with */
+#define for_each_set_bit_cont(bit, addr, size) \
+       for ((bit) = find_next_bit((addr), (size), (bit));      \
+            (bit) < (size);                                    \
             (bit) = find_next_bit((addr), (size), (bit) + 1))
 
 static __inline__ int get_bitmask_order(unsigned int count)
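
Illustrative note, not part of the patch above: for_each_set_bit_cont() differs from for_each_set_bit() only in that it starts scanning from the current value of 'bit' rather than from bit 0, so a walk can be suspended and resumed. A sketch under assumed names (the bitmap and the budget logic are invented for the example):

    #include <linux/bitops.h>

    static unsigned long demo_mask = 0xf0f0;        /* example bitmap, one long */
    static unsigned int demo_pos;                   /* remembered position      */

    static void scan_some(unsigned int budget)
    {
            unsigned int bit = demo_pos;            /* resume where the last call stopped */

            for_each_set_bit_cont(bit, &demo_mask, BITS_PER_LONG) {
                    if (!budget--)
                            break;
                    /* ... handle bit ... */
            }

            /* Either BITS_PER_LONG (walk finished) or the next bit to revisit. */
            demo_pos = bit;
    }
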
index ab344a5..66d3e95 100644 (file)
@@ -44,7 +44,7 @@ extern unsigned long init_bootmem_node(pg_data_t *pgdat,
                                       unsigned long endpfn);
 extern unsigned long init_bootmem(unsigned long addr, unsigned long memend);
 
-unsigned long free_all_memory_core_early(int nodeid);
+extern unsigned long free_low_memory_core_early(int nodeid);
 extern unsigned long free_all_bootmem_node(pg_data_t *pgdat);
 extern unsigned long free_all_bootmem(void);
 
index 6cb60fd..305c263 100644 (file)
@@ -27,6 +27,7 @@ struct cpu {
 
 extern int register_cpu(struct cpu *cpu, int num);
 extern struct sys_device *get_cpu_sysdev(unsigned cpu);
+extern bool cpu_is_hotpluggable(unsigned cpu);
 
 extern int cpu_add_sysdev_attr(struct sysdev_attribute *attr);
 extern void cpu_remove_sysdev_attr(struct sysdev_attribute *attr);
index 65970b8..0e5f578 100644 (file)
@@ -46,6 +46,8 @@ struct debug_obj {
  *                     fails
  * @fixup_free:                fixup function, which is called when the free check
  *                     fails
+ * @fixup_assert_init:  fixup function, which is called when the assert_init
+ *                     check fails
  */
 struct debug_obj_descr {
        const char              *name;
@@ -54,6 +56,7 @@ struct debug_obj_descr {
        int (*fixup_activate)   (void *addr, enum debug_obj_state state);
        int (*fixup_destroy)    (void *addr, enum debug_obj_state state);
        int (*fixup_free)       (void *addr, enum debug_obj_state state);
+       int (*fixup_assert_init)(void *addr, enum debug_obj_state state);
 };
 
 #ifdef CONFIG_DEBUG_OBJECTS
@@ -64,6 +67,7 @@ extern void debug_object_activate  (void *addr, struct debug_obj_descr *descr);
 extern void debug_object_deactivate(void *addr, struct debug_obj_descr *descr);
 extern void debug_object_destroy   (void *addr, struct debug_obj_descr *descr);
 extern void debug_object_free      (void *addr, struct debug_obj_descr *descr);
+extern void debug_object_assert_init(void *addr, struct debug_obj_descr *descr);
 
 /*
  * Active state:
@@ -89,6 +93,8 @@ static inline void
 debug_object_destroy   (void *addr, struct debug_obj_descr *descr) { }
 static inline void
 debug_object_free      (void *addr, struct debug_obj_descr *descr) { }
+static inline void
+debug_object_assert_init(void *addr, struct debug_obj_descr *descr) { }
 
 static inline void debug_objects_early_init(void) { }
 static inline void debug_objects_mem_init(void) { }
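
Illustrative note, not part of the patch above: debug_object_assert_init() lets code assert that an object was initialized before it is used, and the new fixup_assert_init() hook gives the subsystem a chance to repair the situation when the check fails. A hedged sketch of how a descriptor might wire it up; the demo_obj type and helpers are hypothetical:

    #include <linux/debugobjects.h>

    struct demo_obj { int ready; };                         /* hypothetical object type */

    static void demo_obj_init(struct demo_obj *obj)         /* hypothetical init helper */
    {
            obj->ready = 1;
    }

    static int demo_fixup_assert_init(void *addr, enum debug_obj_state state)
    {
            if (state == ODEBUG_STATE_NOTAVAILABLE) {
                    /* Object was never initialized: do it now and report "fixed". */
                    demo_obj_init(addr);
                    return 1;
            }
            return 0;
    }

    static struct debug_obj_descr demo_debug_descr = {
            .name                   = "demo_obj",
            .fixup_assert_init      = demo_fixup_assert_init,
    };

    static void demo_obj_use(struct demo_obj *obj)
    {
            debug_object_assert_init(obj, &demo_debug_descr);
            /* ... safe to operate on obj now ... */
    }
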
index f743883..bb7f309 100644 (file)
@@ -139,20 +139,7 @@ static inline void account_system_vtime(struct task_struct *tsk)
 extern void account_system_vtime(struct task_struct *tsk);
 #endif
 
-#if defined(CONFIG_NO_HZ)
 #if defined(CONFIG_TINY_RCU) || defined(CONFIG_TINY_PREEMPT_RCU)
-extern void rcu_enter_nohz(void);
-extern void rcu_exit_nohz(void);
-
-static inline void rcu_irq_enter(void)
-{
-       rcu_exit_nohz();
-}
-
-static inline void rcu_irq_exit(void)
-{
-       rcu_enter_nohz();
-}
 
 static inline void rcu_nmi_enter(void)
 {
@@ -163,17 +150,9 @@ static inline void rcu_nmi_exit(void)
 }
 
 #else
-extern void rcu_irq_enter(void);
-extern void rcu_irq_exit(void);
 extern void rcu_nmi_enter(void);
 extern void rcu_nmi_exit(void);
 #endif
-#else
-# define rcu_irq_enter() do { } while (0)
-# define rcu_irq_exit() do { } while (0)
-# define rcu_nmi_enter() do { } while (0)
-# define rcu_nmi_exit() do { } while (0)
-#endif /* #if defined(CONFIG_NO_HZ) */
 
 /*
  * It is safe to do non-atomic ops on ->hardirq_context,
index 388b0d4..5ce8b14 100644 (file)
@@ -3,6 +3,7 @@
 
 #include <linux/types.h>
 #include <linux/compiler.h>
+#include <linux/workqueue.h>
 
 #if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_JUMP_LABEL)
 
@@ -14,6 +15,12 @@ struct jump_label_key {
 #endif
 };
 
+struct jump_label_key_deferred {
+       struct jump_label_key key;
+       unsigned long timeout;
+       struct delayed_work work;
+};
+
 # include <asm/jump_label.h>
 # define HAVE_JUMP_LABEL
 #endif /* CC_HAVE_ASM_GOTO && CONFIG_JUMP_LABEL */
@@ -51,8 +58,11 @@ extern void arch_jump_label_transform_static(struct jump_entry *entry,
 extern int jump_label_text_reserved(void *start, void *end);
 extern void jump_label_inc(struct jump_label_key *key);
 extern void jump_label_dec(struct jump_label_key *key);
+extern void jump_label_dec_deferred(struct jump_label_key_deferred *key);
 extern bool jump_label_enabled(struct jump_label_key *key);
 extern void jump_label_apply_nops(struct module *mod);
+extern void jump_label_rate_limit(struct jump_label_key_deferred *key,
+               unsigned long rl);
 
 #else  /* !HAVE_JUMP_LABEL */
 
@@ -68,6 +78,10 @@ static __always_inline void jump_label_init(void)
 {
 }
 
+struct jump_label_key_deferred {
+       struct jump_label_key  key;
+};
+
 static __always_inline bool static_branch(struct jump_label_key *key)
 {
        if (unlikely(atomic_read(&key->enabled)))
@@ -85,6 +99,11 @@ static inline void jump_label_dec(struct jump_label_key *key)
        atomic_dec(&key->enabled);
 }
 
+static inline void jump_label_dec_deferred(struct jump_label_key_deferred *key)
+{
+       jump_label_dec(&key->key);
+}
+
 static inline int jump_label_text_reserved(void *start, void *end)
 {
        return 0;
@@ -102,6 +121,14 @@ static inline int jump_label_apply_nops(struct module *mod)
 {
        return 0;
 }
+
+static inline void jump_label_rate_limit(struct jump_label_key_deferred *key,
+               unsigned long rl)
+{
+}
 #endif /* HAVE_JUMP_LABEL */
 
+#define jump_label_key_enabled ((struct jump_label_key){ .enabled = ATOMIC_INIT(1), })
+#define jump_label_key_disabled        ((struct jump_label_key){ .enabled = ATOMIC_INIT(0), })
+
 #endif /* _LINUX_JUMP_LABEL_H */
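
Illustrative note, not part of the patch above: jump_label_key_deferred wraps a jump_label_key with a rate limit, so jump_label_dec_deferred() only flips the branch off after the configured timeout via the embedded delayed work; this avoids thrashing code patching when a key toggles rapidly (perf uses it for perf_sched_events later in this merge). A sketch of the intended usage; the key name and the HZ/4 value are arbitrary examples:

    #include <linux/jump_label.h>
    #include <linux/jiffies.h>

    static struct jump_label_key_deferred demo_key;         /* hypothetical key */

    static void demo_init(void)
    {
            /* Delay the "disable" side by a quarter of a second. */
            jump_label_rate_limit(&demo_key, HZ / 4);
    }

    static void demo_enable(void)
    {
            jump_label_inc(&demo_key.key);                  /* takes effect immediately */
    }

    static void demo_disable(void)
    {
            jump_label_dec_deferred(&demo_key);             /* deferred by the rate limit */
    }

    static bool demo_fast_path(void)
    {
            return static_branch(&demo_key.key);            /* patched branch in the hot path */
    }
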
index 0cce2db..2fbd905 100644 (file)
@@ -6,6 +6,7 @@
 #include <linux/percpu.h>
 #include <linux/cpumask.h>
 #include <linux/interrupt.h>
+#include <linux/sched.h>
 #include <asm/irq.h>
 #include <asm/cputime.h>
 
  * used by rstatd/perfmeter
  */
 
-struct cpu_usage_stat {
-       cputime64_t user;
-       cputime64_t nice;
-       cputime64_t system;
-       cputime64_t softirq;
-       cputime64_t irq;
-       cputime64_t idle;
-       cputime64_t iowait;
-       cputime64_t steal;
-       cputime64_t guest;
-       cputime64_t guest_nice;
+enum cpu_usage_stat {
+       CPUTIME_USER,
+       CPUTIME_NICE,
+       CPUTIME_SYSTEM,
+       CPUTIME_SOFTIRQ,
+       CPUTIME_IRQ,
+       CPUTIME_IDLE,
+       CPUTIME_IOWAIT,
+       CPUTIME_STEAL,
+       CPUTIME_GUEST,
+       CPUTIME_GUEST_NICE,
+       NR_STATS,
+};
+
+struct kernel_cpustat {
+       u64 cpustat[NR_STATS];
 };
 
 struct kernel_stat {
-       struct cpu_usage_stat   cpustat;
 #ifndef CONFIG_GENERIC_HARDIRQS
        unsigned int irqs[NR_IRQS];
 #endif
@@ -38,10 +43,13 @@ struct kernel_stat {
 };
 
 DECLARE_PER_CPU(struct kernel_stat, kstat);
+DECLARE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
 
-#define kstat_cpu(cpu) per_cpu(kstat, cpu)
 /* Must have preemption disabled for this to be meaningful. */
-#define kstat_this_cpu __get_cpu_var(kstat)
+#define kstat_this_cpu (&__get_cpu_var(kstat))
+#define kcpustat_this_cpu (&__get_cpu_var(kernel_cpustat))
+#define kstat_cpu(cpu) per_cpu(kstat, cpu)
+#define kcpustat_cpu(cpu) per_cpu(kernel_cpustat, cpu)
 
 extern unsigned long long nr_context_switches(void);
 
index b0e9989..e23121f 100644 (file)
@@ -10,6 +10,8 @@
 #define _INCLUDE_GUARD_LATENCYTOP_H_
 
 #include <linux/compiler.h>
+struct task_struct;
+
 #ifdef CONFIG_LATENCYTOP
 
 #define LT_SAVECOUNT           32
@@ -23,7 +25,6 @@ struct latency_record {
 };
 
 
-struct task_struct;
 
 extern int latencytop_enabled;
 void __account_scheduler_latency(struct task_struct *task, int usecs, int inter);
index b6a56e3..d36619e 100644 (file)
@@ -343,6 +343,8 @@ extern void lockdep_trace_alloc(gfp_t mask);
 
 #define lockdep_assert_held(l) WARN_ON(debug_locks && !lockdep_is_held(l))
 
+#define lockdep_recursing(tsk) ((tsk)->lockdep_recursion)
+
 #else /* !LOCKDEP */
 
 static inline void lockdep_off(void)
@@ -392,6 +394,8 @@ struct lock_class_key { };
 
 #define lockdep_assert_held(l)                 do { } while (0)
 
+#define lockdep_recursing(tsk)                 (0)
+
 #endif /* !LOCKDEP */
 
 #ifdef CONFIG_LOCK_STAT
index e6b843e..a6bb102 100644 (file)
@@ -2,8 +2,6 @@
 #define _LINUX_MEMBLOCK_H
 #ifdef __KERNEL__
 
-#define MEMBLOCK_ERROR 0
-
 #ifdef CONFIG_HAVE_MEMBLOCK
 /*
  * Logical memory blocks.
 #include <linux/init.h>
 #include <linux/mm.h>
 
-#include <asm/memblock.h>
-
 #define INIT_MEMBLOCK_REGIONS  128
 
 struct memblock_region {
        phys_addr_t base;
        phys_addr_t size;
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+       int nid;
+#endif
 };
 
 struct memblock_type {
        unsigned long cnt;      /* number of regions */
        unsigned long max;      /* size of the allocated array */
+       phys_addr_t total_size; /* size of all regions */
        struct memblock_region *regions;
 };
 
 struct memblock {
        phys_addr_t current_limit;
-       phys_addr_t memory_size;        /* Updated by memblock_analyze() */
        struct memblock_type memory;
        struct memblock_type reserved;
 };
 
 extern struct memblock memblock;
 extern int memblock_debug;
-extern int memblock_can_resize;
 
 #define memblock_dbg(fmt, ...) \
        if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
 
-u64 memblock_find_in_range(u64 start, u64 end, u64 size, u64 align);
+phys_addr_t memblock_find_in_range_node(phys_addr_t start, phys_addr_t end,
+                               phys_addr_t size, phys_addr_t align, int nid);
+phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end,
+                                  phys_addr_t size, phys_addr_t align);
 int memblock_free_reserved_regions(void);
 int memblock_reserve_reserved_regions(void);
 
-extern void memblock_init(void);
-extern void memblock_analyze(void);
-extern long memblock_add(phys_addr_t base, phys_addr_t size);
-extern long memblock_remove(phys_addr_t base, phys_addr_t size);
-extern long memblock_free(phys_addr_t base, phys_addr_t size);
-extern long memblock_reserve(phys_addr_t base, phys_addr_t size);
+void memblock_allow_resize(void);
+int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid);
+int memblock_add(phys_addr_t base, phys_addr_t size);
+int memblock_remove(phys_addr_t base, phys_addr_t size);
+int memblock_free(phys_addr_t base, phys_addr_t size);
+int memblock_reserve(phys_addr_t base, phys_addr_t size);
+
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
+                         unsigned long *out_end_pfn, int *out_nid);
+
+/**
+ * for_each_mem_pfn_range - early memory pfn range iterator
+ * @i: an integer used as loop variable
+ * @nid: node selector, %MAX_NUMNODES for all nodes
+ * @p_start: ptr to ulong for start pfn of the range, can be %NULL
+ * @p_end: ptr to ulong for end pfn of the range, can be %NULL
+ * @p_nid: ptr to int for nid of the range, can be %NULL
+ *
+ * Walks over configured memory ranges.  Available after early_node_map is
+ * populated.
+ */
+#define for_each_mem_pfn_range(i, nid, p_start, p_end, p_nid)          \
+       for (i = -1, __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid); \
+            i >= 0; __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid))
+#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
+
+void __next_free_mem_range(u64 *idx, int nid, phys_addr_t *out_start,
+                          phys_addr_t *out_end, int *out_nid);
+
+/**
+ * for_each_free_mem_range - iterate through free memblock areas
+ * @i: u64 used as loop variable
+ * @nid: node selector, %MAX_NUMNODES for all nodes
+ * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
+ * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
+ * @p_nid: ptr to int for nid of the range, can be %NULL
+ *
+ * Walks over free (memory && !reserved) areas of memblock.  Available as
+ * soon as memblock is initialized.
+ */
+#define for_each_free_mem_range(i, nid, p_start, p_end, p_nid)         \
+       for (i = 0,                                                     \
+            __next_free_mem_range(&i, nid, p_start, p_end, p_nid);     \
+            i != (u64)ULLONG_MAX;                                      \
+            __next_free_mem_range(&i, nid, p_start, p_end, p_nid))
+
+void __next_free_mem_range_rev(u64 *idx, int nid, phys_addr_t *out_start,
+                              phys_addr_t *out_end, int *out_nid);
 
-/* The numa aware allocator is only available if
- * CONFIG_ARCH_POPULATES_NODE_MAP is set
+/**
+ * for_each_free_mem_range_reverse - rev-iterate through free memblock areas
+ * @i: u64 used as loop variable
+ * @nid: node selector, %MAX_NUMNODES for all nodes
+ * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
+ * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
+ * @p_nid: ptr to int for nid of the range, can be %NULL
+ *
+ * Walks over free (memory && !reserved) areas of memblock in reverse
+ * order.  Available as soon as memblock is initialized.
  */
-extern phys_addr_t memblock_alloc_nid(phys_addr_t size, phys_addr_t align,
-                                       int nid);
-extern phys_addr_t memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align,
-                                           int nid);
+#define for_each_free_mem_range_reverse(i, nid, p_start, p_end, p_nid) \
+       for (i = (u64)ULLONG_MAX,                                       \
+            __next_free_mem_range_rev(&i, nid, p_start, p_end, p_nid); \
+            i != (u64)ULLONG_MAX;                                      \
+            __next_free_mem_range_rev(&i, nid, p_start, p_end, p_nid))
 
-extern phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align);
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+int memblock_set_node(phys_addr_t base, phys_addr_t size, int nid);
+
+static inline void memblock_set_region_node(struct memblock_region *r, int nid)
+{
+       r->nid = nid;
+}
+
+static inline int memblock_get_region_node(const struct memblock_region *r)
+{
+       return r->nid;
+}
+#else
+static inline void memblock_set_region_node(struct memblock_region *r, int nid)
+{
+}
+
+static inline int memblock_get_region_node(const struct memblock_region *r)
+{
+       return 0;
+}
+#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
+
+phys_addr_t memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid);
+phys_addr_t memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid);
+
+phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align);
 
 /* Flags for memblock_alloc_base() and __memblock_alloc_base() */
 #define MEMBLOCK_ALLOC_ANYWHERE        (~(phys_addr_t)0)
 #define MEMBLOCK_ALLOC_ACCESSIBLE      0
 
-extern phys_addr_t memblock_alloc_base(phys_addr_t size,
-                                        phys_addr_t align,
-                                        phys_addr_t max_addr);
-extern phys_addr_t __memblock_alloc_base(phys_addr_t size,
-                                          phys_addr_t align,
-                                          phys_addr_t max_addr);
-extern phys_addr_t memblock_phys_mem_size(void);
-extern phys_addr_t memblock_start_of_DRAM(void);
-extern phys_addr_t memblock_end_of_DRAM(void);
-extern void memblock_enforce_memory_limit(phys_addr_t memory_limit);
-extern int memblock_is_memory(phys_addr_t addr);
-extern int memblock_is_region_memory(phys_addr_t base, phys_addr_t size);
-extern int memblock_is_reserved(phys_addr_t addr);
-extern int memblock_is_region_reserved(phys_addr_t base, phys_addr_t size);
-
-extern void memblock_dump_all(void);
-
-/* Provided by the architecture */
-extern phys_addr_t memblock_nid_range(phys_addr_t start, phys_addr_t end, int *nid);
-extern int memblock_memory_can_coalesce(phys_addr_t addr1, phys_addr_t size1,
-                                  phys_addr_t addr2, phys_addr_t size2);
+phys_addr_t memblock_alloc_base(phys_addr_t size, phys_addr_t align,
+                               phys_addr_t max_addr);
+phys_addr_t __memblock_alloc_base(phys_addr_t size, phys_addr_t align,
+                                 phys_addr_t max_addr);
+phys_addr_t memblock_phys_mem_size(void);
+phys_addr_t memblock_start_of_DRAM(void);
+phys_addr_t memblock_end_of_DRAM(void);
+void memblock_enforce_memory_limit(phys_addr_t memory_limit);
+int memblock_is_memory(phys_addr_t addr);
+int memblock_is_region_memory(phys_addr_t base, phys_addr_t size);
+int memblock_is_reserved(phys_addr_t addr);
+int memblock_is_region_reserved(phys_addr_t base, phys_addr_t size);
+
+extern void __memblock_dump_all(void);
+
+static inline void memblock_dump_all(void)
+{
+       if (memblock_debug)
+               __memblock_dump_all();
+}
 
 /**
  * memblock_set_current_limit - Set the current allocation limit to allow
@@ -101,7 +179,7 @@ extern int memblock_memory_can_coalesce(phys_addr_t addr1, phys_addr_t size1,
  *                         accessible during boot
  * @limit: New limit value (physical address)
  */
-extern void memblock_set_current_limit(phys_addr_t limit);
+void memblock_set_current_limit(phys_addr_t limit);
 
 
 /*
@@ -154,9 +232,9 @@ static inline unsigned long memblock_region_reserved_end_pfn(const struct memblo
             region++)
 
 
-#ifdef ARCH_DISCARD_MEMBLOCK
-#define __init_memblock __init
-#define __initdata_memblock __initdata
+#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
+#define __init_memblock __meminit
+#define __initdata_memblock __meminitdata
 #else
 #define __init_memblock
 #define __initdata_memblock
@@ -165,7 +243,7 @@ static inline unsigned long memblock_region_reserved_end_pfn(const struct memblo
 #else
 static inline phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align)
 {
-       return MEMBLOCK_ERROR;
+       return 0;
 }
 
 #endif /* CONFIG_HAVE_MEMBLOCK */
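
Illustrative note, not part of the patch above: with early_node_map gone, memblock carries the node information itself, memblock_init()/memblock_analyze() are replaced by memblock_allow_resize(), and walks over early memory use the new for_each_free_mem_range*() iterators. A sketch of an early-boot walk, assuming CONFIG_HAVE_MEMBLOCK; the printout is only for illustration:

    #include <linux/memblock.h>

    static void __init report_free_ranges(void)
    {
            phys_addr_t start, end;
            int nid;
            u64 i;

            /* MAX_NUMNODES means "all nodes"; any of the out-pointers may be NULL. */
            for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, &nid)
                    pr_info("free: 0x%llx-0x%llx nid %d\n",
                            (unsigned long long)start,
                            (unsigned long long)end, nid);
    }
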
index 4baadd1..5d9b4c9 100644 (file)
@@ -1253,41 +1253,34 @@ static inline void pgtable_page_dtor(struct page *page)
 extern void free_area_init(unsigned long * zones_size);
 extern void free_area_init_node(int nid, unsigned long * zones_size,
                unsigned long zone_start_pfn, unsigned long *zholes_size);
-#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
 /*
- * With CONFIG_ARCH_POPULATES_NODE_MAP set, an architecture may initialise its
+ * With CONFIG_HAVE_MEMBLOCK_NODE_MAP set, an architecture may initialise its
  * zones, allocate the backing mem_map and account for memory holes in a more
  * architecture independent manner. This is a substitute for creating the
  * zone_sizes[] and zholes_size[] arrays and passing them to
  * free_area_init_node()
  *
  * An architecture is expected to register range of page frames backed by
- * physical memory with add_active_range() before calling
+ * physical memory with memblock_add[_node]() before calling
  * free_area_init_nodes() passing in the PFN each zone ends at. At a basic
  * usage, an architecture is expected to do something like
  *
  * unsigned long max_zone_pfns[MAX_NR_ZONES] = {max_dma, max_normal_pfn,
  *                                                      max_highmem_pfn};
  * for_each_valid_physical_page_range()
- *     add_active_range(node_id, start_pfn, end_pfn)
+ *     memblock_add_node(base, size, nid)
  * free_area_init_nodes(max_zone_pfns);
  *
- * If the architecture guarantees that there are no holes in the ranges
- * registered with add_active_range(), free_bootmem_active_regions()
- * will call free_bootmem_node() for each registered physical page range.
- * Similarly sparse_memory_present_with_active_regions() calls
- * memory_present() for each range when SPARSEMEM is enabled.
+ * free_bootmem_with_active_regions() calls free_bootmem_node() for each
+ * registered physical page range.  Similarly
+ * sparse_memory_present_with_active_regions() calls memory_present() for
+ * each range when SPARSEMEM is enabled.
  *
  * See mm/page_alloc.c for more information on each function exposed by
- * CONFIG_ARCH_POPULATES_NODE_MAP
+ * CONFIG_HAVE_MEMBLOCK_NODE_MAP.
  */
 extern void free_area_init_nodes(unsigned long *max_zone_pfn);
-extern void add_active_range(unsigned int nid, unsigned long start_pfn,
-                                       unsigned long end_pfn);
-extern void remove_active_range(unsigned int nid, unsigned long start_pfn,
-                                       unsigned long end_pfn);
-extern void remove_all_active_ranges(void);
-void sort_node_map(void);
 unsigned long node_map_pfn_alignment(void);
 unsigned long __absent_pages_in_range(int nid, unsigned long start_pfn,
                                                unsigned long end_pfn);
@@ -1300,14 +1293,11 @@ extern void free_bootmem_with_active_regions(int nid,
                                                unsigned long max_low_pfn);
 int add_from_early_node_map(struct range *range, int az,
                                   int nr_range, int nid);
-u64 __init find_memory_core_early(int nid, u64 size, u64 align,
-                                       u64 goal, u64 limit);
-typedef int (*work_fn_t)(unsigned long, unsigned long, void *);
-extern void work_with_active_regions(int nid, work_fn_t work_fn, void *data);
 extern void sparse_memory_present_with_active_regions(int nid);
-#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
 
-#if !defined(CONFIG_ARCH_POPULATES_NODE_MAP) && \
+#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
+
+#if !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) && \
     !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID)
 static inline int __early_pfn_to_nid(unsigned long pfn)
 {
index 188cb2f..3ac040f 100644 (file)
@@ -598,13 +598,13 @@ struct zonelist {
 #endif
 };
 
-#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
 struct node_active_region {
        unsigned long start_pfn;
        unsigned long end_pfn;
        int nid;
 };
-#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
+#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
 
 #ifndef CONFIG_DISCONTIGMEM
 /* The array of struct pages - for discontigmem use pgdat->lmem_map */
@@ -720,7 +720,7 @@ extern int movable_zone;
 
 static inline int zone_movable_is_highmem(void)
 {
-#if defined(CONFIG_HIGHMEM) && defined(CONFIG_ARCH_POPULATES_NODE_MAP)
+#if defined(CONFIG_HIGHMEM) && defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP)
        return movable_zone == ZONE_HIGHMEM;
 #else
        return 0;
@@ -938,7 +938,7 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
 #endif
 
 #if !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) && \
-       !defined(CONFIG_ARCH_POPULATES_NODE_MAP)
+       !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP)
 static inline unsigned long early_pfn_to_nid(unsigned long pfn)
 {
        return 0;
index b1f8912..0885561 100644 (file)
@@ -54,6 +54,7 @@ enum perf_hw_id {
        PERF_COUNT_HW_BUS_CYCLES                = 6,
        PERF_COUNT_HW_STALLED_CYCLES_FRONTEND   = 7,
        PERF_COUNT_HW_STALLED_CYCLES_BACKEND    = 8,
+       PERF_COUNT_HW_REF_CPU_CYCLES            = 9,
 
        PERF_COUNT_HW_MAX,                      /* non-ABI */
 };
@@ -890,6 +891,7 @@ struct perf_event_context {
        int                             nr_active;
        int                             is_active;
        int                             nr_stat;
+       int                             nr_freq;
        int                             rotate_disable;
        atomic_t                        refcount;
        struct task_struct              *task;
@@ -1063,12 +1065,12 @@ perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
        }
 }
 
-extern struct jump_label_key perf_sched_events;
+extern struct jump_label_key_deferred perf_sched_events;
 
 static inline void perf_event_task_sched_in(struct task_struct *prev,
                                            struct task_struct *task)
 {
-       if (static_branch(&perf_sched_events))
+       if (static_branch(&perf_sched_events.key))
                __perf_event_task_sched_in(prev, task);
 }
 
@@ -1077,7 +1079,7 @@ static inline void perf_event_task_sched_out(struct task_struct *prev,
 {
        perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, NULL, 0);
 
-       if (static_branch(&perf_sched_events))
+       if (static_branch(&perf_sched_events.key))
                __perf_event_task_sched_out(prev, next);
 }
 
index 79159de..2110a81 100644 (file)
 #define        RED_INACTIVE    0x09F911029D74E35BULL   /* when obj is inactive */
 #define        RED_ACTIVE      0xD84156C5635688C0ULL   /* when obj is active */
 
-#ifdef CONFIG_PHYS_ADDR_T_64BIT
-#define MEMBLOCK_INACTIVE      0x3a84fb0144c9e71bULL
-#else
-#define MEMBLOCK_INACTIVE      0x44c9e71bUL
-#endif
-
 #define SLUB_RED_INACTIVE      0xbb
 #define SLUB_RED_ACTIVE                0xcc
 
index 2cf4226..81c04f4 100644 (file)
@@ -51,6 +51,8 @@ extern int rcutorture_runnable; /* for sysctl */
 #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU)
 extern void rcutorture_record_test_transition(void);
 extern void rcutorture_record_progress(unsigned long vernum);
+extern void do_trace_rcu_torture_read(char *rcutorturename,
+                                     struct rcu_head *rhp);
 #else
 static inline void rcutorture_record_test_transition(void)
 {
@@ -58,6 +60,12 @@ static inline void rcutorture_record_test_transition(void)
 static inline void rcutorture_record_progress(unsigned long vernum)
 {
 }
+#ifdef CONFIG_RCU_TRACE
+extern void do_trace_rcu_torture_read(char *rcutorturename,
+                                     struct rcu_head *rhp);
+#else
+#define do_trace_rcu_torture_read(rcutorturename, rhp) do { } while (0)
+#endif
 #endif
 
 #define UINT_CMP_GE(a, b)      (UINT_MAX / 2 >= (a) - (b))
@@ -177,23 +185,10 @@ extern void rcu_sched_qs(int cpu);
 extern void rcu_bh_qs(int cpu);
 extern void rcu_check_callbacks(int cpu, int user);
 struct notifier_block;
-
-#ifdef CONFIG_NO_HZ
-
-extern void rcu_enter_nohz(void);
-extern void rcu_exit_nohz(void);
-
-#else /* #ifdef CONFIG_NO_HZ */
-
-static inline void rcu_enter_nohz(void)
-{
-}
-
-static inline void rcu_exit_nohz(void)
-{
-}
-
-#endif /* #else #ifdef CONFIG_NO_HZ */
+extern void rcu_idle_enter(void);
+extern void rcu_idle_exit(void);
+extern void rcu_irq_enter(void);
+extern void rcu_irq_exit(void);
 
 /*
  * Infrastructure to implement the synchronize_() primitives in
@@ -233,22 +228,30 @@ static inline void destroy_rcu_head_on_stack(struct rcu_head *head)
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 
-extern struct lockdep_map rcu_lock_map;
-# define rcu_read_acquire() \
-               lock_acquire(&rcu_lock_map, 0, 0, 2, 1, NULL, _THIS_IP_)
-# define rcu_read_release()    lock_release(&rcu_lock_map, 1, _THIS_IP_)
+#ifdef CONFIG_PROVE_RCU
+extern int rcu_is_cpu_idle(void);
+#else /* !CONFIG_PROVE_RCU */
+static inline int rcu_is_cpu_idle(void)
+{
+       return 0;
+}
+#endif /* else !CONFIG_PROVE_RCU */
 
-extern struct lockdep_map rcu_bh_lock_map;
-# define rcu_read_acquire_bh() \
-               lock_acquire(&rcu_bh_lock_map, 0, 0, 2, 1, NULL, _THIS_IP_)
-# define rcu_read_release_bh() lock_release(&rcu_bh_lock_map, 1, _THIS_IP_)
+static inline void rcu_lock_acquire(struct lockdep_map *map)
+{
+       WARN_ON_ONCE(rcu_is_cpu_idle());
+       lock_acquire(map, 0, 0, 2, 1, NULL, _THIS_IP_);
+}
 
-extern struct lockdep_map rcu_sched_lock_map;
-# define rcu_read_acquire_sched() \
-               lock_acquire(&rcu_sched_lock_map, 0, 0, 2, 1, NULL, _THIS_IP_)
-# define rcu_read_release_sched() \
-               lock_release(&rcu_sched_lock_map, 1, _THIS_IP_)
+static inline void rcu_lock_release(struct lockdep_map *map)
+{
+       WARN_ON_ONCE(rcu_is_cpu_idle());
+       lock_release(map, 1, _THIS_IP_);
+}
 
+extern struct lockdep_map rcu_lock_map;
+extern struct lockdep_map rcu_bh_lock_map;
+extern struct lockdep_map rcu_sched_lock_map;
 extern int debug_lockdep_rcu_enabled(void);
 
 /**
@@ -262,11 +265,18 @@ extern int debug_lockdep_rcu_enabled(void);
  *
  * Checks debug_lockdep_rcu_enabled() to prevent false positives during boot
  * and while lockdep is disabled.
+ *
+ * Note that rcu_read_lock() and the matching rcu_read_unlock() must
+ * occur in the same context, for example, it is illegal to invoke
+ * rcu_read_unlock() in process context if the matching rcu_read_lock()
+ * was invoked from within an irq handler.
  */
 static inline int rcu_read_lock_held(void)
 {
        if (!debug_lockdep_rcu_enabled())
                return 1;
+       if (rcu_is_cpu_idle())
+               return 0;
        return lock_is_held(&rcu_lock_map);
 }
 
@@ -290,6 +300,19 @@ extern int rcu_read_lock_bh_held(void);
  *
  * Check debug_lockdep_rcu_enabled() to prevent false positives during boot
  * and while lockdep is disabled.
+ *
+ * Note that if the CPU is in the idle loop from an RCU point of
+ * view (ie: that we are in the section between rcu_idle_enter() and
+ * rcu_idle_exit()) then rcu_read_lock_held() returns false even if the CPU
+ * did an rcu_read_lock().  The reason for this is that RCU ignores CPUs
+ * that are in such a section, considering these as in extended quiescent
+ * state, so such a CPU is effectively never in an RCU read-side critical
+ * section regardless of what RCU primitives it invokes.  This state of
+ * affairs is required --- we need to keep an RCU-free window in idle
+ * where the CPU may possibly enter into low power mode. This way we can
+ * notice an extended quiescent state to other CPUs that started a grace
+ * period. Otherwise we would delay any grace period as long as we run in
+ * the idle task.
  */
 #ifdef CONFIG_PREEMPT_COUNT
 static inline int rcu_read_lock_sched_held(void)
@@ -298,6 +321,8 @@ static inline int rcu_read_lock_sched_held(void)
 
        if (!debug_lockdep_rcu_enabled())
                return 1;
+       if (rcu_is_cpu_idle())
+               return 0;
        if (debug_locks)
                lockdep_opinion = lock_is_held(&rcu_sched_lock_map);
        return lockdep_opinion || preempt_count() != 0 || irqs_disabled();
@@ -311,12 +336,8 @@ static inline int rcu_read_lock_sched_held(void)
 
 #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
-# define rcu_read_acquire()            do { } while (0)
-# define rcu_read_release()            do { } while (0)
-# define rcu_read_acquire_bh()         do { } while (0)
-# define rcu_read_release_bh()         do { } while (0)
-# define rcu_read_acquire_sched()      do { } while (0)
-# define rcu_read_release_sched()      do { } while (0)
+# define rcu_lock_acquire(a)           do { } while (0)
+# define rcu_lock_release(a)           do { } while (0)
 
 static inline int rcu_read_lock_held(void)
 {
@@ -637,7 +658,7 @@ static inline void rcu_read_lock(void)
 {
        __rcu_read_lock();
        __acquire(RCU);
-       rcu_read_acquire();
+       rcu_lock_acquire(&rcu_lock_map);
 }
 
 /*
@@ -657,7 +678,7 @@ static inline void rcu_read_lock(void)
  */
 static inline void rcu_read_unlock(void)
 {
-       rcu_read_release();
+       rcu_lock_release(&rcu_lock_map);
        __release(RCU);
        __rcu_read_unlock();
 }
@@ -673,12 +694,17 @@ static inline void rcu_read_unlock(void)
  * critical sections in interrupt context can use just rcu_read_lock(),
  * though this should at least be commented to avoid confusing people
  * reading the code.
+ *
+ * Note that rcu_read_lock_bh() and the matching rcu_read_unlock_bh()
+ * must occur in the same context, for example, it is illegal to invoke
+ * rcu_read_unlock_bh() from one task if the matching rcu_read_lock_bh()
+ * was invoked from some other task.
  */
 static inline void rcu_read_lock_bh(void)
 {
        local_bh_disable();
        __acquire(RCU_BH);
-       rcu_read_acquire_bh();
+       rcu_lock_acquire(&rcu_bh_lock_map);
 }
 
 /*
@@ -688,7 +714,7 @@ static inline void rcu_read_lock_bh(void)
  */
 static inline void rcu_read_unlock_bh(void)
 {
-       rcu_read_release_bh();
+       rcu_lock_release(&rcu_bh_lock_map);
        __release(RCU_BH);
        local_bh_enable();
 }
@@ -700,12 +726,17 @@ static inline void rcu_read_unlock_bh(void)
  * are being done using call_rcu_sched() or synchronize_rcu_sched().
  * Read-side critical sections can also be introduced by anything that
  * disables preemption, including local_irq_disable() and friends.
+ *
+ * Note that rcu_read_lock_sched() and the matching rcu_read_unlock_sched()
+ * must occur in the same context, for example, it is illegal to invoke
+ * rcu_read_unlock_sched() from process context if the matching
+ * rcu_read_lock_sched() was invoked from an NMI handler.
  */
 static inline void rcu_read_lock_sched(void)
 {
        preempt_disable();
        __acquire(RCU_SCHED);
-       rcu_read_acquire_sched();
+       rcu_lock_acquire(&rcu_sched_lock_map);
 }
 
 /* Used by lockdep and tracing: cannot be traced, cannot call lockdep. */
@@ -722,7 +753,7 @@ static inline notrace void rcu_read_lock_sched_notrace(void)
  */
 static inline void rcu_read_unlock_sched(void)
 {
-       rcu_read_release_sched();
+       rcu_lock_release(&rcu_sched_lock_map);
        __release(RCU_SCHED);
        preempt_enable();
 }
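
Illustrative note, not part of the patch above: rcu_enter_nohz()/rcu_exit_nohz() are gone; the idle path now tells RCU explicitly when the CPU enters and leaves the extended quiescent state via rcu_idle_enter()/rcu_idle_exit(), and rcu_irq_enter()/rcu_irq_exit() are always available rather than depending on CONFIG_NO_HZ. A hedged skeleton of the shape an idle loop takes after this change (exact arch hooks vary; the tick_nohz_* calls are declared in the linux/tick.h hunk later in this merge):

    #include <linux/tick.h>
    #include <linux/rcupdate.h>
    #include <linux/sched.h>

    /* Illustrative idle-loop skeleton; real arch loops add preemption and
     * power-management details around this core sequence. */
    static void demo_cpu_idle(void)
    {
            while (1) {
                    tick_nohz_idle_enter();
                    rcu_idle_enter();       /* CPU enters RCU's extended quiescent state */

                    while (!need_resched())
                            cpu_relax();    /* stand-in for the arch low-power wait */

                    rcu_idle_exit();        /* CPU is visible to RCU again */
                    tick_nohz_idle_exit();
                    schedule();
            }
    }
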
index 1c4f3e9..cf0eb34 100644 (file)
@@ -273,9 +273,11 @@ extern int runqueue_is_locked(int cpu);
 
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
 extern void select_nohz_load_balancer(int stop_tick);
+extern void set_cpu_sd_state_idle(void);
 extern int get_nohz_timer_target(void);
 #else
 static inline void select_nohz_load_balancer(int stop_tick) { }
+static inline void set_cpu_sd_state_idle(void) { }
 #endif
 
 /*
@@ -483,8 +485,8 @@ struct task_cputime {
 
 #define INIT_CPUTIME   \
        (struct task_cputime) {                                 \
-               .utime = cputime_zero,                          \
-               .stime = cputime_zero,                          \
+               .utime = 0,                                     \
+               .stime = 0,                                     \
                .sum_exec_runtime = 0,                          \
        }
 
@@ -901,6 +903,10 @@ struct sched_group_power {
         * single CPU.
         */
        unsigned int power, power_orig;
+       /*
+        * Number of busy cpus in this group.
+        */
+       atomic_t nr_busy_cpus;
 };
 
 struct sched_group {
@@ -925,6 +931,15 @@ static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
        return to_cpumask(sg->cpumask);
 }
 
+/**
+ * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
+ * @group: The group whose first cpu is to be returned.
+ */
+static inline unsigned int group_first_cpu(struct sched_group *group)
+{
+       return cpumask_first(sched_group_cpus(group));
+}
+
 struct sched_domain_attr {
        int relax_domain_level;
 };
@@ -1315,8 +1330,8 @@ struct task_struct {
         * older sibling, respectively.  (p->father can be replaced with 
         * p->real_parent->pid)
         */
-       struct task_struct *real_parent; /* real parent process */
-       struct task_struct *parent; /* recipient of SIGCHLD, wait4() reports */
+       struct task_struct __rcu *real_parent; /* real parent process */
+       struct task_struct __rcu *parent; /* recipient of SIGCHLD, wait4() reports */
        /*
         * children/sibling forms the list of my natural children
         */
@@ -2070,6 +2085,14 @@ extern int sched_setscheduler(struct task_struct *, int,
 extern int sched_setscheduler_nocheck(struct task_struct *, int,
                                      const struct sched_param *);
 extern struct task_struct *idle_task(int cpu);
+/**
+ * is_idle_task - is the specified task an idle task?
+ * @tsk: the task in question.
+ */
+static inline bool is_idle_task(struct task_struct *p)
+{
+       return p->pid == 0;
+}
 extern struct task_struct *curr_task(int cpu);
 extern void set_curr_task(int cpu, struct task_struct *p);
 
index 58971e8..e1b0059 100644 (file)
@@ -28,6 +28,7 @@
 #define _LINUX_SRCU_H
 
 #include <linux/mutex.h>
+#include <linux/rcupdate.h>
 
 struct srcu_struct_array {
        int c[2];
@@ -60,18 +61,10 @@ int __init_srcu_struct(struct srcu_struct *sp, const char *name,
        __init_srcu_struct((sp), #sp, &__srcu_key); \
 })
 
-# define srcu_read_acquire(sp) \
-               lock_acquire(&(sp)->dep_map, 0, 0, 2, 1, NULL, _THIS_IP_)
-# define srcu_read_release(sp) \
-               lock_release(&(sp)->dep_map, 1, _THIS_IP_)
-
 #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
 int init_srcu_struct(struct srcu_struct *sp);
 
-# define srcu_read_acquire(sp)  do { } while (0)
-# define srcu_read_release(sp)  do { } while (0)
-
 #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
 void cleanup_srcu_struct(struct srcu_struct *sp);
@@ -90,12 +83,32 @@ long srcu_batches_completed(struct srcu_struct *sp);
  * read-side critical section.  In absence of CONFIG_DEBUG_LOCK_ALLOC,
  * this assumes we are in an SRCU read-side critical section unless it can
  * prove otherwise.
+ *
+ * Checks debug_lockdep_rcu_enabled() to prevent false positives during boot
+ * and while lockdep is disabled.
+ *
+ * Note that if the CPU is in the idle loop from an RCU point of view
+ * (ie: that we are in the section between rcu_idle_enter() and
+ * rcu_idle_exit()) then srcu_read_lock_held() returns false even if
+ * the CPU did an srcu_read_lock().  The reason for this is that RCU
+ * ignores CPUs that are in such a section, considering these as in
+ * extended quiescent state, so such a CPU is effectively never in an
+ * RCU read-side critical section regardless of what RCU primitives it
+ * invokes.  This state of affairs is required --- we need to keep an
+ * RCU-free window in idle where the CPU may possibly enter into low
+ * power mode. This way we can notice an extended quiescent state to
+ * other CPUs that started a grace period. Otherwise we would delay any
+ * grace period as long as we run in the idle task.
  */
 static inline int srcu_read_lock_held(struct srcu_struct *sp)
 {
-       if (debug_locks)
-               return lock_is_held(&sp->dep_map);
-       return 1;
+       if (rcu_is_cpu_idle())
+               return 0;
+
+       if (!debug_lockdep_rcu_enabled())
+               return 1;
+
+       return lock_is_held(&sp->dep_map);
 }
 
 #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
@@ -145,12 +158,17 @@ static inline int srcu_read_lock_held(struct srcu_struct *sp)
  * one way to indirectly wait on an SRCU grace period is to acquire
  * a mutex that is held elsewhere while calling synchronize_srcu() or
  * synchronize_srcu_expedited().
+ *
+ * Note that srcu_read_lock() and the matching srcu_read_unlock() must
+ * occur in the same context, for example, it is illegal to invoke
+ * srcu_read_unlock() in an irq handler if the matching srcu_read_lock()
+ * was invoked in process context.
  */
 static inline int srcu_read_lock(struct srcu_struct *sp) __acquires(sp)
 {
        int retval = __srcu_read_lock(sp);
 
-       srcu_read_acquire(sp);
+       rcu_lock_acquire(&(sp)->dep_map);
        return retval;
 }
 
@@ -164,8 +182,51 @@ static inline int srcu_read_lock(struct srcu_struct *sp) __acquires(sp)
 static inline void srcu_read_unlock(struct srcu_struct *sp, int idx)
        __releases(sp)
 {
-       srcu_read_release(sp);
+       rcu_lock_release(&(sp)->dep_map);
+       __srcu_read_unlock(sp, idx);
+}
+
+/**
+ * srcu_read_lock_raw - register a new reader for an SRCU-protected structure.
+ * @sp: srcu_struct in which to register the new reader.
+ *
+ * Enter an SRCU read-side critical section.  Similar to srcu_read_lock(),
+ * but avoids the RCU-lockdep checking.  This means that it is legal to
+ * use srcu_read_lock_raw() in one context, for example, in an exception
+ * handler, and then have the matching srcu_read_unlock_raw() in another
+ * context, for example in the task that took the exception.
+ *
+ * However, the entire SRCU read-side critical section must reside within a
+ * single task.  For example, beware of using srcu_read_lock_raw() in
+ * a device interrupt handler and srcu_read_unlock() in the interrupted
+ * task:  This will not work if interrupts are threaded.
+ */
+static inline int srcu_read_lock_raw(struct srcu_struct *sp)
+{
+       unsigned long flags;
+       int ret;
+
+       local_irq_save(flags);
+       ret =  __srcu_read_lock(sp);
+       local_irq_restore(flags);
+       return ret;
+}
+
+/**
+ * srcu_read_unlock_raw - unregister reader from an SRCU-protected structure.
+ * @sp: srcu_struct in which to unregister the old reader.
+ * @idx: return value from corresponding srcu_read_lock_raw().
+ *
+ * Exit an SRCU read-side critical section without lockdep-RCU checking.
+ * See srcu_read_lock_raw() for more details.
+ */
+static inline void srcu_read_unlock_raw(struct srcu_struct *sp, int idx)
+{
+       unsigned long flags;
+
+       local_irq_save(flags);
        __srcu_read_unlock(sp, idx);
+       local_irq_restore(flags);
 }
 
 #endif
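
Illustrative note, not part of the patch above: srcu_read_lock_raw()/srcu_read_unlock_raw() behave like the normal SRCU read-side primitives but skip the lockdep-RCU checks and disable interrupts around the counter update, so the lock and unlock may legitimately happen in different contexts as long as both run in the same task. A sketch of the exception-handler pattern the comment describes; the names and the static index variable are illustrative:

    #include <linux/srcu.h>

    static struct srcu_struct demo_srcu;    /* assume init_srcu_struct() was called */
    static int demo_srcu_idx;               /* handed from one context to the other */

    /* Entered first, e.g. from an exception handler. */
    static void demo_begin_read_side(void)
    {
            demo_srcu_idx = srcu_read_lock_raw(&demo_srcu);
    }

    /* Completed later in the task that took the exception. */
    static void demo_end_read_side(void)
    {
            srcu_read_unlock_raw(&demo_srcu, demo_srcu_idx);
    }
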
index b232ccc..ab8be90 100644 (file)
@@ -7,6 +7,7 @@
 #define _LINUX_TICK_H
 
 #include <linux/clockchips.h>
+#include <linux/irqflags.h>
 
 #ifdef CONFIG_GENERIC_CLOCKEVENTS
 
@@ -121,14 +122,16 @@ static inline int tick_oneshot_mode_active(void) { return 0; }
 #endif /* !CONFIG_GENERIC_CLOCKEVENTS */
 
 # ifdef CONFIG_NO_HZ
-extern void tick_nohz_stop_sched_tick(int inidle);
-extern void tick_nohz_restart_sched_tick(void);
+extern void tick_nohz_idle_enter(void);
+extern void tick_nohz_idle_exit(void);
+extern void tick_nohz_irq_exit(void);
 extern ktime_t tick_nohz_get_sleep_length(void);
 extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time);
 extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time);
 # else
-static inline void tick_nohz_stop_sched_tick(int inidle) { }
-static inline void tick_nohz_restart_sched_tick(void) { }
+static inline void tick_nohz_idle_enter(void) { }
+static inline void tick_nohz_idle_exit(void) { }
+
 static inline ktime_t tick_nohz_get_sleep_length(void)
 {
        ktime_t len = { .tv64 = NSEC_PER_SEC/HZ };
index 3efc9f3..a9ce45e 100644 (file)
@@ -77,13 +77,13 @@ struct task_struct;
 #define __WAIT_BIT_KEY_INITIALIZER(word, bit)                          \
        { .flags = word, .bit_nr = bit, }
 
-extern void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *);
+extern void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *);
 
 #define init_waitqueue_head(q)                         \
        do {                                            \
                static struct lock_class_key __key;     \
                                                        \
-               __init_waitqueue_head((q), &__key);     \
+               __init_waitqueue_head((q), #q, &__key); \
        } while (0)
 
 #ifdef CONFIG_LOCKDEP
index 669fbd6..d2d88be 100644 (file)
@@ -241,24 +241,73 @@ TRACE_EVENT(rcu_fqs,
 
 /*
  * Tracepoint for dyntick-idle entry/exit events.  These take a string
- * as argument: "Start" for entering dyntick-idle mode and "End" for
- * leaving it.
+ * as argument: "Start" for entering dyntick-idle mode, "End" for
+ * leaving it, "--=" for events moving towards idle, and "++=" for events
+ * moving away from idle.  "Error on entry: not idle task" and "Error on
+ * exit: not idle task" indicate that a non-idle task is erroneously
+ * toying with the idle loop.
+ *
+ * These events also take a pair of numbers, which indicate the nesting
+ * depth before and after the event of interest.  Note that task-related
+ * events use the upper bits of each number, while interrupt-related
+ * events use the lower bits.
  */
 TRACE_EVENT(rcu_dyntick,
 
-       TP_PROTO(char *polarity),
+       TP_PROTO(char *polarity, long long oldnesting, long long newnesting),
 
-       TP_ARGS(polarity),
+       TP_ARGS(polarity, oldnesting, newnesting),
 
        TP_STRUCT__entry(
                __field(char *, polarity)
+               __field(long long, oldnesting)
+               __field(long long, newnesting)
        ),
 
        TP_fast_assign(
                __entry->polarity = polarity;
+               __entry->oldnesting = oldnesting;
+               __entry->newnesting = newnesting;
+       ),
+
+       TP_printk("%s %llx %llx", __entry->polarity,
+                 __entry->oldnesting, __entry->newnesting)
+);
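
As a hedged illustration (the variable names are assumptions, and the call site is not shown in this hunk), the extended event now carries the nesting depth before and after the transition:

/* Illustrative fragment: oldval/newval stand for the dynticks nesting
 * counter before and after the transition named by the string. */
trace_rcu_dyntick("Start", oldval, newval);	/* heading into dyntick idle        */
trace_rcu_dyntick("--=", oldval, newval);	/* an event moving towards idle      */
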
+
+/*
+ * Tracepoint for RCU preparation for idle, the goal being to get RCU
+ * processing done so that the current CPU can shut off its scheduling
+ * clock and enter dyntick-idle mode.  One way to accomplish this is
+ * to drain all RCU callbacks from this CPU, and the other is to have
+ * done everything RCU requires for the current grace period.  In this
+ * latter case, the CPU will be awakened at the end of the current grace
+ * period in order to process the remainder of its callbacks.
+ *
+ * These tracepoints take a string as argument:
+ *
+ *     "No callbacks": Nothing to do, no callbacks on this CPU.
+ *     "In holdoff": Nothing to do, holding off after unsuccessful attempt.
+ *     "Begin holdoff": Attempt failed, don't retry until next jiffy.
+ *     "Dyntick with callbacks": Entering dyntick-idle despite callbacks.
+ *     "More callbacks": Still more callbacks, try again to clear them out.
+ *     "Callbacks drained": All callbacks processed, off to dyntick idle!
+ *     "Timer": Timer fired to cause CPU to continue processing callbacks.
+ */
+TRACE_EVENT(rcu_prep_idle,
+
+       TP_PROTO(char *reason),
+
+       TP_ARGS(reason),
+
+       TP_STRUCT__entry(
+               __field(char *, reason)
+       ),
+
+       TP_fast_assign(
+               __entry->reason = reason;
        ),
 
-       TP_printk("%s", __entry->polarity)
+       TP_printk("%s", __entry->reason)
 );
 
 /*
@@ -412,27 +461,71 @@ TRACE_EVENT(rcu_invoke_kfree_callback,
 
 /*
  * Tracepoint for exiting rcu_do_batch after RCU callbacks have been
- * invoked.  The first argument is the name of the RCU flavor and
- * the second argument is number of callbacks actually invoked.
+ * invoked.  The first argument is the name of the RCU flavor,
+ * the second argument is number of callbacks actually invoked,
+ * the third argument (cb) is whether or not any of the callbacks that
+ * were ready to invoke at the beginning of this batch are still
+ * queued, the fourth argument (nr) is the return value of need_resched(),
+ * the fifth argument (iit) is 1 if the current task is the idle task,
+ * and the sixth argument (risk) is the return value from
+ * rcu_is_callbacks_kthread().
  */
 TRACE_EVENT(rcu_batch_end,
 
-       TP_PROTO(char *rcuname, int callbacks_invoked),
+       TP_PROTO(char *rcuname, int callbacks_invoked,
+                bool cb, bool nr, bool iit, bool risk),
 
-       TP_ARGS(rcuname, callbacks_invoked),
+       TP_ARGS(rcuname, callbacks_invoked, cb, nr, iit, risk),
 
        TP_STRUCT__entry(
                __field(char *, rcuname)
                __field(int, callbacks_invoked)
+               __field(bool, cb)
+               __field(bool, nr)
+               __field(bool, iit)
+               __field(bool, risk)
        ),
 
        TP_fast_assign(
                __entry->rcuname = rcuname;
                __entry->callbacks_invoked = callbacks_invoked;
+               __entry->cb = cb;
+               __entry->nr = nr;
+               __entry->iit = iit;
+               __entry->risk = risk;
+       ),
+
+       TP_printk("%s CBs-invoked=%d idle=%c%c%c%c",
+                 __entry->rcuname, __entry->callbacks_invoked,
+                 __entry->cb ? 'C' : '.',
+                 __entry->nr ? 'S' : '.',
+                 __entry->iit ? 'I' : '.',
+                 __entry->risk ? 'R' : '.')
+);
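
Purely as an illustration (the surrounding variables are assumptions about the caller, not part of this hunk), the four new flags map onto a call roughly like the fragment below; 'list' is assumed to hold any callbacks that were ready at the start of the batch and are still queued.

/* Illustrative fragment only. */
trace_rcu_batch_end(rcuname, count,
		    !!list,			/* cb:   ready callbacks still queued  */
		    need_resched(),		/* nr:   reschedule pending            */
		    is_idle_task(current),	/* iit:  running in the idle task      */
		    rcu_is_callbacks_kthread()); /* risk: callbacks run from kthread   */
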
+
+/*
+ * Tracepoint for rcutorture readers.  The first argument is the name
+ * of the RCU flavor from rcutorture's viewpoint and the second argument
+ * is the callback address.
+ */
+TRACE_EVENT(rcu_torture_read,
+
+       TP_PROTO(char *rcutorturename, struct rcu_head *rhp),
+
+       TP_ARGS(rcutorturename, rhp),
+
+       TP_STRUCT__entry(
+               __field(char *, rcutorturename)
+               __field(struct rcu_head *, rhp)
+       ),
+
+       TP_fast_assign(
+               __entry->rcutorturename = rcutorturename;
+               __entry->rhp = rhp;
        ),
 
-       TP_printk("%s CBs-invoked=%d",
-                 __entry->rcuname, __entry->callbacks_invoked)
+       TP_printk("%s torture read %p",
+                 __entry->rcutorturename, __entry->rhp)
 );
 
 #else /* #ifdef CONFIG_RCU_TRACE */
@@ -443,13 +536,16 @@ TRACE_EVENT(rcu_batch_end,
 #define trace_rcu_unlock_preempted_task(rcuname, gpnum, pid) do { } while (0)
 #define trace_rcu_quiescent_state_report(rcuname, gpnum, mask, qsmask, level, grplo, grphi, gp_tasks) do { } while (0)
 #define trace_rcu_fqs(rcuname, gpnum, cpu, qsevent) do { } while (0)
-#define trace_rcu_dyntick(polarity) do { } while (0)
+#define trace_rcu_dyntick(polarity, oldnesting, newnesting) do { } while (0)
+#define trace_rcu_prep_idle(reason) do { } while (0)
 #define trace_rcu_callback(rcuname, rhp, qlen) do { } while (0)
 #define trace_rcu_kfree_callback(rcuname, rhp, offset, qlen) do { } while (0)
 #define trace_rcu_batch_start(rcuname, qlen, blimit) do { } while (0)
 #define trace_rcu_invoke_callback(rcuname, rhp) do { } while (0)
 #define trace_rcu_invoke_kfree_callback(rcuname, rhp, offset) do { } while (0)
-#define trace_rcu_batch_end(rcuname, callbacks_invoked) do { } while (0)
+#define trace_rcu_batch_end(rcuname, callbacks_invoked, cb, nr, iit, risk) \
+       do { } while (0)
+#define trace_rcu_torture_read(rcutorturename, rhp) do { } while (0)
 
 #endif /* #else #ifdef CONFIG_RCU_TRACE */
 
index 959ff18..6ba596b 100644 (file)
@@ -330,6 +330,13 @@ DEFINE_EVENT(sched_stat_template, sched_stat_iowait,
             TP_PROTO(struct task_struct *tsk, u64 delay),
             TP_ARGS(tsk, delay));
 
+/*
+ * Tracepoint for accounting blocked time (time the task is in uninterruptible sleep).
+ */
+DEFINE_EVENT(sched_stat_template, sched_stat_blocked,
+            TP_PROTO(struct task_struct *tsk, u64 delay),
+            TP_ARGS(tsk, delay));
+
 /*
  * Tracepoint for accounting runtime (time the task is executing
  * on a CPU).
@@ -363,6 +370,56 @@ TRACE_EVENT(sched_stat_runtime,
                        (unsigned long long)__entry->vruntime)
 );
 
+#ifdef CREATE_TRACE_POINTS
+static inline u64 trace_get_sleeptime(struct task_struct *tsk)
+{
+#ifdef CONFIG_SCHEDSTATS
+       u64 block, sleep;
+
+       block = tsk->se.statistics.block_start;
+       sleep = tsk->se.statistics.sleep_start;
+       tsk->se.statistics.block_start = 0;
+       tsk->se.statistics.sleep_start = 0;
+
+       return block ? block : sleep ? sleep : 0;
+#else
+       return 0;
+#endif
+}
+#endif
+
+/*
+ * Tracepoint for accounting sleeptime (time the task is sleeping
+ * or waiting for I/O).
+ */
+TRACE_EVENT(sched_stat_sleeptime,
+
+       TP_PROTO(struct task_struct *tsk, u64 now),
+
+       TP_ARGS(tsk, now),
+
+       TP_STRUCT__entry(
+               __array( char,  comm,   TASK_COMM_LEN   )
+               __field( pid_t, pid                     )
+               __field( u64,   sleeptime               )
+       ),
+
+       TP_fast_assign(
+               memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+               __entry->pid            = tsk->pid;
+               __entry->sleeptime = trace_get_sleeptime(tsk);
+               __entry->sleeptime = __entry->sleeptime ?
+                               now - __entry->sleeptime : 0;
+       )
+       TP_perf_assign(
+               __perf_count(__entry->sleeptime);
+       ),
+
+       TP_printk("comm=%s pid=%d sleeptime=%Lu [ns]",
+                       __entry->comm, __entry->pid,
+                       (unsigned long long)__entry->sleeptime)
+);
+
 /*
  * Tracepoint for showing priority inheritance modifying a tasks
  * priority.
index b8930d5..a075765 100644 (file)
@@ -469,14 +469,14 @@ config RCU_FANOUT_EXACT
 
 config RCU_FAST_NO_HZ
        bool "Accelerate last non-dyntick-idle CPU's grace periods"
-       depends on TREE_RCU && NO_HZ && SMP
+       depends on NO_HZ && SMP
        default n
        help
          This option causes RCU to attempt to accelerate grace periods
-         in order to allow the final CPU to enter dynticks-idle state
-         more quickly.  On the other hand, this option increases the
-         overhead of the dynticks-idle checking, particularly on systems
-         with large numbers of CPUs.
+         in order to allow CPUs to enter dynticks-idle state more
+         quickly.  On the other hand, this option increases the overhead
+         of the dynticks-idle checking, particularly on systems with
+         large numbers of CPUs.
 
          Say Y if energy efficiency is critically important, particularly
                if you have relatively few CPUs.
index 217ed23..2c76efb 100644 (file)
@@ -469,13 +469,12 @@ asmlinkage void __init start_kernel(void)
        char * command_line;
        extern const struct kernel_param __start___param[], __stop___param[];
 
-       smp_setup_processor_id();
-
        /*
         * Need to run as early as possible, to initialize the
         * lockdep hash:
         */
        lockdep_init();
+       smp_setup_processor_id();
        debug_objects_early_init();
 
        /*
index e898c5b..f70396e 100644 (file)
@@ -2,16 +2,15 @@
 # Makefile for the linux kernel.
 #
 
-obj-y     = sched.o fork.o exec_domain.o panic.o printk.o \
+obj-y     = fork.o exec_domain.o panic.o printk.o \
            cpu.o exit.o itimer.o time.o softirq.o resource.o \
            sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
            signal.o sys.o kmod.o workqueue.o pid.o \
            rcupdate.o extable.o params.o posix-timers.o \
            kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
            hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
-           notifier.o ksysfs.o sched_clock.o cred.o \
-           async.o range.o
-obj-y += groups.o
+           notifier.o ksysfs.o cred.o \
+           async.o range.o groups.o
 
 ifdef CONFIG_FUNCTION_TRACER
 # Do not trace debug files and internal ftrace files
@@ -20,10 +19,11 @@ CFLAGS_REMOVE_lockdep_proc.o = -pg
 CFLAGS_REMOVE_mutex-debug.o = -pg
 CFLAGS_REMOVE_rtmutex-debug.o = -pg
 CFLAGS_REMOVE_cgroup-debug.o = -pg
-CFLAGS_REMOVE_sched_clock.o = -pg
 CFLAGS_REMOVE_irq_work.o = -pg
 endif
 
+obj-y += sched/
+
 obj-$(CONFIG_FREEZER) += freezer.o
 obj-$(CONFIG_PROFILING) += profile.o
 obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
@@ -99,7 +99,6 @@ obj-$(CONFIG_TRACING) += trace/
 obj-$(CONFIG_X86_DS) += trace/
 obj-$(CONFIG_RING_BUFFER) += trace/
 obj-$(CONFIG_TRACEPOINTS) += trace/
-obj-$(CONFIG_SMP) += sched_cpupri.o
 obj-$(CONFIG_IRQ_WORK) += irq_work.o
 obj-$(CONFIG_CPU_PM) += cpu_pm.o
 
@@ -110,15 +109,6 @@ obj-$(CONFIG_PADATA) += padata.o
 obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
 obj-$(CONFIG_JUMP_LABEL) += jump_label.o
 
-ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
-# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
-# needed for x86 only.  Why this used to be enabled for all architectures is beyond
-# me.  I suspect most platforms don't need this, but until we know that for sure
-# I turn this off for IA-64 only.  Andreas Schwab says it's also needed on m68k
-# to get a correct value for the wait-channel (WCHAN in ps). --davidm
-CFLAGS_sched.o := $(PROFILING) -fno-omit-frame-pointer
-endif
-
 $(obj)/configs.o: $(obj)/config_data.h
 
 # config_data.h contains the same information as ikconfig.h but gzipped.
index fa7eb3d..203dfea 100644 (file)
@@ -613,8 +613,8 @@ void acct_collect(long exitcode, int group_dead)
                pacct->ac_flag |= ACORE;
        if (current->flags & PF_SIGNALED)
                pacct->ac_flag |= AXSIG;
-       pacct->ac_utime = cputime_add(pacct->ac_utime, current->utime);
-       pacct->ac_stime = cputime_add(pacct->ac_stime, current->stime);
+       pacct->ac_utime += current->utime;
+       pacct->ac_stime += current->stime;
        pacct->ac_minflt += current->min_flt;
        pacct->ac_majflt += current->maj_flt;
        spin_unlock_irq(&current->sighand->siglock);
index 563f136..5ca38d5 100644 (file)
@@ -178,8 +178,7 @@ static inline void check_for_tasks(int cpu)
        write_lock_irq(&tasklist_lock);
        for_each_process(p) {
                if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
-                   (!cputime_eq(p->utime, cputime_zero) ||
-                    !cputime_eq(p->stime, cputime_zero)))
+                   (p->utime || p->stime))
                        printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d "
                                "(state = %ld, flags = %x)\n",
                                p->comm, task_pid_nr(p), cpu,
@@ -380,6 +379,7 @@ out:
        cpu_maps_update_done();
        return err;
 }
+EXPORT_SYMBOL_GPL(cpu_up);
 
 #ifdef CONFIG_PM_SLEEP_SMP
 static cpumask_var_t frozen_cpus;
index 5532dd3..7d6fb40 100644 (file)
@@ -636,7 +636,7 @@ char kdb_task_state_char (const struct task_struct *p)
                (p->exit_state & EXIT_ZOMBIE) ? 'Z' :
                (p->exit_state & EXIT_DEAD) ? 'E' :
                (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?';
-       if (p->pid == 0) {
+       if (is_idle_task(p)) {
                /* Idle task.  Is it really idle, apart from the kdb
                 * interrupt? */
                if (!kdb_task_has_cpu(p) || kgdb_info[cpu].irq_depth == 1) {
index 89e5e8a..22d901f 100644 (file)
@@ -2,5 +2,5 @@ ifdef CONFIG_FUNCTION_TRACER
 CFLAGS_REMOVE_core.o = -pg
 endif
 
-obj-y := core.o ring_buffer.o
+obj-y := core.o ring_buffer.o callchain.o
 obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
new file mode 100644 (file)
index 0000000..057e24b
--- /dev/null
@@ -0,0 +1,191 @@
+/*
+ * Performance events callchain code, extracted from core.c:
+ *
+ *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
+ *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *  Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ *
+ * For licensing details see kernel-base/COPYING
+ */
+
+#include <linux/perf_event.h>
+#include <linux/slab.h>
+#include "internal.h"
+
+struct callchain_cpus_entries {
+       struct rcu_head                 rcu_head;
+       struct perf_callchain_entry     *cpu_entries[0];
+};
+
+static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
+static atomic_t nr_callchain_events;
+static DEFINE_MUTEX(callchain_mutex);
+static struct callchain_cpus_entries *callchain_cpus_entries;
+
+
+__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
+                                 struct pt_regs *regs)
+{
+}
+
+__weak void perf_callchain_user(struct perf_callchain_entry *entry,
+                               struct pt_regs *regs)
+{
+}
+
+static void release_callchain_buffers_rcu(struct rcu_head *head)
+{
+       struct callchain_cpus_entries *entries;
+       int cpu;
+
+       entries = container_of(head, struct callchain_cpus_entries, rcu_head);
+
+       for_each_possible_cpu(cpu)
+               kfree(entries->cpu_entries[cpu]);
+
+       kfree(entries);
+}
+
+static void release_callchain_buffers(void)
+{
+       struct callchain_cpus_entries *entries;
+
+       entries = callchain_cpus_entries;
+       rcu_assign_pointer(callchain_cpus_entries, NULL);
+       call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
+}
+
+static int alloc_callchain_buffers(void)
+{
+       int cpu;
+       int size;
+       struct callchain_cpus_entries *entries;
+
+       /*
+        * We can't use the percpu allocation API for data that can be
+        * accessed from NMI. Use a temporary manual per cpu allocation
+        * until that gets sorted out.
+        */
+       size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]);
+
+       entries = kzalloc(size, GFP_KERNEL);
+       if (!entries)
+               return -ENOMEM;
+
+       size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
+
+       for_each_possible_cpu(cpu) {
+               entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
+                                                        cpu_to_node(cpu));
+               if (!entries->cpu_entries[cpu])
+                       goto fail;
+       }
+
+       rcu_assign_pointer(callchain_cpus_entries, entries);
+
+       return 0;
+
+fail:
+       for_each_possible_cpu(cpu)
+               kfree(entries->cpu_entries[cpu]);
+       kfree(entries);
+
+       return -ENOMEM;
+}
+
+int get_callchain_buffers(void)
+{
+       int err = 0;
+       int count;
+
+       mutex_lock(&callchain_mutex);
+
+       count = atomic_inc_return(&nr_callchain_events);
+       if (WARN_ON_ONCE(count < 1)) {
+               err = -EINVAL;
+               goto exit;
+       }
+
+       if (count > 1) {
+               /* If the allocation failed, give up */
+               if (!callchain_cpus_entries)
+                       err = -ENOMEM;
+               goto exit;
+       }
+
+       err = alloc_callchain_buffers();
+       if (err)
+               release_callchain_buffers();
+exit:
+       mutex_unlock(&callchain_mutex);
+
+       return err;
+}
+
+void put_callchain_buffers(void)
+{
+       if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) {
+               release_callchain_buffers();
+               mutex_unlock(&callchain_mutex);
+       }
+}
+
+static struct perf_callchain_entry *get_callchain_entry(int *rctx)
+{
+       int cpu;
+       struct callchain_cpus_entries *entries;
+
+       *rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
+       if (*rctx == -1)
+               return NULL;
+
+       entries = rcu_dereference(callchain_cpus_entries);
+       if (!entries)
+               return NULL;
+
+       cpu = smp_processor_id();
+
+       return &entries->cpu_entries[cpu][*rctx];
+}
+
+static void
+put_callchain_entry(int rctx)
+{
+       put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
+}
+
+struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+{
+       int rctx;
+       struct perf_callchain_entry *entry;
+
+
+       entry = get_callchain_entry(&rctx);
+       if (rctx == -1)
+               return NULL;
+
+       if (!entry)
+               goto exit_put;
+
+       entry->nr = 0;
+
+       if (!user_mode(regs)) {
+               perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
+               perf_callchain_kernel(entry, regs);
+               if (current->mm)
+                       regs = task_pt_regs(current);
+               else
+                       regs = NULL;
+       }
+
+       if (regs) {
+               perf_callchain_store(entry, PERF_CONTEXT_USER);
+               perf_callchain_user(entry, regs);
+       }
+
+exit_put:
+       put_callchain_entry(rctx);
+
+       return entry;
+}
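
For orientation only (the two wrapper functions below are hypothetical; the real users are perf core's event setup and free_event()), the newly exported get/put pair is reference counted and is meant to bracket an event's lifetime when callchain sampling is requested:

/* Sketch only: my_event_init/my_event_destroy are illustrative wrappers. */
static int my_event_init(struct perf_event *event)
{
	if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN))
		return 0;
	return get_callchain_buffers();		/* first user allocates per-CPU entries */
}

static void my_event_destroy(struct perf_event *event)
{
	if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
		put_callchain_buffers();	/* last user frees them via RCU */
}
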
index 58690af..890eb02 100644 (file)
@@ -128,7 +128,7 @@ enum event_type_t {
  * perf_sched_events : >0 events exist
  * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
  */
-struct jump_label_key perf_sched_events __read_mostly;
+struct jump_label_key_deferred perf_sched_events __read_mostly;
 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
 
 static atomic_t nr_mmap_events __read_mostly;
@@ -1130,6 +1130,8 @@ event_sched_out(struct perf_event *event,
        if (!is_software_event(event))
                cpuctx->active_oncpu--;
        ctx->nr_active--;
+       if (event->attr.freq && event->attr.sample_freq)
+               ctx->nr_freq--;
        if (event->attr.exclusive || !cpuctx->active_oncpu)
                cpuctx->exclusive = 0;
 }
@@ -1325,6 +1327,7 @@ retry:
        }
        raw_spin_unlock_irq(&ctx->lock);
 }
+EXPORT_SYMBOL_GPL(perf_event_disable);
 
 static void perf_set_shadow_time(struct perf_event *event,
                                 struct perf_event_context *ctx,
@@ -1406,6 +1409,8 @@ event_sched_in(struct perf_event *event,
        if (!is_software_event(event))
                cpuctx->active_oncpu++;
        ctx->nr_active++;
+       if (event->attr.freq && event->attr.sample_freq)
+               ctx->nr_freq++;
 
        if (event->attr.exclusive)
                cpuctx->exclusive = 1;
@@ -1662,8 +1667,7 @@ retry:
  * Note: this works for group members as well as group leaders
  * since the non-leader members' sibling_lists will be empty.
  */
-static void __perf_event_mark_enabled(struct perf_event *event,
-                                       struct perf_event_context *ctx)
+static void __perf_event_mark_enabled(struct perf_event *event)
 {
        struct perf_event *sub;
        u64 tstamp = perf_event_time(event);
@@ -1701,7 +1705,7 @@ static int __perf_event_enable(void *info)
         */
        perf_cgroup_set_timestamp(current, ctx);
 
-       __perf_event_mark_enabled(event, ctx);
+       __perf_event_mark_enabled(event);
 
        if (!event_filter_match(event)) {
                if (is_cgroup_event(event))
@@ -1782,7 +1786,7 @@ void perf_event_enable(struct perf_event *event)
 
 retry:
        if (!ctx->is_active) {
-               __perf_event_mark_enabled(event, ctx);
+               __perf_event_mark_enabled(event);
                goto out;
        }
 
@@ -1809,6 +1813,7 @@ retry:
 out:
        raw_spin_unlock_irq(&ctx->lock);
 }
+EXPORT_SYMBOL_GPL(perf_event_enable);
 
 int perf_event_refresh(struct perf_event *event, int refresh)
 {
@@ -2327,6 +2332,9 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
        u64 interrupts, now;
        s64 delta;
 
+       if (!ctx->nr_freq)
+               return;
+
        list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
                if (event->state != PERF_EVENT_STATE_ACTIVE)
                        continue;
@@ -2382,12 +2390,14 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
 {
        u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC;
        struct perf_event_context *ctx = NULL;
-       int rotate = 0, remove = 1;
+       int rotate = 0, remove = 1, freq = 0;
 
        if (cpuctx->ctx.nr_events) {
                remove = 0;
                if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
                        rotate = 1;
+               if (cpuctx->ctx.nr_freq)
+                       freq = 1;
        }
 
        ctx = cpuctx->task_ctx;
@@ -2395,33 +2405,40 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
                remove = 0;
                if (ctx->nr_events != ctx->nr_active)
                        rotate = 1;
+               if (ctx->nr_freq)
+                       freq = 1;
        }
 
+       if (!rotate && !freq)
+               goto done;
+
        perf_ctx_lock(cpuctx, cpuctx->task_ctx);
        perf_pmu_disable(cpuctx->ctx.pmu);
-       perf_ctx_adjust_freq(&cpuctx->ctx, interval);
-       if (ctx)
-               perf_ctx_adjust_freq(ctx, interval);
 
-       if (!rotate)
-               goto done;
+       if (freq) {
+               perf_ctx_adjust_freq(&cpuctx->ctx, interval);
+               if (ctx)
+                       perf_ctx_adjust_freq(ctx, interval);
+       }
 
-       cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
-       if (ctx)
-               ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
+       if (rotate) {
+               cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+               if (ctx)
+                       ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
 
-       rotate_ctx(&cpuctx->ctx);
-       if (ctx)
-               rotate_ctx(ctx);
+               rotate_ctx(&cpuctx->ctx);
+               if (ctx)
+                       rotate_ctx(ctx);
 
-       perf_event_sched_in(cpuctx, ctx, current);
+               perf_event_sched_in(cpuctx, ctx, current);
+       }
+
+       perf_pmu_enable(cpuctx->ctx.pmu);
+       perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 
 done:
        if (remove)
                list_del_init(&cpuctx->rotation_list);
-
-       perf_pmu_enable(cpuctx->ctx.pmu);
-       perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 }
 
 void perf_event_task_tick(void)
@@ -2448,7 +2465,7 @@ static int event_enable_on_exec(struct perf_event *event,
        if (event->state >= PERF_EVENT_STATE_INACTIVE)
                return 0;
 
-       __perf_event_mark_enabled(event, ctx);
+       __perf_event_mark_enabled(event);
 
        return 1;
 }
@@ -2480,13 +2497,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
        raw_spin_lock(&ctx->lock);
        task_ctx_sched_out(ctx);
 
-       list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
-               ret = event_enable_on_exec(event, ctx);
-               if (ret)
-                       enabled = 1;
-       }
-
-       list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
+       list_for_each_entry(event, &ctx->event_list, event_entry) {
                ret = event_enable_on_exec(event, ctx);
                if (ret)
                        enabled = 1;
@@ -2573,215 +2584,6 @@ static u64 perf_event_read(struct perf_event *event)
        return perf_event_count(event);
 }
 
-/*
- * Callchain support
- */
-
-struct callchain_cpus_entries {
-       struct rcu_head                 rcu_head;
-       struct perf_callchain_entry     *cpu_entries[0];
-};
-
-static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
-static atomic_t nr_callchain_events;
-static DEFINE_MUTEX(callchain_mutex);
-struct callchain_cpus_entries *callchain_cpus_entries;
-
-
-__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
-                                 struct pt_regs *regs)
-{
-}
-
-__weak void perf_callchain_user(struct perf_callchain_entry *entry,
-                               struct pt_regs *regs)
-{
-}
-
-static void release_callchain_buffers_rcu(struct rcu_head *head)
-{
-       struct callchain_cpus_entries *entries;
-       int cpu;
-
-       entries = container_of(head, struct callchain_cpus_entries, rcu_head);
-
-       for_each_possible_cpu(cpu)
-               kfree(entries->cpu_entries[cpu]);
-
-       kfree(entries);
-}
-
-static void release_callchain_buffers(void)
-{
-       struct callchain_cpus_entries *entries;
-
-       entries = callchain_cpus_entries;
-       rcu_assign_pointer(callchain_cpus_entries, NULL);
-       call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
-}
-
-static int alloc_callchain_buffers(void)
-{
-       int cpu;
-       int size;
-       struct callchain_cpus_entries *entries;
-
-       /*
-        * We can't use the percpu allocation API for data that can be
-        * accessed from NMI. Use a temporary manual per cpu allocation
-        * until that gets sorted out.
-        */
-       size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]);
-
-       entries = kzalloc(size, GFP_KERNEL);
-       if (!entries)
-               return -ENOMEM;
-
-       size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
-
-       for_each_possible_cpu(cpu) {
-               entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
-                                                        cpu_to_node(cpu));
-               if (!entries->cpu_entries[cpu])
-                       goto fail;
-       }
-
-       rcu_assign_pointer(callchain_cpus_entries, entries);
-
-       return 0;
-
-fail:
-       for_each_possible_cpu(cpu)
-               kfree(entries->cpu_entries[cpu]);
-       kfree(entries);
-
-       return -ENOMEM;
-}
-
-static int get_callchain_buffers(void)
-{
-       int err = 0;
-       int count;
-
-       mutex_lock(&callchain_mutex);
-
-       count = atomic_inc_return(&nr_callchain_events);
-       if (WARN_ON_ONCE(count < 1)) {
-               err = -EINVAL;
-               goto exit;
-       }
-
-       if (count > 1) {
-               /* If the allocation failed, give up */
-               if (!callchain_cpus_entries)
-                       err = -ENOMEM;
-               goto exit;
-       }
-
-       err = alloc_callchain_buffers();
-       if (err)
-               release_callchain_buffers();
-exit:
-       mutex_unlock(&callchain_mutex);
-
-       return err;
-}
-
-static void put_callchain_buffers(void)
-{
-       if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) {
-               release_callchain_buffers();
-               mutex_unlock(&callchain_mutex);
-       }
-}
-
-static int get_recursion_context(int *recursion)
-{
-       int rctx;
-
-       if (in_nmi())
-               rctx = 3;
-       else if (in_irq())
-               rctx = 2;
-       else if (in_softirq())
-               rctx = 1;
-       else
-               rctx = 0;
-
-       if (recursion[rctx])
-               return -1;
-
-       recursion[rctx]++;
-       barrier();
-
-       return rctx;
-}
-
-static inline void put_recursion_context(int *recursion, int rctx)
-{
-       barrier();
-       recursion[rctx]--;
-}
-
-static struct perf_callchain_entry *get_callchain_entry(int *rctx)
-{
-       int cpu;
-       struct callchain_cpus_entries *entries;
-
-       *rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
-       if (*rctx == -1)
-               return NULL;
-
-       entries = rcu_dereference(callchain_cpus_entries);
-       if (!entries)
-               return NULL;
-
-       cpu = smp_processor_id();
-
-       return &entries->cpu_entries[cpu][*rctx];
-}
-
-static void
-put_callchain_entry(int rctx)
-{
-       put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
-}
-
-static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
-{
-       int rctx;
-       struct perf_callchain_entry *entry;
-
-
-       entry = get_callchain_entry(&rctx);
-       if (rctx == -1)
-               return NULL;
-
-       if (!entry)
-               goto exit_put;
-
-       entry->nr = 0;
-
-       if (!user_mode(regs)) {
-               perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
-               perf_callchain_kernel(entry, regs);
-               if (current->mm)
-                       regs = task_pt_regs(current);
-               else
-                       regs = NULL;
-       }
-
-       if (regs) {
-               perf_callchain_store(entry, PERF_CONTEXT_USER);
-               perf_callchain_user(entry, regs);
-       }
-
-exit_put:
-       put_callchain_entry(rctx);
-
-       return entry;
-}
-
 /*
  * Initialize the perf_event context in a task_struct:
  */
@@ -2946,7 +2748,7 @@ static void free_event(struct perf_event *event)
 
        if (!event->parent) {
                if (event->attach_state & PERF_ATTACH_TASK)
-                       jump_label_dec(&perf_sched_events);
+                       jump_label_dec_deferred(&perf_sched_events);
                if (event->attr.mmap || event->attr.mmap_data)
                        atomic_dec(&nr_mmap_events);
                if (event->attr.comm)
@@ -2957,7 +2759,7 @@ static void free_event(struct perf_event *event)
                        put_callchain_buffers();
                if (is_cgroup_event(event)) {
                        atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
-                       jump_label_dec(&perf_sched_events);
+                       jump_label_dec_deferred(&perf_sched_events);
                }
        }
 
@@ -4820,7 +4622,6 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
        struct hw_perf_event *hwc = &event->hw;
        int throttle = 0;
 
-       data->period = event->hw.last_period;
        if (!overflow)
                overflow = perf_swevent_set_period(event);
 
@@ -4854,6 +4655,12 @@ static void perf_swevent_event(struct perf_event *event, u64 nr,
        if (!is_sampling_event(event))
                return;
 
+       if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
+               data->period = nr;
+               return perf_swevent_overflow(event, 1, data, regs);
+       } else
+               data->period = event->hw.last_period;
+
        if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
                return perf_swevent_overflow(event, 1, data, regs);
 
@@ -5366,7 +5173,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
        regs = get_irq_regs();
 
        if (regs && !perf_exclude_event(event, regs)) {
-               if (!(event->attr.exclude_idle && current->pid == 0))
+               if (!(event->attr.exclude_idle && is_idle_task(current)))
                        if (perf_event_overflow(event, &data, regs))
                                ret = HRTIMER_NORESTART;
        }
@@ -5981,7 +5788,7 @@ done:
 
        if (!event->parent) {
                if (event->attach_state & PERF_ATTACH_TASK)
-                       jump_label_inc(&perf_sched_events);
+                       jump_label_inc(&perf_sched_events.key);
                if (event->attr.mmap || event->attr.mmap_data)
                        atomic_inc(&nr_mmap_events);
                if (event->attr.comm)
@@ -6219,7 +6026,7 @@ SYSCALL_DEFINE5(perf_event_open,
                 * - that may need work on context switch
                 */
                atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
-               jump_label_inc(&perf_sched_events);
+               jump_label_inc(&perf_sched_events.key);
        }
 
        /*
@@ -7065,6 +6872,9 @@ void __init perf_event_init(void)
 
        ret = init_hw_breakpoint();
        WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
+
+       /* do not patch jump label more than once per second */
+       jump_label_rate_limit(&perf_sched_events, HZ);
 }
 
 static int __init perf_event_sysfs_init(void)
index 64568a6..b0b107f 100644 (file)
@@ -1,6 +1,10 @@
 #ifndef _KERNEL_EVENTS_INTERNAL_H
 #define _KERNEL_EVENTS_INTERNAL_H
 
+#include <linux/hardirq.h>
+
+/* Buffer handling */
+
 #define RING_BUFFER_WRITABLE           0x01
 
 struct ring_buffer {
@@ -67,7 +71,7 @@ static inline int page_order(struct ring_buffer *rb)
 }
 #endif
 
-static unsigned long perf_data_size(struct ring_buffer *rb)
+static inline unsigned long perf_data_size(struct ring_buffer *rb)
 {
        return rb->nr_pages << (PAGE_SHIFT + page_order(rb));
 }
@@ -96,4 +100,37 @@ __output_copy(struct perf_output_handle *handle,
        } while (len);
 }
 
+/* Callchain handling */
+extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
+extern int get_callchain_buffers(void);
+extern void put_callchain_buffers(void);
+
+static inline int get_recursion_context(int *recursion)
+{
+       int rctx;
+
+       if (in_nmi())
+               rctx = 3;
+       else if (in_irq())
+               rctx = 2;
+       else if (in_softirq())
+               rctx = 1;
+       else
+               rctx = 0;
+
+       if (recursion[rctx])
+               return -1;
+
+       recursion[rctx]++;
+       barrier();
+
+       return rctx;
+}
+
+static inline void put_recursion_context(int *recursion, int rctx)
+{
+       barrier();
+       recursion[rctx]--;
+}
+
 #endif /* _KERNEL_EVENTS_INTERNAL_H */
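
A small sketch of the now-shared recursion helpers (the per-CPU array and function here are hypothetical); it mirrors how get_callchain_entry()/put_callchain_entry() bracket their work in callchain.c above:

/* Sketch only: guard a per-context operation against recursion from
 * task, softirq, irq and NMI context (indices 0..3). */
static DEFINE_PER_CPU(int, my_recursion[PERF_NR_CONTEXTS]);

static void my_nmi_safe_op(void)
{
	int rctx = get_recursion_context(__get_cpu_var(my_recursion));

	if (rctx == -1)
		return;			/* already running in this context */

	/* ... the actual per-context work would go here ... */

	put_recursion_context(__get_cpu_var(my_recursion), rctx);
}
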
index e6e01b9..d579a45 100644 (file)
@@ -121,9 +121,9 @@ static void __exit_signal(struct task_struct *tsk)
                 * We won't ever get here for the group leader, since it
                 * will have been the last reference on the signal_struct.
                 */
-               sig->utime = cputime_add(sig->utime, tsk->utime);
-               sig->stime = cputime_add(sig->stime, tsk->stime);
-               sig->gtime = cputime_add(sig->gtime, tsk->gtime);
+               sig->utime += tsk->utime;
+               sig->stime += tsk->stime;
+               sig->gtime += tsk->gtime;
                sig->min_flt += tsk->min_flt;
                sig->maj_flt += tsk->maj_flt;
                sig->nvcsw += tsk->nvcsw;
@@ -1255,19 +1255,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
                spin_lock_irq(&p->real_parent->sighand->siglock);
                psig = p->real_parent->signal;
                sig = p->signal;
-               psig->cutime =
-                       cputime_add(psig->cutime,
-                       cputime_add(tgutime,
-                                   sig->cutime));
-               psig->cstime =
-                       cputime_add(psig->cstime,
-                       cputime_add(tgstime,
-                                   sig->cstime));
-               psig->cgtime =
-                       cputime_add(psig->cgtime,
-                       cputime_add(p->gtime,
-                       cputime_add(sig->gtime,
-                                   sig->cgtime)));
+               psig->cutime += tgutime + sig->cutime;
+               psig->cstime += tgstime + sig->cstime;
+               psig->cgtime += p->gtime + sig->gtime + sig->cgtime;
                psig->cmin_flt +=
                        p->min_flt + sig->min_flt + sig->cmin_flt;
                psig->cmaj_flt +=
index da4a6a1..b058c58 100644 (file)
@@ -1023,8 +1023,8 @@ void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
  */
 static void posix_cpu_timers_init(struct task_struct *tsk)
 {
-       tsk->cputime_expires.prof_exp = cputime_zero;
-       tsk->cputime_expires.virt_exp = cputime_zero;
+       tsk->cputime_expires.prof_exp = 0;
+       tsk->cputime_expires.virt_exp = 0;
        tsk->cputime_expires.sched_exp = 0;
        INIT_LIST_HEAD(&tsk->cpu_timers[0]);
        INIT_LIST_HEAD(&tsk->cpu_timers[1]);
@@ -1132,14 +1132,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
        init_sigpending(&p->pending);
 
-       p->utime = cputime_zero;
-       p->stime = cputime_zero;
-       p->gtime = cputime_zero;
-       p->utimescaled = cputime_zero;
-       p->stimescaled = cputime_zero;
+       p->utime = p->stime = p->gtime = 0;
+       p->utimescaled = p->stimescaled = 0;
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
-       p->prev_utime = cputime_zero;
-       p->prev_stime = cputime_zero;
+       p->prev_utime = p->prev_stime = 0;
 #endif
 #if defined(SPLIT_RSS_COUNTING)
        memset(&p->rss_stat, 0, sizeof(p->rss_stat));
index d802883..22000c3 100644 (file)
@@ -52,22 +52,22 @@ static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
 
        cval = it->expires;
        cinterval = it->incr;
-       if (!cputime_eq(cval, cputime_zero)) {
+       if (cval) {
                struct task_cputime cputime;
                cputime_t t;
 
                thread_group_cputimer(tsk, &cputime);
                if (clock_id == CPUCLOCK_PROF)
-                       t = cputime_add(cputime.utime, cputime.stime);
+                       t = cputime.utime + cputime.stime;
                else
                        /* CPUCLOCK_VIRT */
                        t = cputime.utime;
 
-               if (cputime_le(cval, t))
+               if (cval < t)
                        /* about to fire */
                        cval = cputime_one_jiffy;
                else
-                       cval = cputime_sub(cval, t);
+                       cval = cval - t;
        }
 
        spin_unlock_irq(&tsk->sighand->siglock);
@@ -161,10 +161,9 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
 
        cval = it->expires;
        cinterval = it->incr;
-       if (!cputime_eq(cval, cputime_zero) ||
-           !cputime_eq(nval, cputime_zero)) {
-               if (cputime_gt(nval, cputime_zero))
-                       nval = cputime_add(nval, cputime_one_jiffy);
+       if (cval || nval) {
+               if (nval > 0)
+                       nval += cputime_one_jiffy;
                set_process_cpu_timer(tsk, clock_id, &nval, &cval);
        }
        it->expires = nval;
index 66ff710..30c3c77 100644 (file)
@@ -72,15 +72,46 @@ void jump_label_inc(struct jump_label_key *key)
        jump_label_unlock();
 }
 
-void jump_label_dec(struct jump_label_key *key)
+static void __jump_label_dec(struct jump_label_key *key,
+               unsigned long rate_limit, struct delayed_work *work)
 {
        if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex))
                return;
 
-       jump_label_update(key, JUMP_LABEL_DISABLE);
+       if (rate_limit) {
+               atomic_inc(&key->enabled);
+               schedule_delayed_work(work, rate_limit);
+       } else
+               jump_label_update(key, JUMP_LABEL_DISABLE);
+
        jump_label_unlock();
 }
 
+static void jump_label_update_timeout(struct work_struct *work)
+{
+       struct jump_label_key_deferred *key =
+               container_of(work, struct jump_label_key_deferred, work.work);
+       __jump_label_dec(&key->key, 0, NULL);
+}
+
+void jump_label_dec(struct jump_label_key *key)
+{
+       __jump_label_dec(key, 0, NULL);
+}
+
+void jump_label_dec_deferred(struct jump_label_key_deferred *key)
+{
+       __jump_label_dec(&key->key, key->timeout, &key->work);
+}
+
+
+void jump_label_rate_limit(struct jump_label_key_deferred *key,
+               unsigned long rl)
+{
+       key->timeout = rl;
+       INIT_DELAYED_WORK(&key->work, jump_label_update_timeout);
+}
+
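
To make the new interface concrete (the key and functions below are hypothetical; perf's own use appears elsewhere in this merge as perf_sched_events with jump_label_rate_limit(..., HZ)):

/* Sketch only: a deferred key enables immediately but batches disables,
 * so the text is patched at most once per 'timeout'. */
static struct jump_label_key_deferred my_key;

static void my_init(void)
{
	jump_label_rate_limit(&my_key, HZ);	/* patch at most once per second   */
}

static void my_get(void)
{
	jump_label_inc(&my_key.key);		/* enable takes effect right away  */
}

static void my_put(void)
{
	jump_label_dec_deferred(&my_key);	/* disable is deferred/rate-limited */
}
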
 static int addr_conflict(struct jump_entry *entry, void *start, void *end)
 {
        if (entry->code <= (unsigned long)end &&
@@ -111,7 +142,7 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start,
  * running code can override this to make the non-live update case
  * cheaper.
  */
-void __weak arch_jump_label_transform_static(struct jump_entry *entry,
+void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry *entry,
                                            enum jump_label_type type)
 {
        arch_jump_label_transform(entry, type); 
@@ -217,8 +248,13 @@ void jump_label_apply_nops(struct module *mod)
        if (iter_start == iter_stop)
                return;
 
-       for (iter = iter_start; iter < iter_stop; iter++)
-               arch_jump_label_transform_static(iter, JUMP_LABEL_DISABLE);
+       for (iter = iter_start; iter < iter_stop; iter++) {
+               struct jump_label_key *iterk;
+
+               iterk = (struct jump_label_key *)(unsigned long)iter->key;
+               arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ?
+                               JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE);
+       }
 }
 
 static int jump_label_add_module(struct module *mod)
@@ -258,8 +294,7 @@ static int jump_label_add_module(struct module *mod)
                key->next = jlm;
 
                if (jump_label_enabled(key))
-                       __jump_label_update(key, iter, iter_stop,
-                                           JUMP_LABEL_ENABLE);
+                       __jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE);
        }
 
        return 0;
index b2e08c9..8889f7d 100644 (file)
@@ -431,6 +431,7 @@ unsigned int max_lockdep_depth;
  * about it later on, in lockdep_info().
  */
 static int lockdep_init_error;
+static const char *lock_init_error;
 static unsigned long lockdep_init_trace_data[20];
 static struct stack_trace lockdep_init_trace = {
        .max_entries = ARRAY_SIZE(lockdep_init_trace_data),
@@ -499,36 +500,32 @@ void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS])
        usage[i] = '\0';
 }
 
-static int __print_lock_name(struct lock_class *class)
+static void __print_lock_name(struct lock_class *class)
 {
        char str[KSYM_NAME_LEN];
        const char *name;
 
-       name = class->name;
-       if (!name)
-               name = __get_key_name(class->key, str);
-
-       return printk("%s", name);
-}
-
-static void print_lock_name(struct lock_class *class)
-{
-       char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS];
-       const char *name;
-
-       get_usage_chars(class, usage);
-
        name = class->name;
        if (!name) {
                name = __get_key_name(class->key, str);
-               printk(" (%s", name);
+               printk("%s", name);
        } else {
-               printk(" (%s", name);
+               printk("%s", name);
                if (class->name_version > 1)
                        printk("#%d", class->name_version);
                if (class->subclass)
                        printk("/%d", class->subclass);
        }
+}
+
+static void print_lock_name(struct lock_class *class)
+{
+       char usage[LOCK_USAGE_CHARS];
+
+       get_usage_chars(class, usage);
+
+       printk(" (");
+       __print_lock_name(class);
        printk("){%s}", usage);
 }
 
@@ -568,11 +565,12 @@ static void lockdep_print_held_locks(struct task_struct *curr)
        }
 }
 
-static void print_kernel_version(void)
+static void print_kernel_ident(void)
 {
-       printk("%s %.*s\n", init_utsname()->release,
+       printk("%s %.*s %s\n", init_utsname()->release,
                (int)strcspn(init_utsname()->version, " "),
-               init_utsname()->version);
+               init_utsname()->version,
+               print_tainted());
 }
 
 static int very_verbose(struct lock_class *class)
@@ -656,6 +654,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
        if (unlikely(!lockdep_initialized)) {
                lockdep_init();
                lockdep_init_error = 1;
+               lock_init_error = lock->name;
                save_stack_trace(&lockdep_init_trace);
        }
 #endif
@@ -723,7 +722,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 
        class = look_up_lock_class(lock, subclass);
        if (likely(class))
-               return class;
+               goto out_set_class_cache;
 
        /*
         * Debug-check: all keys must be persistent!
@@ -808,6 +807,7 @@ out_unlock_set:
        graph_unlock();
        raw_local_irq_restore(flags);
 
+out_set_class_cache:
        if (!subclass || force)
                lock->class_cache[0] = class;
        else if (subclass < NR_LOCKDEP_CACHING_CLASSES)
@@ -1149,7 +1149,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
        printk("\n");
        printk("======================================================\n");
        printk("[ INFO: possible circular locking dependency detected ]\n");
-       print_kernel_version();
+       print_kernel_ident();
        printk("-------------------------------------------------------\n");
        printk("%s/%d is trying to acquire lock:\n",
                curr->comm, task_pid_nr(curr));
@@ -1488,7 +1488,7 @@ print_bad_irq_dependency(struct task_struct *curr,
        printk("======================================================\n");
        printk("[ INFO: %s-safe -> %s-unsafe lock order detected ]\n",
                irqclass, irqclass);
-       print_kernel_version();
+       print_kernel_ident();
        printk("------------------------------------------------------\n");
        printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
                curr->comm, task_pid_nr(curr),
@@ -1717,7 +1717,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
        printk("\n");
        printk("=============================================\n");
        printk("[ INFO: possible recursive locking detected ]\n");
-       print_kernel_version();
+       print_kernel_ident();
        printk("---------------------------------------------\n");
        printk("%s/%d is trying to acquire lock:\n",
                curr->comm, task_pid_nr(curr));
@@ -2224,7 +2224,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
        printk("\n");
        printk("=================================\n");
        printk("[ INFO: inconsistent lock state ]\n");
-       print_kernel_version();
+       print_kernel_ident();
        printk("---------------------------------\n");
 
        printk("inconsistent {%s} -> {%s} usage.\n",
@@ -2289,7 +2289,7 @@ print_irq_inversion_bug(struct task_struct *curr,
        printk("\n");
        printk("=========================================================\n");
        printk("[ INFO: possible irq lock inversion dependency detected ]\n");
-       print_kernel_version();
+       print_kernel_ident();
        printk("---------------------------------------------------------\n");
        printk("%s/%d just changed the state of lock:\n",
                curr->comm, task_pid_nr(curr));
@@ -3175,6 +3175,7 @@ print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
        printk("\n");
        printk("=====================================\n");
        printk("[ BUG: bad unlock balance detected! ]\n");
+       print_kernel_ident();
        printk("-------------------------------------\n");
        printk("%s/%d is trying to release lock (",
                curr->comm, task_pid_nr(curr));
@@ -3619,6 +3620,7 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
        printk("\n");
        printk("=================================\n");
        printk("[ BUG: bad contention detected! ]\n");
+       print_kernel_ident();
        printk("---------------------------------\n");
        printk("%s/%d is trying to contend lock (",
                curr->comm, task_pid_nr(curr));
@@ -3974,7 +3976,8 @@ void __init lockdep_info(void)
 
 #ifdef CONFIG_DEBUG_LOCKDEP
        if (lockdep_init_error) {
-               printk("WARNING: lockdep init error! Arch code didn't call lockdep_init() early enough?\n");
+               printk("WARNING: lockdep init error! lock-%s was acquired"
+                       " before lockdep_init\n", lock_init_error);
                printk("Call stack leading to lockdep invocation was:\n");
                print_stack_trace(&lockdep_init_trace, 0);
        }
@@ -3993,6 +3996,7 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
        printk("\n");
        printk("=========================\n");
        printk("[ BUG: held lock freed! ]\n");
+       print_kernel_ident();
        printk("-------------------------\n");
        printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n",
                curr->comm, task_pid_nr(curr), mem_from, mem_to-1);
@@ -4050,6 +4054,7 @@ static void print_held_locks_bug(struct task_struct *curr)
        printk("\n");
        printk("=====================================\n");
        printk("[ BUG: lock held at task exit time! ]\n");
+       print_kernel_ident();
        printk("-------------------------------------\n");
        printk("%s/%d is exiting with locks still held!\n",
                curr->comm, task_pid_nr(curr));
@@ -4147,6 +4152,7 @@ void lockdep_sys_exit(void)
                printk("\n");
                printk("================================================\n");
                printk("[ BUG: lock held when returning to user space! ]\n");
+               print_kernel_ident();
                printk("------------------------------------------------\n");
                printk("%s/%d is leaving the kernel with locks still held!\n",
                                curr->comm, curr->pid);
@@ -4166,10 +4172,33 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
        printk("\n");
        printk("===============================\n");
        printk("[ INFO: suspicious RCU usage. ]\n");
+       print_kernel_ident();
        printk("-------------------------------\n");
        printk("%s:%d %s!\n", file, line, s);
        printk("\nother info that might help us debug this:\n\n");
        printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks);
+
+       /*
+        * If a CPU is in the RCU-free window in idle (i.e., in the section
+        * between rcu_idle_enter() and rcu_idle_exit()), then RCU
+        * considers that CPU to be in an "extended quiescent state",
+        * which means that RCU will be completely ignoring that CPU.
+        * Therefore, rcu_read_lock() and friends have absolutely no
+        * effect on a CPU running in that state. In other words, even if
+        * such an RCU-idle CPU has called rcu_read_lock(), RCU might well
+        * delete data structures out from under it.  RCU really has no
+        * choice here: we need to keep an RCU-free window in idle where
+        * the CPU may possibly enter into low power mode. This way we can
+        * the CPU may possibly enter into low power mode. This way we can
+        * report an extended quiescent state to other CPUs that have started
+        * a grace period. Otherwise we would delay every grace period for as
+        * long as we run in the idle task.
+        * So complain bitterly if someone does call rcu_read_lock(),
+        * rcu_read_lock_bh() and so on from extended quiescent states.
+        */
+       if (rcu_is_cpu_idle())
+               printk("RCU used illegally from extended quiescent state!\n");
+
        lockdep_print_held_locks(curr);
        printk("\nstack backtrace:\n");
        dump_stack();
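
Illustrative only (idle-loop internals simplified): the pattern the new rcu_is_cpu_idle() check calls out is RCU usage from inside the extended quiescent state, roughly:

/* Sketch of the misuse being flagged, not real idle-loop code. */
rcu_idle_enter();		/* CPU enters RCU's extended quiescent state  */
rcu_read_lock();		/* WRONG: RCU is ignoring this CPU here...    */
/* ... any dereference here is unprotected ...                             */
rcu_read_unlock();		/* ...so lockdep_rcu_suspicious() now warns   */
rcu_idle_exit();
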
index b265936..3458469 100644 (file)
@@ -237,11 +237,20 @@ void add_taint(unsigned flag)
         * Can't trust the integrity of the kernel anymore.
         * We don't call directly debug_locks_off() because the issue
         * is not necessarily serious enough to set oops_in_progress to 1
-        * Also we want to keep up lockdep for staging development and
-        * post-warning case.
+        * Also we want to keep lockdep enabled for staging/out-of-tree
+        * development and the post-warning case.
         */
-       if (flag != TAINT_CRAP && flag != TAINT_WARN && __debug_locks_off())
-               printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n");
+       switch (flag) {
+       case TAINT_CRAP:
+       case TAINT_OOT_MODULE:
+       case TAINT_WARN:
+       case TAINT_FIRMWARE_WORKAROUND:
+               break;
+
+       default:
+               if (__debug_locks_off())
+                       printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n");
+       }
 
        set_bit(flag, &tainted_mask);
 }
index e7cb76d..125cb67 100644 (file)
@@ -78,7 +78,7 @@ static inline int cpu_time_before(const clockid_t which_clock,
        if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
                return now.sched < then.sched;
        }  else {
-               return cputime_lt(now.cpu, then.cpu);
+               return now.cpu < then.cpu;
        }
 }
 static inline void cpu_time_add(const clockid_t which_clock,
@@ -88,7 +88,7 @@ static inline void cpu_time_add(const clockid_t which_clock,
        if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
                acc->sched += val.sched;
        }  else {
-               acc->cpu = cputime_add(acc->cpu, val.cpu);
+               acc->cpu += val.cpu;
        }
 }
 static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,
@@ -98,24 +98,11 @@ static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,
        if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
                a.sched -= b.sched;
        }  else {
-               a.cpu = cputime_sub(a.cpu, b.cpu);
+               a.cpu -= b.cpu;
        }
        return a;
 }
 
-/*
- * Divide and limit the result to res >= 1
- *
- * This is necessary to prevent signal delivery starvation, when the result of
- * the division would be rounded down to 0.
- */
-static inline cputime_t cputime_div_non_zero(cputime_t time, unsigned long div)
-{
-       cputime_t res = cputime_div(time, div);
-
-       return max_t(cputime_t, res, 1);
-}
-
 /*
  * Update expiry time from increment, and increase overrun count,
  * given the current clock sample.
@@ -148,28 +135,26 @@ static void bump_cpu_timer(struct k_itimer *timer,
        } else {
                cputime_t delta, incr;
 
-               if (cputime_lt(now.cpu, timer->it.cpu.expires.cpu))
+               if (now.cpu < timer->it.cpu.expires.cpu)
                        return;
                incr = timer->it.cpu.incr.cpu;
-               delta = cputime_sub(cputime_add(now.cpu, incr),
-                                   timer->it.cpu.expires.cpu);
+               delta = now.cpu + incr - timer->it.cpu.expires.cpu;
                /* Don't use (incr*2 < delta), incr*2 might overflow. */
-               for (i = 0; cputime_lt(incr, cputime_sub(delta, incr)); i++)
-                            incr = cputime_add(incr, incr);
-               for (; i >= 0; incr = cputime_halve(incr), i--) {
-                       if (cputime_lt(delta, incr))
+               for (i = 0; incr < delta - incr; i++)
+                            incr += incr;
+               for (; i >= 0; incr = incr >> 1, i--) {
+                       if (delta < incr)
                                continue;
-                       timer->it.cpu.expires.cpu =
-                               cputime_add(timer->it.cpu.expires.cpu, incr);
+                       timer->it.cpu.expires.cpu += incr;
                        timer->it_overrun += 1 << i;
-                       delta = cputime_sub(delta, incr);
+                       delta -= incr;
                }
        }
 }
 
 static inline cputime_t prof_ticks(struct task_struct *p)
 {
-       return cputime_add(p->utime, p->stime);
+       return p->utime + p->stime;
 }
 static inline cputime_t virt_ticks(struct task_struct *p)
 {
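
With the cputime accessor macros gone, the rewritten overrun loop in bump_cpu_timer() above is easier to see for what it is: it advances the expiry past "now" in whole increments, but counts those increments in O(log n) steps by first doubling the step size (the "incr < delta - incr" test avoids the overflow a naive "incr * 2 < delta" comparison could hit) and then peeling off power-of-two multiples of the period, largest first. A stand-alone model of just that computation, with invented names and a tiny driver, assuming nothing beyond standard C:

#include <stdint.h>
#include <stdio.h>

/* Advance *expires past "now" in whole "incr" steps; return how many steps. */
static uint64_t bump(uint64_t now, uint64_t incr, uint64_t *expires)
{
        uint64_t delta, step = incr, overruns = 0;
        int i;

        if (now < *expires)
                return 0;
        delta = now + incr - *expires;
        /* Double until step covers more than half of delta (overflow-safe). */
        for (i = 0; step < delta - step; i++)
                step += step;
        /* Peel off power-of-two multiples of incr, largest first. */
        for (; i >= 0; step >>= 1, i--) {
                if (delta < step)
                        continue;
                *expires += step;
                overruns += 1ULL << i;
                delta -= step;
        }
        return overruns;
}

int main(void)
{
        uint64_t expires = 100;
        uint64_t n = bump(130, 7, &expires);    /* period 7, first expiry 100, now 130 */

        /* Prints "overruns=5 new expires=135": 135 is the first expiry after 130. */
        printf("overruns=%llu new expires=%llu\n",
               (unsigned long long)n, (unsigned long long)expires);
        return 0;
}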
@@ -248,8 +233,8 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
 
        t = tsk;
        do {
-               times->utime = cputime_add(times->utime, t->utime);
-               times->stime = cputime_add(times->stime, t->stime);
+               times->utime += t->utime;
+               times->stime += t->stime;
                times->sum_exec_runtime += task_sched_runtime(t);
        } while_each_thread(tsk, t);
 out:
@@ -258,10 +243,10 @@ out:
 
 static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
 {
-       if (cputime_gt(b->utime, a->utime))
+       if (b->utime > a->utime)
                a->utime = b->utime;
 
-       if (cputime_gt(b->stime, a->stime))
+       if (b->stime > a->stime)
                a->stime = b->stime;
 
        if (b->sum_exec_runtime > a->sum_exec_runtime)
@@ -306,7 +291,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
                return -EINVAL;
        case CPUCLOCK_PROF:
                thread_group_cputime(p, &cputime);
-               cpu->cpu = cputime_add(cputime.utime, cputime.stime);
+               cpu->cpu = cputime.utime + cputime.stime;
                break;
        case CPUCLOCK_VIRT:
                thread_group_cputime(p, &cputime);
@@ -470,26 +455,24 @@ static void cleanup_timers(struct list_head *head,
                           unsigned long long sum_exec_runtime)
 {
        struct cpu_timer_list *timer, *next;
-       cputime_t ptime = cputime_add(utime, stime);
+       cputime_t ptime = utime + stime;
 
        list_for_each_entry_safe(timer, next, head, entry) {
                list_del_init(&timer->entry);
-               if (cputime_lt(timer->expires.cpu, ptime)) {
-                       timer->expires.cpu = cputime_zero;
+               if (timer->expires.cpu < ptime) {
+                       timer->expires.cpu = 0;
                } else {
-                       timer->expires.cpu = cputime_sub(timer->expires.cpu,
-                                                        ptime);
+                       timer->expires.cpu -= ptime;
                }
        }
 
        ++head;
        list_for_each_entry_safe(timer, next, head, entry) {
                list_del_init(&timer->entry);
-               if (cputime_lt(timer->expires.cpu, utime)) {
-                       timer->expires.cpu = cputime_zero;
+               if (timer->expires.cpu < utime) {
+                       timer->expires.cpu = 0;
                } else {
-                       timer->expires.cpu = cputime_sub(timer->expires.cpu,
-                                                        utime);
+                       timer->expires.cpu -= utime;
                }
        }
 
@@ -520,8 +503,7 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk)
        struct signal_struct *const sig = tsk->signal;
 
        cleanup_timers(tsk->signal->cpu_timers,
-                      cputime_add(tsk->utime, sig->utime),
-                      cputime_add(tsk->stime, sig->stime),
+                      tsk->utime + sig->utime, tsk->stime + sig->stime,
                       tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
 }
 
@@ -540,8 +522,7 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
 
 static inline int expires_gt(cputime_t expires, cputime_t new_exp)
 {
-       return cputime_eq(expires, cputime_zero) ||
-              cputime_gt(expires, new_exp);
+       return expires == 0 || expires > new_exp;
 }
 
 /*
@@ -651,7 +632,7 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
        default:
                return -EINVAL;
        case CPUCLOCK_PROF:
-               cpu->cpu = cputime_add(cputime.utime, cputime.stime);
+               cpu->cpu = cputime.utime + cputime.stime;
                break;
        case CPUCLOCK_VIRT:
                cpu->cpu = cputime.utime;
@@ -918,12 +899,12 @@ static void check_thread_timers(struct task_struct *tsk,
        unsigned long soft;
 
        maxfire = 20;
-       tsk->cputime_expires.prof_exp = cputime_zero;
+       tsk->cputime_expires.prof_exp = 0;
        while (!list_empty(timers)) {
                struct cpu_timer_list *t = list_first_entry(timers,
                                                      struct cpu_timer_list,
                                                      entry);
-               if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) {
+               if (!--maxfire || prof_ticks(tsk) < t->expires.cpu) {
                        tsk->cputime_expires.prof_exp = t->expires.cpu;
                        break;
                }
@@ -933,12 +914,12 @@ static void check_thread_timers(struct task_struct *tsk,
 
        ++timers;
        maxfire = 20;
-       tsk->cputime_expires.virt_exp = cputime_zero;
+       tsk->cputime_expires.virt_exp = 0;
        while (!list_empty(timers)) {
                struct cpu_timer_list *t = list_first_entry(timers,
                                                      struct cpu_timer_list,
                                                      entry);
-               if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) {
+               if (!--maxfire || virt_ticks(tsk) < t->expires.cpu) {
                        tsk->cputime_expires.virt_exp = t->expires.cpu;
                        break;
                }
@@ -1009,20 +990,19 @@ static u32 onecputick;
 static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
                             cputime_t *expires, cputime_t cur_time, int signo)
 {
-       if (cputime_eq(it->expires, cputime_zero))
+       if (!it->expires)
                return;
 
-       if (cputime_ge(cur_time, it->expires)) {
-               if (!cputime_eq(it->incr, cputime_zero)) {
-                       it->expires = cputime_add(it->expires, it->incr);
+       if (cur_time >= it->expires) {
+               if (it->incr) {
+                       it->expires += it->incr;
                        it->error += it->incr_error;
                        if (it->error >= onecputick) {
-                               it->expires = cputime_sub(it->expires,
-                                                         cputime_one_jiffy);
+                               it->expires -= cputime_one_jiffy;
                                it->error -= onecputick;
                        }
                } else {
-                       it->expires = cputime_zero;
+                       it->expires = 0;
                }
 
                trace_itimer_expire(signo == SIGPROF ?
@@ -1031,9 +1011,7 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
                __group_send_sig_info(signo, SEND_SIG_PRIV, tsk);
        }
 
-       if (!cputime_eq(it->expires, cputime_zero) &&
-           (cputime_eq(*expires, cputime_zero) ||
-            cputime_lt(it->expires, *expires))) {
+       if (it->expires && (!*expires || it->expires < *expires)) {
                *expires = it->expires;
        }
 }
@@ -1048,9 +1026,7 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
  */
 static inline int task_cputime_zero(const struct task_cputime *cputime)
 {
-       if (cputime_eq(cputime->utime, cputime_zero) &&
-           cputime_eq(cputime->stime, cputime_zero) &&
-           cputime->sum_exec_runtime == 0)
+       if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime)
                return 1;
        return 0;
 }
@@ -1076,15 +1052,15 @@ static void check_process_timers(struct task_struct *tsk,
         */
        thread_group_cputimer(tsk, &cputime);
        utime = cputime.utime;
-       ptime = cputime_add(utime, cputime.stime);
+       ptime = utime + cputime.stime;
        sum_sched_runtime = cputime.sum_exec_runtime;
        maxfire = 20;
-       prof_expires = cputime_zero;
+       prof_expires = 0;
        while (!list_empty(timers)) {
                struct cpu_timer_list *tl = list_first_entry(timers,
                                                      struct cpu_timer_list,
                                                      entry);
-               if (!--maxfire || cputime_lt(ptime, tl->expires.cpu)) {
+               if (!--maxfire || ptime < tl->expires.cpu) {
                        prof_expires = tl->expires.cpu;
                        break;
                }
@@ -1094,12 +1070,12 @@ static void check_process_timers(struct task_struct *tsk,
 
        ++timers;
        maxfire = 20;
-       virt_expires = cputime_zero;
+       virt_expires = 0;
        while (!list_empty(timers)) {
                struct cpu_timer_list *tl = list_first_entry(timers,
                                                      struct cpu_timer_list,
                                                      entry);
-               if (!--maxfire || cputime_lt(utime, tl->expires.cpu)) {
+               if (!--maxfire || utime < tl->expires.cpu) {
                        virt_expires = tl->expires.cpu;
                        break;
                }
@@ -1154,8 +1130,7 @@ static void check_process_timers(struct task_struct *tsk,
                        }
                }
                x = secs_to_cputime(soft);
-               if (cputime_eq(prof_expires, cputime_zero) ||
-                   cputime_lt(x, prof_expires)) {
+               if (!prof_expires || x < prof_expires) {
                        prof_expires = x;
                }
        }
@@ -1249,12 +1224,9 @@ out:
 static inline int task_cputime_expired(const struct task_cputime *sample,
                                        const struct task_cputime *expires)
 {
-       if (!cputime_eq(expires->utime, cputime_zero) &&
-           cputime_ge(sample->utime, expires->utime))
+       if (expires->utime && sample->utime >= expires->utime)
                return 1;
-       if (!cputime_eq(expires->stime, cputime_zero) &&
-           cputime_ge(cputime_add(sample->utime, sample->stime),
-                      expires->stime))
+       if (expires->stime && sample->utime + sample->stime >= expires->stime)
                return 1;
        if (expires->sum_exec_runtime != 0 &&
            sample->sum_exec_runtime >= expires->sum_exec_runtime)
@@ -1389,18 +1361,18 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
                 * it to be relative, *newval argument is relative and we update
                 * it to be absolute.
                 */
-               if (!cputime_eq(*oldval, cputime_zero)) {
-                       if (cputime_le(*oldval, now.cpu)) {
+               if (*oldval) {
+                       if (*oldval <= now.cpu) {
                                /* Just about to fire. */
                                *oldval = cputime_one_jiffy;
                        } else {
-                               *oldval = cputime_sub(*oldval, now.cpu);
+                               *oldval -= now.cpu;
                        }
                }
 
-               if (cputime_eq(*newval, cputime_zero))
+               if (!*newval)
                        return;
-               *newval = cputime_add(*newval, now.cpu);
+               *newval += now.cpu;
        }
 
        /*
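
All of the posix-cpu-timers.c hunks above are one mechanical change, evidently part of a wider cputime_t cleanup: cputime_t arithmetic is now done with ordinary integer operators, so cputime_add(), cputime_sub(), cputime_lt(), cputime_le(), cputime_gt(), cputime_ge(), cputime_eq() and cputime_halve() collapse into plain +, -, <, <=, >, >=, == and >> 1, and the cputime_zero constant becomes a literal 0. The cputime_div_non_zero() helper is dropped outright rather than converted, apparently because nothing left in this file uses it.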
index 7982a0a..989e4a5 100644 (file)
@@ -199,7 +199,7 @@ void __init setup_log_buf(int early)
                unsigned long mem;
 
                mem = memblock_alloc(new_log_buf_len, PAGE_SIZE);
-               if (mem == MEMBLOCK_ERROR)
+               if (!mem)
                        return;
                new_log_buf = __va(mem);
        } else {
@@ -688,6 +688,7 @@ static void zap_locks(void)
 
        oops_timestamp = jiffies;
 
+       debug_locks_off();
        /* If a crash is occurring, make sure we can't deadlock */
        raw_spin_lock_init(&logbuf_lock);
        /* And make sure that we print immediately */
@@ -840,9 +841,8 @@ asmlinkage int vprintk(const char *fmt, va_list args)
        boot_delay_msec();
        printk_delay();
 
-       preempt_disable();
        /* This stops the holder of console_sem just where we want him */
-       raw_local_irq_save(flags);
+       local_irq_save(flags);
        this_cpu = smp_processor_id();
 
        /*
@@ -856,7 +856,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
                 * recursion and return - but flag the recursion so that
                 * it can be printed at the next appropriate moment:
                 */
-               if (!oops_in_progress) {
+               if (!oops_in_progress && !lockdep_recursing(current)) {
                        recursion_bug = 1;
                        goto out_restore_irqs;
                }
@@ -962,9 +962,8 @@ asmlinkage int vprintk(const char *fmt, va_list args)
 
        lockdep_on();
 out_restore_irqs:
-       raw_local_irq_restore(flags);
+       local_irq_restore(flags);
 
-       preempt_enable();
        return printed_len;
 }
 EXPORT_SYMBOL(printk);
index f600868..aa88baa 100644 (file)
 #define RCU_TRACE(stmt)
 #endif /* #else #ifdef CONFIG_RCU_TRACE */
 
+/*
+ * Process-level increment to ->dynticks_nesting field.  This allows for
+ * architectures that use half-interrupts and half-exceptions from
+ * process context.
+ */
+#define DYNTICK_TASK_NESTING (LLONG_MAX / 2 - 1)
+
 /*
  * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally
  * by call_rcu() and rcu callback execution, and are therefore not part of the
index c5b98e5..2bc4e13 100644 (file)
@@ -93,6 +93,8 @@ int rcu_read_lock_bh_held(void)
 {
        if (!debug_lockdep_rcu_enabled())
                return 1;
+       if (rcu_is_cpu_idle())
+               return 0;
        return in_softirq() || irqs_disabled();
 }
 EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
@@ -316,3 +318,13 @@ struct debug_obj_descr rcuhead_debug_descr = {
 };
 EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
 #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
+
+#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE)
+void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp)
+{
+       trace_rcu_torture_read(rcutorturename, rhp);
+}
+EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
+#else
+#define do_trace_rcu_torture_read(rcutorturename, rhp) do { } while (0)
+#endif
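
The do_trace_rcu_torture_read() wrapper and its EXPORT_SYMBOL_GPL() above give the rcutorture reader paths further down an out-of-line way to fire the trace_rcu_torture_read tracepoint; presumably the export exists because rcutorture is normally built as a module and cannot reach the tracepoint's internals directly. When none of the listed configurations provide the tracepoint, the #else branch turns the call into a do-nothing macro within this file.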
index 636af6d..977296d 100644 (file)
@@ -53,31 +53,137 @@ static void __call_rcu(struct rcu_head *head,
 
 #include "rcutiny_plugin.h"
 
-#ifdef CONFIG_NO_HZ
+static long long rcu_dynticks_nesting = DYNTICK_TASK_NESTING;
 
-static long rcu_dynticks_nesting = 1;
+/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */
+static void rcu_idle_enter_common(long long oldval)
+{
+       if (rcu_dynticks_nesting) {
+               RCU_TRACE(trace_rcu_dyntick("--=",
+                                           oldval, rcu_dynticks_nesting));
+               return;
+       }
+       RCU_TRACE(trace_rcu_dyntick("Start", oldval, rcu_dynticks_nesting));
+       if (!is_idle_task(current)) {
+               struct task_struct *idle = idle_task(smp_processor_id());
+
+               RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task",
+                                           oldval, rcu_dynticks_nesting));
+               ftrace_dump(DUMP_ALL);
+               WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
+                         current->pid, current->comm,
+                         idle->pid, idle->comm); /* must be idle task! */
+       }
+       rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */
+}
 
 /*
- * Enter dynticks-idle mode, which is an extended quiescent state
- * if we have fully entered that mode (i.e., if the new value of
- * dynticks_nesting is zero).
+ * Enter idle, which is an extended quiescent state if we have fully
+ * entered that mode (i.e., if the new value of dynticks_nesting is zero).
  */
-void rcu_enter_nohz(void)
+void rcu_idle_enter(void)
 {
-       if (--rcu_dynticks_nesting == 0)
-               rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */
+       unsigned long flags;
+       long long oldval;
+
+       local_irq_save(flags);
+       oldval = rcu_dynticks_nesting;
+       rcu_dynticks_nesting = 0;
+       rcu_idle_enter_common(oldval);
+       local_irq_restore(flags);
 }
 
 /*
- * Exit dynticks-idle mode, so that we are no longer in an extended
- * quiescent state.
+ * Exit an interrupt handler towards idle.
  */
-void rcu_exit_nohz(void)
+void rcu_irq_exit(void)
+{
+       unsigned long flags;
+       long long oldval;
+
+       local_irq_save(flags);
+       oldval = rcu_dynticks_nesting;
+       rcu_dynticks_nesting--;
+       WARN_ON_ONCE(rcu_dynticks_nesting < 0);
+       rcu_idle_enter_common(oldval);
+       local_irq_restore(flags);
+}
+
+/* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcutree.c. */
+static void rcu_idle_exit_common(long long oldval)
 {
+       if (oldval) {
+               RCU_TRACE(trace_rcu_dyntick("++=",
+                                           oldval, rcu_dynticks_nesting));
+               return;
+       }
+       RCU_TRACE(trace_rcu_dyntick("End", oldval, rcu_dynticks_nesting));
+       if (!is_idle_task(current)) {
+               struct task_struct *idle = idle_task(smp_processor_id());
+
+               RCU_TRACE(trace_rcu_dyntick("Error on exit: not idle task",
+                         oldval, rcu_dynticks_nesting));
+               ftrace_dump(DUMP_ALL);
+               WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
+                         current->pid, current->comm,
+                         idle->pid, idle->comm); /* must be idle task! */
+       }
+}
+
+/*
+ * Exit idle, so that we are no longer in an extended quiescent state.
+ */
+void rcu_idle_exit(void)
+{
+       unsigned long flags;
+       long long oldval;
+
+       local_irq_save(flags);
+       oldval = rcu_dynticks_nesting;
+       WARN_ON_ONCE(oldval != 0);
+       rcu_dynticks_nesting = DYNTICK_TASK_NESTING;
+       rcu_idle_exit_common(oldval);
+       local_irq_restore(flags);
+}
+
+/*
+ * Enter an interrupt handler, moving away from idle.
+ */
+void rcu_irq_enter(void)
+{
+       unsigned long flags;
+       long long oldval;
+
+       local_irq_save(flags);
+       oldval = rcu_dynticks_nesting;
        rcu_dynticks_nesting++;
+       WARN_ON_ONCE(rcu_dynticks_nesting == 0);
+       rcu_idle_exit_common(oldval);
+       local_irq_restore(flags);
+}
+
+#ifdef CONFIG_PROVE_RCU
+
+/*
+ * Test whether RCU thinks that the current CPU is idle.
+ */
+int rcu_is_cpu_idle(void)
+{
+       return !rcu_dynticks_nesting;
 }
+EXPORT_SYMBOL(rcu_is_cpu_idle);
+
+#endif /* #ifdef CONFIG_PROVE_RCU */
 
-#endif /* #ifdef CONFIG_NO_HZ */
+/*
+ * Test whether the current CPU was interrupted from idle.  Nested
+ * interrupts don't count; we must be running at the first interrupt
+ * level.
+ */
+int rcu_is_cpu_rrupt_from_idle(void)
+{
+       return rcu_dynticks_nesting <= 0;
+}
 
 /*
  * Helper function for rcu_sched_qs() and rcu_bh_qs().
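
Taken together, the rcutiny hunks above replace the old CONFIG_NO_HZ-only nesting count with a single long long that records who is keeping RCU out of its extended quiescent state: running in process context is worth the huge DYNTICK_TASK_NESTING value, each interrupt level is worth 1, and zero means the CPU is idle as far as RCU is concerned. A toy model of that bookkeeping (plain C; the helper names are ours, each mirroring the function named in its comment):

#include <limits.h>
#include <stdio.h>

#define DYNTICK_TASK_NESTING (LLONG_MAX / 2 - 1)

static long long nesting = DYNTICK_TASK_NESTING;  /* boots in process context */

static void model_idle_enter(void) { nesting = 0; }                    /* rcu_idle_enter() */
static void model_idle_exit(void)  { nesting = DYNTICK_TASK_NESTING; } /* rcu_idle_exit()  */
static void model_irq_enter(void)  { nesting++; }                      /* rcu_irq_enter()  */
static void model_irq_exit(void)   { nesting--; }                      /* rcu_irq_exit()   */
static int  model_cpu_is_idle(void) { return nesting == 0; }           /* rcu_is_cpu_idle() */

int main(void)
{
        printf("%d\n", model_cpu_is_idle());   /* 0: process context          */
        model_idle_enter();                    /* idle loop: enter the EQS    */
        printf("%d\n", model_cpu_is_idle());   /* 1                           */
        model_irq_enter();                     /* tick interrupt arrives      */
        printf("%d\n", model_cpu_is_idle());   /* 0: an irq is not an EQS     */
        model_irq_exit();                      /* back to the idle loop       */
        printf("%d\n", model_cpu_is_idle());   /* 1                           */
        model_idle_exit();                     /* scheduler picks a real task */
        printf("%d\n", model_cpu_is_idle());   /* 0                           */
        return 0;
}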
@@ -126,14 +232,13 @@ void rcu_bh_qs(int cpu)
 
 /*
  * Check to see if the scheduling-clock interrupt came from an extended
- * quiescent state, and, if so, tell RCU about it.
+ * quiescent state, and, if so, tell RCU about it.  This function must
+ * be called from hardirq context.  It is normally called from the
+ * scheduling-clock interrupt.
  */
 void rcu_check_callbacks(int cpu, int user)
 {
-       if (user ||
-           (idle_cpu(cpu) &&
-            !in_softirq() &&
-            hardirq_count() <= (1 << HARDIRQ_SHIFT)))
+       if (user || rcu_is_cpu_rrupt_from_idle())
                rcu_sched_qs(cpu);
        else if (!in_softirq())
                rcu_bh_qs(cpu);
@@ -154,7 +259,11 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
        /* If no RCU callbacks ready to invoke, just return. */
        if (&rcp->rcucblist == rcp->donetail) {
                RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1));
-               RCU_TRACE(trace_rcu_batch_end(rcp->name, 0));
+               RCU_TRACE(trace_rcu_batch_end(rcp->name, 0,
+                                             ACCESS_ONCE(rcp->rcucblist),
+                                             need_resched(),
+                                             is_idle_task(current),
+                                             rcu_is_callbacks_kthread()));
                return;
        }
 
@@ -183,7 +292,9 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
                RCU_TRACE(cb_count++);
        }
        RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
-       RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count));
+       RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(),
+                                     is_idle_task(current),
+                                     rcu_is_callbacks_kthread()));
 }
 
 static void rcu_process_callbacks(struct softirq_action *unused)
index 2b0484a..9cb1ae4 100644 (file)
@@ -312,8 +312,8 @@ static int rcu_boost(void)
        rt_mutex_lock(&mtx);
        rt_mutex_unlock(&mtx);  /* Keep lockdep happy. */
 
-       return rcu_preempt_ctrlblk.boost_tasks != NULL ||
-              rcu_preempt_ctrlblk.exp_tasks != NULL;
+       return ACCESS_ONCE(rcu_preempt_ctrlblk.boost_tasks) != NULL ||
+              ACCESS_ONCE(rcu_preempt_ctrlblk.exp_tasks) != NULL;
 }
 
 /*
@@ -885,6 +885,19 @@ static void invoke_rcu_callbacks(void)
        wake_up(&rcu_kthread_wq);
 }
 
+#ifdef CONFIG_RCU_TRACE
+
+/*
+ * Is the current CPU running the RCU-callbacks kthread?
+ * Caller must have preemption disabled.
+ */
+static bool rcu_is_callbacks_kthread(void)
+{
+       return rcu_kthread_task == current;
+}
+
+#endif /* #ifdef CONFIG_RCU_TRACE */
+
 /*
  * This kthread invokes RCU callbacks whose grace periods have
  * elapsed.  It is awakened as needed, and takes the place of the
@@ -938,6 +951,18 @@ void invoke_rcu_callbacks(void)
        raise_softirq(RCU_SOFTIRQ);
 }
 
+#ifdef CONFIG_RCU_TRACE
+
+/*
+ * There is no callback kthread, so this thread is never it.
+ */
+static bool rcu_is_callbacks_kthread(void)
+{
+       return false;
+}
+
+#endif /* #ifdef CONFIG_RCU_TRACE */
+
 void rcu_init(void)
 {
        open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
index 764825c..88f17b8 100644 (file)
@@ -61,9 +61,11 @@ static int test_no_idle_hz;  /* Test RCU's support for tickless idle CPUs. */
 static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/
 static int stutter = 5;                /* Start/stop testing interval (in sec) */
 static int irqreader = 1;      /* RCU readers from irq (timers). */
-static int fqs_duration = 0;   /* Duration of bursts (us), 0 to disable. */
-static int fqs_holdoff = 0;    /* Hold time within burst (us). */
+static int fqs_duration;       /* Duration of bursts (us), 0 to disable. */
+static int fqs_holdoff;                /* Hold time within burst (us). */
 static int fqs_stutter = 3;    /* Wait time between bursts (s). */
+static int onoff_interval;     /* Wait time between CPU hotplugs, 0=disable. */
+static int shutdown_secs;      /* Shutdown time (s).  <=0 for no shutdown. */
 static int test_boost = 1;     /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */
 static int test_boost_interval = 7; /* Interval between boost tests, seconds. */
 static int test_boost_duration = 4; /* Duration of each boost test, seconds. */
@@ -91,6 +93,10 @@ module_param(fqs_holdoff, int, 0444);
 MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
 module_param(fqs_stutter, int, 0444);
 MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
+module_param(onoff_interval, int, 0444);
+MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable");
+module_param(shutdown_secs, int, 0444);
+MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), zero to disable.");
 module_param(test_boost, int, 0444);
 MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
 module_param(test_boost_interval, int, 0444);
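
Because onoff_interval and shutdown_secs are declared with permission 0444, they can only be set when the test is started, either as module parameters or, for a built-in rcutorture, on the kernel command line. For example, something like rcutorture.onoff_interval=3 rcutorture.shutdown_secs=180 would ask the test to offline/online a random hotpluggable CPU every 3 seconds and to power the machine off, after printing the usual success/failure summary, 3 minutes in.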
@@ -119,6 +125,10 @@ static struct task_struct *shuffler_task;
 static struct task_struct *stutter_task;
 static struct task_struct *fqs_task;
 static struct task_struct *boost_tasks[NR_CPUS];
+static struct task_struct *shutdown_task;
+#ifdef CONFIG_HOTPLUG_CPU
+static struct task_struct *onoff_task;
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
 
 #define RCU_TORTURE_PIPE_LEN 10
 
@@ -149,6 +159,10 @@ static long n_rcu_torture_boost_rterror;
 static long n_rcu_torture_boost_failure;
 static long n_rcu_torture_boosts;
 static long n_rcu_torture_timers;
+static long n_offline_attempts;
+static long n_offline_successes;
+static long n_online_attempts;
+static long n_online_successes;
 static struct list_head rcu_torture_removed;
 static cpumask_var_t shuffle_tmp_mask;
 
@@ -160,6 +174,8 @@ static int stutter_pause_test;
 #define RCUTORTURE_RUNNABLE_INIT 0
 #endif
 int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
+module_param(rcutorture_runnable, int, 0444);
+MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot");
 
 #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU)
 #define rcu_can_boost() 1
@@ -167,6 +183,7 @@ int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
 #define rcu_can_boost() 0
 #endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
 
+static unsigned long shutdown_time;    /* jiffies to system shutdown. */
 static unsigned long boost_starttime;  /* jiffies of next boost test start. */
 DEFINE_MUTEX(boost_mutex);             /* protect setting boost_starttime */
                                        /*  and boost task create/destroy. */
@@ -182,6 +199,9 @@ static int fullstop = FULLSTOP_RMMOD;
  */
 static DEFINE_MUTEX(fullstop_mutex);
 
+/* Forward reference. */
+static void rcu_torture_cleanup(void);
+
 /*
  * Detect and respond to a system shutdown.
  */
@@ -612,6 +632,30 @@ static struct rcu_torture_ops srcu_ops = {
        .name           = "srcu"
 };
 
+static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl)
+{
+       return srcu_read_lock_raw(&srcu_ctl);
+}
+
+static void srcu_torture_read_unlock_raw(int idx) __releases(&srcu_ctl)
+{
+       srcu_read_unlock_raw(&srcu_ctl, idx);
+}
+
+static struct rcu_torture_ops srcu_raw_ops = {
+       .init           = srcu_torture_init,
+       .cleanup        = srcu_torture_cleanup,
+       .readlock       = srcu_torture_read_lock_raw,
+       .read_delay     = srcu_read_delay,
+       .readunlock     = srcu_torture_read_unlock_raw,
+       .completed      = srcu_torture_completed,
+       .deferred_free  = rcu_sync_torture_deferred_free,
+       .sync           = srcu_torture_synchronize,
+       .cb_barrier     = NULL,
+       .stats          = srcu_torture_stats,
+       .name           = "srcu_raw"
+};
+
 static void srcu_torture_synchronize_expedited(void)
 {
        synchronize_srcu_expedited(&srcu_ctl);
@@ -913,6 +957,18 @@ rcu_torture_fakewriter(void *arg)
        return 0;
 }
 
+void rcutorture_trace_dump(void)
+{
+       static atomic_t beenhere = ATOMIC_INIT(0);
+
+       if (atomic_read(&beenhere))
+               return;
+       if (atomic_xchg(&beenhere, 1) != 0)
+               return;
+       do_trace_rcu_torture_read(cur_ops->name, (struct rcu_head *)~0UL);
+       ftrace_dump(DUMP_ALL);
+}
+
 /*
  * RCU torture reader from timer handler.  Dereferences rcu_torture_current,
  * incrementing the corresponding element of the pipeline array.  The
@@ -934,6 +990,7 @@ static void rcu_torture_timer(unsigned long unused)
                                  rcu_read_lock_bh_held() ||
                                  rcu_read_lock_sched_held() ||
                                  srcu_read_lock_held(&srcu_ctl));
+       do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
        if (p == NULL) {
                /* Leave because rcu_torture_writer is not yet underway */
                cur_ops->readunlock(idx);
@@ -951,6 +1008,8 @@ static void rcu_torture_timer(unsigned long unused)
                /* Should not happen, but... */
                pipe_count = RCU_TORTURE_PIPE_LEN;
        }
+       if (pipe_count > 1)
+               rcutorture_trace_dump();
        __this_cpu_inc(rcu_torture_count[pipe_count]);
        completed = cur_ops->completed() - completed;
        if (completed > RCU_TORTURE_PIPE_LEN) {
@@ -994,6 +1053,7 @@ rcu_torture_reader(void *arg)
                                          rcu_read_lock_bh_held() ||
                                          rcu_read_lock_sched_held() ||
                                          srcu_read_lock_held(&srcu_ctl));
+               do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
                if (p == NULL) {
                        /* Wait for rcu_torture_writer to get underway */
                        cur_ops->readunlock(idx);
@@ -1009,6 +1069,8 @@ rcu_torture_reader(void *arg)
                        /* Should not happen, but... */
                        pipe_count = RCU_TORTURE_PIPE_LEN;
                }
+               if (pipe_count > 1)
+                       rcutorture_trace_dump();
                __this_cpu_inc(rcu_torture_count[pipe_count]);
                completed = cur_ops->completed() - completed;
                if (completed > RCU_TORTURE_PIPE_LEN) {
@@ -1056,7 +1118,8 @@ rcu_torture_printk(char *page)
        cnt += sprintf(&page[cnt],
                       "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d "
                       "rtmbe: %d rtbke: %ld rtbre: %ld "
-                      "rtbf: %ld rtb: %ld nt: %ld",
+                      "rtbf: %ld rtb: %ld nt: %ld "
+                      "onoff: %ld/%ld:%ld/%ld",
                       rcu_torture_current,
                       rcu_torture_current_version,
                       list_empty(&rcu_torture_freelist),
@@ -1068,7 +1131,11 @@ rcu_torture_printk(char *page)
                       n_rcu_torture_boost_rterror,
                       n_rcu_torture_boost_failure,
                       n_rcu_torture_boosts,
-                      n_rcu_torture_timers);
+                      n_rcu_torture_timers,
+                      n_online_successes,
+                      n_online_attempts,
+                      n_offline_successes,
+                      n_offline_attempts);
        if (atomic_read(&n_rcu_torture_mberror) != 0 ||
            n_rcu_torture_boost_ktrerror != 0 ||
            n_rcu_torture_boost_rterror != 0 ||
@@ -1232,12 +1299,14 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag)
                "shuffle_interval=%d stutter=%d irqreader=%d "
                "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
                "test_boost=%d/%d test_boost_interval=%d "
-               "test_boost_duration=%d\n",
+               "test_boost_duration=%d shutdown_secs=%d "
+               "onoff_interval=%d\n",
                torture_type, tag, nrealreaders, nfakewriters,
                stat_interval, verbose, test_no_idle_hz, shuffle_interval,
                stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
                test_boost, cur_ops->can_boost,
-               test_boost_interval, test_boost_duration);
+               test_boost_interval, test_boost_duration, shutdown_secs,
+               onoff_interval);
 }
 
 static struct notifier_block rcutorture_shutdown_nb = {
@@ -1287,6 +1356,131 @@ static int rcutorture_booster_init(int cpu)
        return 0;
 }
 
+/*
+ * Cause the rcutorture test to shut down the system after the test has
+ * run for the time specified by the shutdown_secs module parameter.
+ */
+static int
+rcu_torture_shutdown(void *arg)
+{
+       long delta;
+       unsigned long jiffies_snap;
+
+       VERBOSE_PRINTK_STRING("rcu_torture_shutdown task started");
+       jiffies_snap = ACCESS_ONCE(jiffies);
+       while (ULONG_CMP_LT(jiffies_snap, shutdown_time) &&
+              !kthread_should_stop()) {
+               delta = shutdown_time - jiffies_snap;
+               if (verbose)
+                       printk(KERN_ALERT "%s" TORTURE_FLAG
+                              "rcu_torture_shutdown task: %lu "
+                              "jiffies remaining\n",
+                              torture_type, delta);
+               schedule_timeout_interruptible(delta);
+               jiffies_snap = ACCESS_ONCE(jiffies);
+       }
+       if (kthread_should_stop()) {
+               VERBOSE_PRINTK_STRING("rcu_torture_shutdown task stopping");
+               return 0;
+       }
+
+       /* OK, shut down the system. */
+
+       VERBOSE_PRINTK_STRING("rcu_torture_shutdown task shutting down system");
+       shutdown_task = NULL;   /* Avoid self-kill deadlock. */
+       rcu_torture_cleanup();  /* Get the success/failure message. */
+       kernel_power_off();     /* Shut down the system. */
+       return 0;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Execute random CPU-hotplug operations at the interval specified
+ * by the onoff_interval.
+ */
+static int
+rcu_torture_onoff(void *arg)
+{
+       int cpu;
+       int maxcpu = -1;
+       DEFINE_RCU_RANDOM(rand);
+
+       VERBOSE_PRINTK_STRING("rcu_torture_onoff task started");
+       for_each_online_cpu(cpu)
+               maxcpu = cpu;
+       WARN_ON(maxcpu < 0);
+       while (!kthread_should_stop()) {
+               cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1);
+               if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) {
+                       if (verbose)
+                               printk(KERN_ALERT "%s" TORTURE_FLAG
+                                      "rcu_torture_onoff task: offlining %d\n",
+                                      torture_type, cpu);
+                       n_offline_attempts++;
+                       if (cpu_down(cpu) == 0) {
+                               if (verbose)
+                                       printk(KERN_ALERT "%s" TORTURE_FLAG
+                                              "rcu_torture_onoff task: "
+                                              "offlined %d\n",
+                                              torture_type, cpu);
+                               n_offline_successes++;
+                       }
+               } else if (cpu_is_hotpluggable(cpu)) {
+                       if (verbose)
+                               printk(KERN_ALERT "%s" TORTURE_FLAG
+                                      "rcu_torture_onoff task: onlining %d\n",
+                                      torture_type, cpu);
+                       n_online_attempts++;
+                       if (cpu_up(cpu) == 0) {
+                               if (verbose)
+                                       printk(KERN_ALERT "%s" TORTURE_FLAG
+                                              "rcu_torture_onoff task: "
+                                              "onlined %d\n",
+                                              torture_type, cpu);
+                               n_online_successes++;
+                       }
+               }
+               schedule_timeout_interruptible(onoff_interval * HZ);
+       }
+       VERBOSE_PRINTK_STRING("rcu_torture_onoff task stopping");
+       return 0;
+}
+
+static int
+rcu_torture_onoff_init(void)
+{
+       if (onoff_interval <= 0)
+               return 0;
+       onoff_task = kthread_run(rcu_torture_onoff, NULL, "rcu_torture_onoff");
+       if (IS_ERR(onoff_task)) {
+               onoff_task = NULL;
+               return PTR_ERR(onoff_task);
+       }
+       return 0;
+}
+
+static void rcu_torture_onoff_cleanup(void)
+{
+       if (onoff_task == NULL)
+               return;
+       VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task");
+       kthread_stop(onoff_task);
+}
+
+#else /* #ifdef CONFIG_HOTPLUG_CPU */
+
+static void
+rcu_torture_onoff_init(void)
+{
+}
+
+static void rcu_torture_onoff_cleanup(void)
+{
+}
+
+#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
+
 static int rcutorture_cpu_notify(struct notifier_block *self,
                                 unsigned long action, void *hcpu)
 {
@@ -1391,6 +1585,11 @@ rcu_torture_cleanup(void)
                for_each_possible_cpu(i)
                        rcutorture_booster_cleanup(i);
        }
+       if (shutdown_task != NULL) {
+               VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task");
+               kthread_stop(shutdown_task);
+       }
+       rcu_torture_onoff_cleanup();
 
        /* Wait for all RCU callbacks to fire.  */
 
@@ -1416,7 +1615,7 @@ rcu_torture_init(void)
        static struct rcu_torture_ops *torture_ops[] =
                { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,
                  &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops,
-                 &srcu_ops, &srcu_expedited_ops,
+                 &srcu_ops, &srcu_raw_ops, &srcu_expedited_ops,
                  &sched_ops, &sched_sync_ops, &sched_expedited_ops, };
 
        mutex_lock(&fullstop_mutex);
@@ -1607,6 +1806,18 @@ rcu_torture_init(void)
                        }
                }
        }
+       if (shutdown_secs > 0) {
+               shutdown_time = jiffies + shutdown_secs * HZ;
+               shutdown_task = kthread_run(rcu_torture_shutdown, NULL,
+                                           "rcu_torture_shutdown");
+               if (IS_ERR(shutdown_task)) {
+                       firsterr = PTR_ERR(shutdown_task);
+                       VERBOSE_PRINTK_ERRSTRING("Failed to create shutdown");
+                       shutdown_task = NULL;
+                       goto unwind;
+               }
+       }
+       rcu_torture_onoff_init();
        register_reboot_notifier(&rcutorture_shutdown_nb);
        rcutorture_record_test_transition();
        mutex_unlock(&fullstop_mutex);
index 6b76d81..6c4a672 100644 (file)
@@ -69,7 +69,7 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
                NUM_RCU_LVL_3, \
                NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \
        }, \
-       .signaled = RCU_GP_IDLE, \
+       .fqs_state = RCU_GP_IDLE, \
        .gpnum = -300, \
        .completed = -300, \
        .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \
@@ -195,12 +195,10 @@ void rcu_note_context_switch(int cpu)
 }
 EXPORT_SYMBOL_GPL(rcu_note_context_switch);
 
-#ifdef CONFIG_NO_HZ
 DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
-       .dynticks_nesting = 1,
+       .dynticks_nesting = DYNTICK_TASK_NESTING,
        .dynticks = ATOMIC_INIT(1),
 };
-#endif /* #ifdef CONFIG_NO_HZ */
 
 static int blimit = 10;                /* Maximum callbacks per rcu_do_batch. */
 static int qhimark = 10000;    /* If this many pending, ignore blimit. */
@@ -328,11 +326,11 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
                return 1;
        }
 
-       /* If preemptible RCU, no point in sending reschedule IPI. */
-       if (rdp->preemptible)
-               return 0;
-
-       /* The CPU is online, so send it a reschedule IPI. */
+       /*
+        * The CPU is online, so send it a reschedule IPI.  This forces
+        * it through the scheduler, and (inefficiently) also handles cases
+        * where idle loops fail to inform RCU about the CPU being idle.
+        */
        if (rdp->cpu != smp_processor_id())
                smp_send_reschedule(rdp->cpu);
        else
@@ -343,59 +341,181 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
 
 #endif /* #ifdef CONFIG_SMP */
 
-#ifdef CONFIG_NO_HZ
+/*
+ * rcu_idle_enter_common - inform RCU that current CPU is moving towards idle
+ *
+ * If the new value of the ->dynticks_nesting counter now is zero,
+ * we really have entered idle, and must do the appropriate accounting.
+ * The caller must have disabled interrupts.
+ */
+static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval)
+{
+       trace_rcu_dyntick("Start", oldval, 0);
+       if (!is_idle_task(current)) {
+               struct task_struct *idle = idle_task(smp_processor_id());
+
+               trace_rcu_dyntick("Error on entry: not idle task", oldval, 0);
+               ftrace_dump(DUMP_ALL);
+               WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
+                         current->pid, current->comm,
+                         idle->pid, idle->comm); /* must be idle task! */
+       }
+       rcu_prepare_for_idle(smp_processor_id());
+       /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
+       smp_mb__before_atomic_inc();  /* See above. */
+       atomic_inc(&rdtp->dynticks);
+       smp_mb__after_atomic_inc();  /* Force ordering with next sojourn. */
+       WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
+}
 
 /**
- * rcu_enter_nohz - inform RCU that current CPU is entering nohz
+ * rcu_idle_enter - inform RCU that current CPU is entering idle
  *
- * Enter nohz mode, in other words, -leave- the mode in which RCU
+ * Enter idle mode, in other words, -leave- the mode in which RCU
  * read-side critical sections can occur.  (Though RCU read-side
- * critical sections can occur in irq handlers in nohz mode, a possibility
- * handled by rcu_irq_enter() and rcu_irq_exit()).
+ * critical sections can occur in irq handlers in idle, a possibility
+ * handled by irq_enter() and irq_exit().)
+ *
+ * We crowbar the ->dynticks_nesting field to zero to allow for
+ * the possibility of usermode upcalls having messed up our count
+ * of interrupt nesting level during the prior busy period.
  */
-void rcu_enter_nohz(void)
+void rcu_idle_enter(void)
 {
        unsigned long flags;
+       long long oldval;
        struct rcu_dynticks *rdtp;
 
        local_irq_save(flags);
        rdtp = &__get_cpu_var(rcu_dynticks);
-       if (--rdtp->dynticks_nesting) {
-               local_irq_restore(flags);
-               return;
-       }
-       trace_rcu_dyntick("Start");
-       /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
-       smp_mb__before_atomic_inc();  /* See above. */
-       atomic_inc(&rdtp->dynticks);
-       smp_mb__after_atomic_inc();  /* Force ordering with next sojourn. */
-       WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
+       oldval = rdtp->dynticks_nesting;
+       rdtp->dynticks_nesting = 0;
+       rcu_idle_enter_common(rdtp, oldval);
        local_irq_restore(flags);
 }
 
-/*
- * rcu_exit_nohz - inform RCU that current CPU is leaving nohz
+/**
+ * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle
+ *
+ * Exit from an interrupt handler, which might possibly result in entering
+ * idle mode, in other words, leaving the mode in which read-side critical
+ * sections can occur.
  *
- * Exit nohz mode, in other words, -enter- the mode in which RCU
- * read-side critical sections normally occur.
+ * This code assumes that the idle loop never does anything that might
+ * result in unbalanced calls to irq_enter() and irq_exit().  If your
+ * architecture violates this assumption, RCU will give you what you
+ * deserve, good and hard.  But very infrequently and irreproducibly.
+ *
+ * Use things like work queues to work around this limitation.
+ *
+ * You have been warned.
  */
-void rcu_exit_nohz(void)
+void rcu_irq_exit(void)
 {
        unsigned long flags;
+       long long oldval;
        struct rcu_dynticks *rdtp;
 
        local_irq_save(flags);
        rdtp = &__get_cpu_var(rcu_dynticks);
-       if (rdtp->dynticks_nesting++) {
-               local_irq_restore(flags);
-               return;
-       }
+       oldval = rdtp->dynticks_nesting;
+       rdtp->dynticks_nesting--;
+       WARN_ON_ONCE(rdtp->dynticks_nesting < 0);
+       if (rdtp->dynticks_nesting)
+               trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting);
+       else
+               rcu_idle_enter_common(rdtp, oldval);
+       local_irq_restore(flags);
+}
+
+/*
+ * rcu_idle_exit_common - inform RCU that current CPU is moving away from idle
+ *
+ * If the new value of the ->dynticks_nesting counter was previously zero,
+ * we really have exited idle, and must do the appropriate accounting.
+ * The caller must have disabled interrupts.
+ */
+static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval)
+{
        smp_mb__before_atomic_inc();  /* Force ordering w/previous sojourn. */
        atomic_inc(&rdtp->dynticks);
        /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
        smp_mb__after_atomic_inc();  /* See above. */
        WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
-       trace_rcu_dyntick("End");
+       rcu_cleanup_after_idle(smp_processor_id());
+       trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting);
+       if (!is_idle_task(current)) {
+               struct task_struct *idle = idle_task(smp_processor_id());
+
+               trace_rcu_dyntick("Error on exit: not idle task",
+                                 oldval, rdtp->dynticks_nesting);
+               ftrace_dump(DUMP_ALL);
+               WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
+                         current->pid, current->comm,
+                         idle->pid, idle->comm); /* must be idle task! */
+       }
+}
+
+/**
+ * rcu_idle_exit - inform RCU that current CPU is leaving idle
+ *
+ * Exit idle mode, in other words, -enter- the mode in which RCU
+ * read-side critical sections can occur.
+ *
+ * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NESTING to
+ * allow for the possibility of usermode upcalls messing up our count
+ * of interrupt nesting level during the busy period that is just
+ * now starting.
+ */
+void rcu_idle_exit(void)
+{
+       unsigned long flags;
+       struct rcu_dynticks *rdtp;
+       long long oldval;
+
+       local_irq_save(flags);
+       rdtp = &__get_cpu_var(rcu_dynticks);
+       oldval = rdtp->dynticks_nesting;
+       WARN_ON_ONCE(oldval != 0);
+       rdtp->dynticks_nesting = DYNTICK_TASK_NESTING;
+       rcu_idle_exit_common(rdtp, oldval);
+       local_irq_restore(flags);
+}
+
+/**
+ * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
+ *
+ * Enter an interrupt handler, which might possibly result in exiting
+ * idle mode, in other words, entering the mode in which read-side critical
+ * sections can occur.
+ *
+ * Note that the Linux kernel is fully capable of entering an interrupt
+ * handler that it never exits, for example when doing upcalls to
+ * user mode!  This code assumes that the idle loop never does upcalls to
+ * user mode.  If your architecture does do upcalls from the idle loop (or
+ * does anything else that results in unbalanced calls to the irq_enter()
+ * and irq_exit() functions), RCU will give you what you deserve, good
+ * and hard.  But very infrequently and irreproducibly.
+ *
+ * Use things like work queues to work around this limitation.
+ *
+ * You have been warned.
+ */
+void rcu_irq_enter(void)
+{
+       unsigned long flags;
+       struct rcu_dynticks *rdtp;
+       long long oldval;
+
+       local_irq_save(flags);
+       rdtp = &__get_cpu_var(rcu_dynticks);
+       oldval = rdtp->dynticks_nesting;
+       rdtp->dynticks_nesting++;
+       WARN_ON_ONCE(rdtp->dynticks_nesting == 0);
+       if (oldval)
+               trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting);
+       else
+               rcu_idle_exit_common(rdtp, oldval);
        local_irq_restore(flags);
 }
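
The ->dynticks counter manipulated above follows an even/odd protocol that the dyntick_save_progress_counter() change later in this file relies on: the counter is incremented, with full memory barriers, once on every idle entry and once on every idle exit, so an even value means the CPU is in an extended quiescent state and an odd value means it is active, and a remote CPU can be credited with a quiescent state either because its snapshot was already even or because the counter has moved since the snapshot was taken. A much-simplified single-threaded model (plain C, names ours; the kernel uses atomic_t plus barriers where this uses a plain int):

#include <stdio.h>

static int dynticks = 1;                              /* odd: CPU starts out active */

static void model_idle_enter(void) { dynticks++; }    /* becomes even */
static void model_idle_exit(void)  { dynticks++; }    /* becomes odd  */

static int in_eqs(int snap)
{
        return (snap & 0x1) == 0;   /* cf. the new dyntick_save_progress_counter() */
}

static int counts_as_quiescent(int snap, int curr)
{
        /* Either the CPU was already idle, or it has been idle since. */
        return in_eqs(snap) || curr != snap;
}

int main(void)
{
        int snap = dynticks;                                 /* grace period begins */

        printf("%d\n", counts_as_quiescent(snap, dynticks)); /* 0: still busy       */
        model_idle_enter();                                  /* CPU goes idle       */
        printf("%d\n", counts_as_quiescent(snap, dynticks)); /* 1: counter moved    */
        model_idle_exit();                                   /* busy again          */
        printf("%d\n", counts_as_quiescent(snap, dynticks)); /* 1: idle was seen    */
        return 0;
}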
 
@@ -442,27 +562,37 @@ void rcu_nmi_exit(void)
        WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
 }
 
+#ifdef CONFIG_PROVE_RCU
+
 /**
- * rcu_irq_enter - inform RCU of entry to hard irq context
+ * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle
  *
- * If the CPU was idle with dynamic ticks active, this updates the
- * rdtp->dynticks to let the RCU handling know that the CPU is active.
+ * If the current CPU is in its idle loop and is neither in an interrupt
+ * nor an NMI handler, return true.
  */
-void rcu_irq_enter(void)
+int rcu_is_cpu_idle(void)
 {
-       rcu_exit_nohz();
+       int ret;
+
+       preempt_disable();
+       ret = (atomic_read(&__get_cpu_var(rcu_dynticks).dynticks) & 0x1) == 0;
+       preempt_enable();
+       return ret;
 }
+EXPORT_SYMBOL(rcu_is_cpu_idle);
+
+#endif /* #ifdef CONFIG_PROVE_RCU */
 
 /**
- * rcu_irq_exit - inform RCU of exit from hard irq context
+ * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle
  *
- * If the CPU was idle with dynamic ticks active, update the rdp->dynticks
- * to put let the RCU handling be aware that the CPU is going back to idle
- * with no ticks.
+ * If the current CPU is idle or running at a first-level (not nested)
+ * interrupt from idle, return true.  The caller must have at least
+ * disabled preemption.
  */
-void rcu_irq_exit(void)
+int rcu_is_cpu_rrupt_from_idle(void)
 {
-       rcu_enter_nohz();
+       return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1;
 }
 
 #ifdef CONFIG_SMP
@@ -475,7 +605,7 @@ void rcu_irq_exit(void)
 static int dyntick_save_progress_counter(struct rcu_data *rdp)
 {
        rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
-       return 0;
+       return (rdp->dynticks_snap & 0x1) == 0;
 }
 
 /*
@@ -512,26 +642,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
 
 #endif /* #ifdef CONFIG_SMP */
 
-#else /* #ifdef CONFIG_NO_HZ */
-
-#ifdef CONFIG_SMP
-
-static int dyntick_save_progress_counter(struct rcu_data *rdp)
-{
-       return 0;
-}
-
-static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
-{
-       return rcu_implicit_offline_qs(rdp);
-}
-
-#endif /* #ifdef CONFIG_SMP */
-
-#endif /* #else #ifdef CONFIG_NO_HZ */
-
-int rcu_cpu_stall_suppress __read_mostly;
-
 static void record_gp_stall_check_time(struct rcu_state *rsp)
 {
        rsp->gp_start = jiffies;
@@ -866,8 +976,8 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
        /* Advance to a new grace period and initialize state. */
        rsp->gpnum++;
        trace_rcu_grace_period(rsp->name, rsp->gpnum, "start");
-       WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT);
-       rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */
+       WARN_ON_ONCE(rsp->fqs_state == RCU_GP_INIT);
+       rsp->fqs_state = RCU_GP_INIT; /* Hold off force_quiescent_state. */
        rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
        record_gp_stall_check_time(rsp);
 
@@ -877,7 +987,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
                rnp->qsmask = rnp->qsmaskinit;
                rnp->gpnum = rsp->gpnum;
                rnp->completed = rsp->completed;
-               rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
+               rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state OK */
                rcu_start_gp_per_cpu(rsp, rnp, rdp);
                rcu_preempt_boost_start_gp(rnp);
                trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
@@ -927,7 +1037,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
 
        rnp = rcu_get_root(rsp);
        raw_spin_lock(&rnp->lock);              /* irqs already disabled. */
-       rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
+       rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
        raw_spin_unlock(&rnp->lock);            /* irqs remain disabled. */
        raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
 }
@@ -991,7 +1101,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
 
        rsp->completed = rsp->gpnum;  /* Declare the grace period complete. */
        trace_rcu_grace_period(rsp->name, rsp->completed, "end");
-       rsp->signaled = RCU_GP_IDLE;
+       rsp->fqs_state = RCU_GP_IDLE;
        rcu_start_gp(rsp, flags);  /* releases root node's rnp->lock. */
 }
 
@@ -1221,7 +1331,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
        else
                raw_spin_unlock_irqrestore(&rnp->lock, flags);
        if (need_report & RCU_OFL_TASKS_EXP_GP)
-               rcu_report_exp_rnp(rsp, rnp);
+               rcu_report_exp_rnp(rsp, rnp, true);
        rcu_node_kthread_setaffinity(rnp, -1);
 }
 
@@ -1263,7 +1373,9 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
        /* If no callbacks are ready, just return.*/
        if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
                trace_rcu_batch_start(rsp->name, 0, 0);
-               trace_rcu_batch_end(rsp->name, 0);
+               trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist),
+                                   need_resched(), is_idle_task(current),
+                                   rcu_is_callbacks_kthread());
                return;
        }
 
@@ -1291,12 +1403,17 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
                debug_rcu_head_unqueue(list);
                __rcu_reclaim(rsp->name, list);
                list = next;
-               if (++count >= bl)
+               /* Stop only if limit reached and CPU has something to do. */
+               if (++count >= bl &&
+                   (need_resched() ||
+                    (!is_idle_task(current) && !rcu_is_callbacks_kthread())))
                        break;
        }
 
        local_irq_save(flags);
-       trace_rcu_batch_end(rsp->name, count);
+       trace_rcu_batch_end(rsp->name, count, !!list, need_resched(),
+                           is_idle_task(current),
+                           rcu_is_callbacks_kthread());
 
        /* Update count, and requeue any remaining callbacks. */
        rdp->qlen -= count;
@@ -1334,16 +1451,14 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
  * (user mode or idle loop for rcu, non-softirq execution for rcu_bh).
  * Also schedule RCU core processing.
  *
- * This function must be called with hardirqs disabled.  It is normally
+ * This function must be called from hardirq context.  It is normally
  * invoked from the scheduling-clock interrupt.  If rcu_pending returns
  * false, there is no point in invoking rcu_check_callbacks().
  */
 void rcu_check_callbacks(int cpu, int user)
 {
        trace_rcu_utilization("Start scheduler-tick");
-       if (user ||
-           (idle_cpu(cpu) && rcu_scheduler_active &&
-            !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
+       if (user || rcu_is_cpu_rrupt_from_idle()) {
 
                /*
                 * Get here if this CPU took its interrupt from user
@@ -1457,7 +1572,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
                goto unlock_fqs_ret;  /* no GP in progress, time updated. */
        }
        rsp->fqs_active = 1;
-       switch (rsp->signaled) {
+       switch (rsp->fqs_state) {
        case RCU_GP_IDLE:
        case RCU_GP_INIT:
 
@@ -1473,7 +1588,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
                force_qs_rnp(rsp, dyntick_save_progress_counter);
                raw_spin_lock(&rnp->lock);  /* irqs already disabled */
                if (rcu_gp_in_progress(rsp))
-                       rsp->signaled = RCU_FORCE_QS;
+                       rsp->fqs_state = RCU_FORCE_QS;
                break;
 
        case RCU_FORCE_QS:
@@ -1812,7 +1927,7 @@ static int rcu_pending(int cpu)
  * by the current CPU, even if none need be done immediately, returning
  * 1 if so.
  */
-static int rcu_needs_cpu_quick_check(int cpu)
+static int rcu_cpu_has_callbacks(int cpu)
 {
        /* RCU callbacks either ready or pending? */
        return per_cpu(rcu_sched_data, cpu).nxtlist ||
@@ -1913,9 +2028,9 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
        for (i = 0; i < RCU_NEXT_SIZE; i++)
                rdp->nxttail[i] = &rdp->nxtlist;
        rdp->qlen = 0;
-#ifdef CONFIG_NO_HZ
        rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
-#endif /* #ifdef CONFIG_NO_HZ */
+       WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_NESTING);
+       WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
        rdp->cpu = cpu;
        rdp->rsp = rsp;
        raw_spin_unlock_irqrestore(&rnp->lock, flags);
@@ -1942,6 +2057,10 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
        rdp->qlen_last_fqs_check = 0;
        rdp->n_force_qs_snap = rsp->n_force_qs;
        rdp->blimit = blimit;
+       rdp->dynticks->dynticks_nesting = DYNTICK_TASK_NESTING;
+       atomic_set(&rdp->dynticks->dynticks,
+                  (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
+       rcu_prepare_for_idle_init(cpu);
        raw_spin_unlock(&rnp->lock);            /* irqs remain disabled. */
 
        /*
@@ -2023,6 +2142,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
                rcu_send_cbs_to_online(&rcu_bh_state);
                rcu_send_cbs_to_online(&rcu_sched_state);
                rcu_preempt_send_cbs_to_online();
+               rcu_cleanup_after_idle(cpu);
                break;
        case CPU_DEAD:
        case CPU_DEAD_FROZEN:
index 849ce9e..fddff92 100644 (file)
  * Dynticks per-CPU state.
  */
 struct rcu_dynticks {
-       int dynticks_nesting;   /* Track irq/process nesting level. */
-       int dynticks_nmi_nesting; /* Track NMI nesting level. */
-       atomic_t dynticks;      /* Even value for dynticks-idle, else odd. */
+       long long dynticks_nesting; /* Track irq/process nesting level. */
+                                   /* Process level is worth LLONG_MAX/2. */
+       int dynticks_nmi_nesting;   /* Track NMI nesting level. */
+       atomic_t dynticks;          /* Even value for idle, else odd. */
 };
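
The "process level is worth LLONG_MAX/2" remark is easier to see with a concrete model. The sketch below is a standalone userspace illustration, not the kernel's implementation; its TOY_TASK_NESTING bias and helper names are invented for the example. It shows why a plain int can no longer hold the nesting value once task-level execution is encoded as a huge bias on top of per-interrupt increments, and how the even/odd dynticks counter marks idle.

#include <assert.h>
#include <limits.h>
#include <stdatomic.h>

#define TOY_TASK_NESTING (LLONG_MAX / 2)   /* illustrative bias only */

struct toy_dynticks {
        long long nesting;   /* large task-level bias plus one per nested irq */
        atomic_int dynticks; /* even while dyntick-idle, odd otherwise        */
};

static int toy_is_idle(struct toy_dynticks *d)
{
        return (atomic_load(&d->dynticks) & 1) == 0;
}

static void toy_idle_enter(struct toy_dynticks *d)
{
        d->nesting -= TOY_TASK_NESTING;         /* drop the task-level bias */
        atomic_fetch_add(&d->dynticks, 1);      /* odd -> even: now idle    */
}

static void toy_idle_exit(struct toy_dynticks *d)
{
        d->nesting += TOY_TASK_NESTING;         /* restore the bias         */
        atomic_fetch_add(&d->dynticks, 1);      /* even -> odd: not idle    */
}

static void toy_irq_enter(struct toy_dynticks *d) { d->nesting++; }
static void toy_irq_exit(struct toy_dynticks *d)  { d->nesting--; }

int main(void)
{
        struct toy_dynticks d;

        d.nesting = TOY_TASK_NESTING;           /* running a task */
        atomic_init(&d.dynticks, 1);            /* odd: not idle  */

        toy_irq_enter(&d);                      /* bias + 1 exceeds INT_MAX */
        toy_irq_exit(&d);
        assert(!toy_is_idle(&d));

        toy_idle_enter(&d);
        assert(toy_is_idle(&d) && d.nesting == 0);
        toy_idle_exit(&d);
        assert(!toy_is_idle(&d) && d.nesting == TOY_TASK_NESTING);
        return 0;
}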
 
 /* RCU's kthread states for tracing. */
@@ -274,16 +275,12 @@ struct rcu_data {
                                        /* did other CPU force QS recently? */
        long            blimit;         /* Upper limit on a processed batch */
 
-#ifdef CONFIG_NO_HZ
        /* 3) dynticks interface. */
        struct rcu_dynticks *dynticks;  /* Shared per-CPU dynticks state. */
        int dynticks_snap;              /* Per-GP tracking for dynticks. */
-#endif /* #ifdef CONFIG_NO_HZ */
 
        /* 4) reasons this CPU needed to be kicked by force_quiescent_state */
-#ifdef CONFIG_NO_HZ
        unsigned long dynticks_fqs;     /* Kicked due to dynticks idle. */
-#endif /* #ifdef CONFIG_NO_HZ */
        unsigned long offline_fqs;      /* Kicked due to being offline. */
        unsigned long resched_ipi;      /* Sent a resched IPI. */
 
@@ -302,16 +299,12 @@ struct rcu_data {
        struct rcu_state *rsp;
 };
 
-/* Values for signaled field in struct rcu_state. */
+/* Values for fqs_state field in struct rcu_state. */
 #define RCU_GP_IDLE            0       /* No grace period in progress. */
 #define RCU_GP_INIT            1       /* Grace period being initialized. */
 #define RCU_SAVE_DYNTICK       2       /* Need to scan dyntick state. */
 #define RCU_FORCE_QS           3       /* Need to force quiescent state. */
-#ifdef CONFIG_NO_HZ
 #define RCU_SIGNAL_INIT                RCU_SAVE_DYNTICK
-#else /* #ifdef CONFIG_NO_HZ */
-#define RCU_SIGNAL_INIT                RCU_FORCE_QS
-#endif /* #else #ifdef CONFIG_NO_HZ */
 
 #define RCU_JIFFIES_TILL_FORCE_QS       3      /* for rsp->jiffies_force_qs */
 
@@ -361,7 +354,7 @@ struct rcu_state {
 
        /* The following fields are guarded by the root rcu_node's lock. */
 
-       u8      signaled ____cacheline_internodealigned_in_smp;
+       u8      fqs_state ____cacheline_internodealigned_in_smp;
                                                /* Force QS state. */
        u8      fqs_active;                     /* force_quiescent_state() */
                                                /*  is running. */
@@ -451,7 +444,8 @@ static void rcu_preempt_check_callbacks(int cpu);
 static void rcu_preempt_process_callbacks(void);
 void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
 #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU)
-static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp);
+static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
+                              bool wake);
 #endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */
 static int rcu_preempt_pending(int cpu);
 static int rcu_preempt_needs_cpu(int cpu);
@@ -461,6 +455,7 @@ static void __init __rcu_init_preempt(void);
 static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
 static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
 static void invoke_rcu_callbacks_kthread(void);
+static bool rcu_is_callbacks_kthread(void);
 #ifdef CONFIG_RCU_BOOST
 static void rcu_preempt_do_callbacks(void);
 static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
@@ -473,5 +468,8 @@ static void rcu_yield(void (*f)(unsigned long), unsigned long arg);
 #endif /* #ifdef CONFIG_RCU_BOOST */
 static void rcu_cpu_kthread_setrt(int cpu, int to_rt);
 static void __cpuinit rcu_prepare_kthreads(int cpu);
+static void rcu_prepare_for_idle_init(int cpu);
+static void rcu_cleanup_after_idle(int cpu);
+static void rcu_prepare_for_idle(int cpu);
 
 #endif /* #ifndef RCU_TREE_NONCORE */
index 4b9b9f8..8bb35d7 100644 (file)
@@ -312,6 +312,7 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
 {
        int empty;
        int empty_exp;
+       int empty_exp_now;
        unsigned long flags;
        struct list_head *np;
 #ifdef CONFIG_RCU_BOOST
@@ -382,8 +383,10 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
                /*
                 * If this was the last task on the current list, and if
                 * we aren't waiting on any CPUs, report the quiescent state.
-                * Note that rcu_report_unblock_qs_rnp() releases rnp->lock.
+                * Note that rcu_report_unblock_qs_rnp() releases rnp->lock,
+                * so we must take a snapshot of the expedited state.
                 */
+               empty_exp_now = !rcu_preempted_readers_exp(rnp);
                if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) {
                        trace_rcu_quiescent_state_report("preempt_rcu",
                                                         rnp->gpnum,
@@ -406,8 +409,8 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
                 * If this was the last task on the expedited lists,
                 * then we need to report up the rcu_node hierarchy.
                 */
-               if (!empty_exp && !rcu_preempted_readers_exp(rnp))
-                       rcu_report_exp_rnp(&rcu_preempt_state, rnp);
+               if (!empty_exp && empty_exp_now)
+                       rcu_report_exp_rnp(&rcu_preempt_state, rnp, true);
        } else {
                local_irq_restore(flags);
        }
@@ -729,9 +732,13 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
  * recursively up the tree.  (Calm down, calm down, we do the recursion
  * iteratively!)
  *
+ * Most callers will set the "wake" flag, but the task initiating the
+ * expedited grace period need not wake itself.
+ *
  * Caller must hold sync_rcu_preempt_exp_mutex.
  */
-static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
+static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
+                              bool wake)
 {
        unsigned long flags;
        unsigned long mask;
@@ -744,7 +751,8 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
                }
                if (rnp->parent == NULL) {
                        raw_spin_unlock_irqrestore(&rnp->lock, flags);
-                       wake_up(&sync_rcu_preempt_exp_wq);
+                       if (wake)
+                               wake_up(&sync_rcu_preempt_exp_wq);
                        break;
                }
                mask = rnp->grpmask;
@@ -777,7 +785,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
                must_wait = 1;
        }
        if (!must_wait)
-               rcu_report_exp_rnp(rsp, rnp);
+               rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */
 }
 
 /*
@@ -1069,9 +1077,9 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
  * report on tasks preempted in RCU read-side critical sections during
  * expedited RCU grace periods.
  */
-static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
+static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
+                              bool wake)
 {
-       return;
 }
 
 #endif /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -1157,8 +1165,6 @@ static void rcu_initiate_boost_trace(struct rcu_node *rnp)
 
 #endif /* #else #ifdef CONFIG_RCU_TRACE */
 
-static struct lock_class_key rcu_boost_class;
-
 /*
  * Carry out RCU priority boosting on the task indicated by ->exp_tasks
  * or ->boost_tasks, advancing the pointer to the next task in the
@@ -1221,15 +1227,13 @@ static int rcu_boost(struct rcu_node *rnp)
         */
        t = container_of(tb, struct task_struct, rcu_node_entry);
        rt_mutex_init_proxy_locked(&mtx, t);
-       /* Avoid lockdep false positives.  This rt_mutex is its own thing. */
-       lockdep_set_class_and_name(&mtx.wait_lock, &rcu_boost_class,
-                                  "rcu_boost_mutex");
        t->rcu_boost_mutex = &mtx;
        raw_spin_unlock_irqrestore(&rnp->lock, flags);
        rt_mutex_lock(&mtx);  /* Side effect: boosts task t's priority. */
        rt_mutex_unlock(&mtx);  /* Keep lockdep happy. */
 
-       return rnp->exp_tasks != NULL || rnp->boost_tasks != NULL;
+       return ACCESS_ONCE(rnp->exp_tasks) != NULL ||
+              ACCESS_ONCE(rnp->boost_tasks) != NULL;
 }
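
The switch to ACCESS_ONCE() in the return statement above matters because ->exp_tasks and ->boost_tasks are sampled after rnp->lock has been dropped. ACCESS_ONCE() is the kernel's volatile-cast wrapper from include/linux/compiler.h; the short standalone sketch below restates that definition and shows the lockless re-check pattern. The toy_node type and toy_more_work() helper are invented for the example.

#include <stdio.h>

/*
 * ACCESS_ONCE(), as historically defined in include/linux/compiler.h: the
 * volatile cast forces the compiler to perform exactly one load (or store)
 * per use, so a value that other CPUs may change concurrently is neither
 * refetched, cached in a register, nor optimized away.
 */
#define ACCESS_ONCE(x) (*(volatile __typeof__(x) *)&(x))

struct toy_node {
        void *exp_tasks;        /* may be updated by other threads */
        void *boost_tasks;      /* may be updated by other threads */
};

/* Lockless re-check: each field is read exactly once. */
static int toy_more_work(struct toy_node *n)
{
        return ACCESS_ONCE(n->exp_tasks) != NULL ||
               ACCESS_ONCE(n->boost_tasks) != NULL;
}

int main(void)
{
        struct toy_node n = { NULL, NULL };

        printf("more work: %d\n", toy_more_work(&n));   /* prints 0 */
        return 0;
}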
 
 /*
@@ -1328,6 +1332,15 @@ static void invoke_rcu_callbacks_kthread(void)
        local_irq_restore(flags);
 }
 
+/*
+ * Is the current CPU running the RCU-callbacks kthread?
+ * Caller must have preemption disabled.
+ */
+static bool rcu_is_callbacks_kthread(void)
+{
+       return __get_cpu_var(rcu_cpu_kthread_task) == current;
+}
+
 /*
  * Set the affinity of the boost kthread.  The CPU-hotplug locks are
  * held, so no one should be messing with the existence of the boost
@@ -1772,6 +1785,11 @@ static void invoke_rcu_callbacks_kthread(void)
        WARN_ON_ONCE(1);
 }
 
+static bool rcu_is_callbacks_kthread(void)
+{
+       return false;
+}
+
 static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
 {
 }
@@ -1907,7 +1925,7 @@ void synchronize_sched_expedited(void)
                 * grace period works for us.
                 */
                get_online_cpus();
-               snap = atomic_read(&sync_sched_expedited_started) - 1;
+               snap = atomic_read(&sync_sched_expedited_started);
                smp_mb(); /* ensure read is before try_stop_cpus(). */
        }
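
The change to the snapshot above feeds a counter scheme in which synchronize_sched_expedited() can discover that somebody else's expedited grace period already did its work and return early. As a rough, heavily simplified illustration of that snapshot-and-piggyback idea (plain C11; the toy_started, toy_done, and toy_gp_elapsed names are invented and this is not the kernel's actual logic):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/*
 * Toy model of the snapshot/piggyback counter trick: one counter counts
 * expedited attempts started, another counts attempts completed.  A waiter
 * snapshots the "started" counter; if "done" later reaches that snapshot,
 * a full expedited pass completed after the waiter began, so the waiter's
 * own work has effectively been done for it.
 */
static atomic_long toy_started;
static atomic_long toy_done;

static bool toy_gp_elapsed(long snap)
{
        return atomic_load(&toy_done) - snap >= 0;
}

int main(void)
{
        long snap = atomic_fetch_add(&toy_started, 1) + 1;

        printf("before: %d\n", toy_gp_elapsed(snap));   /* 0: nothing done yet */
        atomic_fetch_add(&toy_done, 1);                 /* someone completed   */
        printf("after:  %d\n", toy_gp_elapsed(snap));   /* 1: piggyback        */
        return 0;
}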
 
@@ -1939,88 +1957,243 @@ EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
  * 1 if so.  This function is part of the RCU implementation; it is -not-
  * an exported member of the RCU API.
  *
- * Because we have preemptible RCU, just check whether this CPU needs
- * any flavor of RCU.  Do not chew up lots of CPU cycles with preemption
- * disabled in a most-likely vain attempt to cause RCU not to need this CPU.
+ * Because we do not have RCU_FAST_NO_HZ, just check whether this CPU needs
+ * any flavor of RCU.
  */
 int rcu_needs_cpu(int cpu)
 {
-       return rcu_needs_cpu_quick_check(cpu);
+       return rcu_cpu_has_callbacks(cpu);
+}
+
+/*
+ * Because we do not have RCU_FAST_NO_HZ, don't bother initializing for it.
+ */
+static void rcu_prepare_for_idle_init(int cpu)
+{
+}
+
+/*
+ * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
+ * after it.
+ */
+static void rcu_cleanup_after_idle(int cpu)
+{
+}
+
+/*
+ * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n,
+ * is nothing.
+ */
+static void rcu_prepare_for_idle(int cpu)
+{
 }
 
 #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
 
-#define RCU_NEEDS_CPU_FLUSHES 5
+/*
+ * This code is invoked when a CPU goes idle, at which point we want
+ * to have the CPU do everything required for RCU so that it can enter
+ * the energy-efficient dyntick-idle mode.  This is handled by a
+ * state machine implemented by rcu_prepare_for_idle() below.
+ *
+ * The following three preprocessor symbols control this state machine:
+ *
+ * RCU_IDLE_FLUSHES gives the maximum number of times that we will attempt
+ *     to satisfy RCU.  Beyond this point, it is better to incur a periodic
+ *     scheduling-clock interrupt than to loop through the state machine
+ *     at full power.
+ * RCU_IDLE_OPT_FLUSHES gives the number of RCU_IDLE_FLUSHES that are
+ *     optional if RCU does not need anything immediately from this
+ *     CPU, even if this CPU still has RCU callbacks queued.  The first
+ *     CPU, even if this CPU still has RCU callbacks queued.  The first few
+ *     the state machine a chance to communicate a quiescent state
+ *     to the RCU core.
+ * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted
+ *     to sleep in dyntick-idle mode with RCU callbacks pending.  This
+ *     is sized to be roughly one RCU grace period.  Those energy-efficiency
+ *     benchmarkers who might otherwise be tempted to set this to a large
+ *     number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your
+ *     system.  And if you are -that- concerned about energy efficiency,
+ *     just power the system down and be done with it!
+ *
+ * The values below work well in practice.  If future workloads require
+ * adjustment, they can be converted into kernel config parameters, though
+ * making the state machine smarter might be a better option.
+ */
+#define RCU_IDLE_FLUSHES 5             /* Number of dyntick-idle tries. */
+#define RCU_IDLE_OPT_FLUSHES 3         /* Optional dyntick-idle tries. */
+#define RCU_IDLE_GP_DELAY 6            /* Roughly one grace period. */
+
 static DEFINE_PER_CPU(int, rcu_dyntick_drain);
 static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
+static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer);
+static ktime_t rcu_idle_gp_wait;
 
 /*
- * Check to see if any future RCU-related work will need to be done
- * by the current CPU, even if none need be done immediately, returning
- * 1 if so.  This function is part of the RCU implementation; it is -not-
- * an exported member of the RCU API.
+ * Allow the CPU to enter dyntick-idle mode if either: (1) There are no
+ * callbacks on this CPU, (2) this CPU has not yet attempted to enter
+ * dyntick-idle mode, or (3) this CPU is in the process of attempting to
+ * enter dyntick-idle mode.  Otherwise, if we have recently tried and failed
+ * to enter dyntick-idle mode, we refuse to try to enter it.  After all,
+ * it is better to incur scheduling-clock interrupts than to spin
+ * continuously for the same time duration!
+ */
+int rcu_needs_cpu(int cpu)
+{
+       /* If no callbacks, RCU doesn't need the CPU. */
+       if (!rcu_cpu_has_callbacks(cpu))
+               return 0;
+       /* Otherwise, RCU needs the CPU only if it recently tried and failed. */
+       return per_cpu(rcu_dyntick_holdoff, cpu) == jiffies;
+}
+
+/*
+ * Timer handler used to force CPU to start pushing its remaining RCU
+ * callbacks in the case where it entered dyntick-idle mode with callbacks
+ * pending.  The handler doesn't really need to do anything because the
+ * real work is done upon re-entry to idle, or by the next scheduling-clock
+ * interrupt should idle not be re-entered.
+ */
+static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp)
+{
+       trace_rcu_prep_idle("Timer");
+       return HRTIMER_NORESTART;
+}
+
+/*
+ * Initialize the timer used to pull CPUs out of dyntick-idle mode.
+ */
+static void rcu_prepare_for_idle_init(int cpu)
+{
+       static int firsttime = 1;
+       struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu);
+
+       hrtimer_init(hrtp, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+       hrtp->function = rcu_idle_gp_timer_func;
+       if (firsttime) {
+               unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY);
+
+               rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000);
+               firsttime = 0;
+       }
+}
+
+/*
+ * Clean up for exit from idle.  Because we are exiting from idle, there
+ * is no longer any point to rcu_idle_gp_timer, so cancel it.  This will
+ * do nothing if this timer is not active, so just cancel it unconditionally.
+ */
+static void rcu_cleanup_after_idle(int cpu)
+{
+       hrtimer_cancel(&per_cpu(rcu_idle_gp_timer, cpu));
+}
+
+/*
+ * Check to see if any RCU-related work can be done by the current CPU,
+ * and if so, schedule a softirq to get it done.  This function is part
+ * of the RCU implementation; it is -not- an exported member of the RCU API.
  *
- * Because we are not supporting preemptible RCU, attempt to accelerate
- * any current grace periods so that RCU no longer needs this CPU, but
- * only if all other CPUs are already in dynticks-idle mode.  This will
- * allow the CPU cores to be powered down immediately, as opposed to after
- * waiting many milliseconds for grace periods to elapse.
+ * The idea is for the current CPU to clear out all work required by the
+ * RCU core for the current grace period, so that this CPU can be permitted
+ * to enter dyntick-idle mode.  In some cases, it will need to be awakened
+ * at the end of the grace period by whatever CPU ends the grace period.
+ * This allows CPUs to go dyntick-idle more quickly, and to reduce the
+ * number of wakeups by a modest integer factor.
  *
  * Because it is not legal to invoke rcu_process_callbacks() with irqs
  * disabled, we do one pass of force_quiescent_state(), then do a
  * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked
  * later.  The per-cpu rcu_dyntick_drain variable controls the sequencing.
+ *
+ * The caller must have disabled interrupts.
  */
-int rcu_needs_cpu(int cpu)
+static void rcu_prepare_for_idle(int cpu)
 {
-       int c = 0;
-       int snap;
-       int thatcpu;
-
-       /* Check for being in the holdoff period. */
-       if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies)
-               return rcu_needs_cpu_quick_check(cpu);
-
-       /* Don't bother unless we are the last non-dyntick-idle CPU. */
-       for_each_online_cpu(thatcpu) {
-               if (thatcpu == cpu)
-                       continue;
-               snap = atomic_add_return(0, &per_cpu(rcu_dynticks,
-                                                    thatcpu).dynticks);
-               smp_mb(); /* Order sampling of snap with end of grace period. */
-               if ((snap & 0x1) != 0) {
-                       per_cpu(rcu_dyntick_drain, cpu) = 0;
-                       per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
-                       return rcu_needs_cpu_quick_check(cpu);
-               }
+       unsigned long flags;
+
+       local_irq_save(flags);
+
+       /*
+        * If there are no callbacks on this CPU, enter dyntick-idle mode.
+        * Also reset state to avoid prejudicing later attempts.
+        */
+       if (!rcu_cpu_has_callbacks(cpu)) {
+               per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
+               per_cpu(rcu_dyntick_drain, cpu) = 0;
+               local_irq_restore(flags);
+               trace_rcu_prep_idle("No callbacks");
+               return;
+       }
+
+       /*
+        * If in holdoff mode, just return.  We will presumably have
+        * refrained from disabling the scheduling-clock tick.
+        */
+       if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) {
+               local_irq_restore(flags);
+               trace_rcu_prep_idle("In holdoff");
+               return;
        }
 
        /* Check and update the rcu_dyntick_drain sequencing. */
        if (per_cpu(rcu_dyntick_drain, cpu) <= 0) {
                /* First time through, initialize the counter. */
-               per_cpu(rcu_dyntick_drain, cpu) = RCU_NEEDS_CPU_FLUSHES;
+               per_cpu(rcu_dyntick_drain, cpu) = RCU_IDLE_FLUSHES;
+       } else if (per_cpu(rcu_dyntick_drain, cpu) <= RCU_IDLE_OPT_FLUSHES &&
+                  !rcu_pending(cpu)) {
+               /* Can we go dyntick-idle despite still having callbacks? */
+               trace_rcu_prep_idle("Dyntick with callbacks");
+               per_cpu(rcu_dyntick_drain, cpu) = 0;
+               per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
+               hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu),
+                             rcu_idle_gp_wait, HRTIMER_MODE_REL);
+               return; /* Nothing more to do immediately. */
        } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) {
                /* We have hit the limit, so time to give up. */
                per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
-               return rcu_needs_cpu_quick_check(cpu);
+               local_irq_restore(flags);
+               trace_rcu_prep_idle("Begin holdoff");
+               invoke_rcu_core();  /* Force the CPU out of dyntick-idle. */
+               return;
        }
 
-       /* Do one step pushing remaining RCU callbacks through. */
+       /*
+        * Do one step of pushing the remaining RCU callbacks through
+        * the RCU core state machine.
+        */
+#ifdef CONFIG_TREE_PREEMPT_RCU
+       if (per_cpu(rcu_preempt_data, cpu).nxtlist) {
+               local_irq_restore(flags);
+               rcu_preempt_qs(cpu);
+               force_quiescent_state(&rcu_preempt_state, 0);
+               local_irq_save(flags);
+       }
+#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
        if (per_cpu(rcu_sched_data, cpu).nxtlist) {
+               local_irq_restore(flags);
                rcu_sched_qs(cpu);
                force_quiescent_state(&rcu_sched_state, 0);
-               c = c || per_cpu(rcu_sched_data, cpu).nxtlist;
+               local_irq_save(flags);
        }
        if (per_cpu(rcu_bh_data, cpu).nxtlist) {
+               local_irq_restore(flags);
                rcu_bh_qs(cpu);
                force_quiescent_state(&rcu_bh_state, 0);
-               c = c || per_cpu(rcu_bh_data, cpu).nxtlist;
+               local_irq_save(flags);
        }
 
-       /* If RCU callbacks are still pending, RCU still needs this CPU. */
-       if (c)
+       /*
+        * If RCU callbacks are still pending, RCU still needs this CPU.
+        * So try forcing the callbacks through the grace period.
+        */
+       if (rcu_cpu_has_callbacks(cpu)) {
+               local_irq_restore(flags);
+               trace_rcu_prep_idle("More callbacks");
                invoke_rcu_core();
-       return c;
+       } else {
+               local_irq_restore(flags);
+               trace_rcu_prep_idle("Callbacks drained");
+       }
 }
 
 #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
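
Because rcu_prepare_for_idle() above interleaves interrupt masking, tracing, the hrtimer, and per-flavor callback pushes, the underlying holdoff/drain sequencing can be hard to see at a glance. The following is a minimal single-threaded sketch of just that sequencing, in userspace C with invented names; it is an illustration of the decision logic only, not the kernel function.

#include <stdbool.h>
#include <stdio.h>

#define TOY_IDLE_FLUSHES     5          /* mirrors RCU_IDLE_FLUSHES     */
#define TOY_IDLE_OPT_FLUSHES 3          /* mirrors RCU_IDLE_OPT_FLUSHES */

struct toy_cpu {
        int drain;              /* passes left in the current attempt */
        unsigned long holdoff;  /* jiffy at which we last gave up     */
};

enum toy_action {
        TOY_IDLE_NOW,           /* no callbacks: enter dyntick-idle       */
        TOY_IDLE_WITH_CBS,      /* sleep anyway; a timer backstops the GP */
        TOY_HOLDOFF,            /* give up until the next jiffy           */
        TOY_PUSH,               /* push callbacks through and try again   */
};

static enum toy_action toy_prepare_for_idle(struct toy_cpu *cpu,
                                            unsigned long now,
                                            bool has_cbs, bool rcu_urgent)
{
        if (!has_cbs) {                         /* nothing queued          */
                cpu->drain = 0;
                cpu->holdoff = now - 1;
                return TOY_IDLE_NOW;
        }
        if (cpu->holdoff == now)                /* recently gave up        */
                return TOY_HOLDOFF;
        if (cpu->drain <= 0) {
                cpu->drain = TOY_IDLE_FLUSHES;  /* first pass: arm counter */
        } else if (cpu->drain <= TOY_IDLE_OPT_FLUSHES && !rcu_urgent) {
                cpu->drain = 0;                 /* optional passes skipped */
                cpu->holdoff = now - 1;
                return TOY_IDLE_WITH_CBS;
        } else if (--cpu->drain <= 0) {
                cpu->holdoff = now;             /* mandatory limit reached */
                return TOY_HOLDOFF;
        }
        return TOY_PUSH;
}

int main(void)
{
        struct toy_cpu cpu = { 0, 0 };
        int pass;

        /* Callbacks queued and RCU urgently needs this CPU on every pass. */
        for (pass = 0; pass < 7; pass++)
                printf("pass %d -> action %d\n", pass,
                       toy_prepare_for_idle(&cpu, 1000, true, true));
        return 0;
}

With callbacks always pending and RCU always urgent, the sketch pushes for the first several passes, then enters holdoff, which is the same cadence the per-CPU rcu_dyntick_drain and rcu_dyntick_holdoff variables produce above.
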
index 9feffa4..654cfe6 100644 (file)
@@ -67,13 +67,11 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
                   rdp->completed, rdp->gpnum,
                   rdp->passed_quiesce, rdp->passed_quiesce_gpnum,
                   rdp->qs_pending);
-#ifdef CONFIG_NO_HZ
-       seq_printf(m, " dt=%d/%d/%d df=%lu",
+       seq_printf(m, " dt=%d/%llx/%d df=%lu",
                   atomic_read(&rdp->dynticks->dynticks),
                   rdp->dynticks->dynticks_nesting,
                   rdp->dynticks->dynticks_nmi_nesting,
                   rdp->dynticks_fqs);
-#endif /* #ifdef CONFIG_NO_HZ */
        seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
        seq_printf(m, " ql=%ld qs=%c%c%c%c",
                   rdp->qlen,
@@ -141,13 +139,11 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
                   rdp->completed, rdp->gpnum,
                   rdp->passed_quiesce, rdp->passed_quiesce_gpnum,
                   rdp->qs_pending);
-#ifdef CONFIG_NO_HZ
-       seq_printf(m, ",%d,%d,%d,%lu",
+       seq_printf(m, ",%d,%llx,%d,%lu",
                   atomic_read(&rdp->dynticks->dynticks),
                   rdp->dynticks->dynticks_nesting,
                   rdp->dynticks->dynticks_nmi_nesting,
                   rdp->dynticks_fqs);
-#endif /* #ifdef CONFIG_NO_HZ */
        seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi);
        seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen,
                   ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
@@ -171,9 +167,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
 static int show_rcudata_csv(struct seq_file *m, void *unused)
 {
        seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\",");
-#ifdef CONFIG_NO_HZ
        seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\",");
-#endif /* #ifdef CONFIG_NO_HZ */
        seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\"");
 #ifdef CONFIG_RCU_BOOST
        seq_puts(m, "\"kt\",\"ktl\"");
@@ -278,7 +272,7 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
        gpnum = rsp->gpnum;
        seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
                      "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n",
-                  rsp->completed, gpnum, rsp->signaled,
+                  rsp->completed, gpnum, rsp->fqs_state,
                   (long)(rsp->jiffies_force_qs - jiffies),
                   (int)(jiffies & 0xffff),
                   rsp->n_force_qs, rsp->n_force_qs_ngp,
index 8eafd1b..16502d3 100644 (file)
@@ -101,6 +101,7 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
 
        printk("\n============================================\n");
        printk(  "[ BUG: circular locking deadlock detected! ]\n");
+       printk("%s\n", print_tainted());
        printk(  "--------------------------------------------\n");
        printk("%s/%d is deadlocking current task %s/%d\n\n",
               task->comm, task_pid_nr(task),
index f9d8482..a242e69 100644 (file)
@@ -579,7 +579,6 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
                    struct rt_mutex_waiter *waiter)
 {
        int ret = 0;
-       int was_disabled;
 
        for (;;) {
                /* Try to acquire the lock: */
@@ -602,17 +601,10 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
 
                raw_spin_unlock(&lock->wait_lock);
 
-               was_disabled = irqs_disabled();
-               if (was_disabled)
-                       local_irq_enable();
-
                debug_rt_mutex_print_deadlock(waiter);
 
                schedule_rt_mutex(lock);
 
-               if (was_disabled)
-                       local_irq_disable();
-
                raw_spin_lock(&lock->wait_lock);
                set_current_state(state);
        }
diff --git a/kernel/sched.c b/kernel/sched.c
deleted file mode 100644 (file)
index d6b149c..0000000
+++ /dev/null
@@ -1,9785 +0,0 @@
-/*
- *  kernel/sched.c
- *
- *  Kernel scheduler and related syscalls
- *
- *  Copyright (C) 1991-2002  Linus Torvalds
- *
- *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
- *             make semaphores SMP safe
- *  1998-11-19 Implemented schedule_timeout() and related stuff
- *             by Andrea Arcangeli
- *  2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
- *             hybrid priority-list and round-robin design with
- *             an array-switch method of distributing timeslices
- *             and per-CPU runqueues.  Cleanups and useful suggestions
- *             by Davide Libenzi, preemptible kernel bits by Robert Love.
- *  2003-09-03 Interactivity tuning by Con Kolivas.
- *  2004-04-02 Scheduler domains code by Nick Piggin
- *  2007-04-15  Work begun on replacing all interactivity tuning with a
- *              fair scheduling design by Con Kolivas.
- *  2007-05-05  Load balancing (smp-nice) and other improvements
- *              by Peter Williams
- *  2007-05-06  Interactivity improvements to CFS by Mike Galbraith
- *  2007-07-01  Group scheduling enhancements by Srivatsa Vaddagiri
- *  2007-11-29  RT balancing improvements by Steven Rostedt, Gregory Haskins,
- *              Thomas Gleixner, Mike Kravetz
- */
-
-#include <linux/mm.h>
-#include <linux/module.h>
-#include <linux/nmi.h>
-#include <linux/init.h>
-#include <linux/uaccess.h>
-#include <linux/highmem.h>
-#include <asm/mmu_context.h>
-#include <linux/interrupt.h>
-#include <linux/capability.h>
-#include <linux/completion.h>
-#include <linux/kernel_stat.h>
-#include <linux/debug_locks.h>
-#include <linux/perf_event.h>
-#include <linux/security.h>
-#include <linux/notifier.h>
-#include <linux/profile.h>
-#include <linux/freezer.h>
-#include <linux/vmalloc.h>
-#include <linux/blkdev.h>
-#include <linux/delay.h>
-#include <linux/pid_namespace.h>
-#include <linux/smp.h>
-#include <linux/threads.h>
-#include <linux/timer.h>
-#include <linux/rcupdate.h>
-#include <linux/cpu.h>
-#include <linux/cpuset.h>
-#include <linux/percpu.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/stop_machine.h>
-#include <linux/sysctl.h>
-#include <linux/syscalls.h>
-#include <linux/times.h>
-#include <linux/tsacct_kern.h>
-#include <linux/kprobes.h>
-#include <linux/delayacct.h>
-#include <linux/unistd.h>
-#include <linux/pagemap.h>
-#include <linux/hrtimer.h>
-#include <linux/tick.h>
-#include <linux/debugfs.h>
-#include <linux/ctype.h>
-#include <linux/ftrace.h>
-#include <linux/slab.h>
-#include <linux/init_task.h>
-
-#include <asm/tlb.h>
-#include <asm/irq_regs.h>
-#include <asm/mutex.h>
-#ifdef CONFIG_PARAVIRT
-#include <asm/paravirt.h>
-#endif
-
-#include "sched_cpupri.h"
-#include "workqueue_sched.h"
-#include "sched_autogroup.h"
-
-#define CREATE_TRACE_POINTS
-#include <trace/events/sched.h>
-
-/*
- * Convert user-nice values [ -20 ... 0 ... 19 ]
- * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
- * and back.
- */
-#define NICE_TO_PRIO(nice)     (MAX_RT_PRIO + (nice) + 20)
-#define PRIO_TO_NICE(prio)     ((prio) - MAX_RT_PRIO - 20)
-#define TASK_NICE(p)           PRIO_TO_NICE((p)->static_prio)
-
-/*
- * 'User priority' is the nice value converted to something we
- * can work with better when scaling various scheduler parameters,
- * it's a [ 0 ... 39 ] range.
- */
-#define USER_PRIO(p)           ((p)-MAX_RT_PRIO)
-#define TASK_USER_PRIO(p)      USER_PRIO((p)->static_prio)
-#define MAX_USER_PRIO          (USER_PRIO(MAX_PRIO))
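
Concretely, with the constants these macros have historically used (MAX_RT_PRIO of 100, giving MAX_PRIO of 140), the arithmetic works out as in the standalone snippet below; it restates the macros outside the kernel purely for illustration.

#include <assert.h>
#include <stdio.h>

#define MAX_RT_PRIO 100
#define MAX_PRIO    (MAX_RT_PRIO + 40)

#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
#define USER_PRIO(p)       ((p) - MAX_RT_PRIO)
#define MAX_USER_PRIO      (USER_PRIO(MAX_PRIO))

int main(void)
{
        assert(NICE_TO_PRIO(-20) == 100);       /* highest non-RT priority */
        assert(NICE_TO_PRIO(0)   == 120);       /* default                 */
        assert(NICE_TO_PRIO(19)  == 139);       /* lowest priority         */
        assert(PRIO_TO_NICE(120) == 0);
        assert(USER_PRIO(NICE_TO_PRIO(0)) == 20);
        assert(MAX_USER_PRIO == 40);            /* user prio range 0..39   */
        printf("nice 0 -> prio %d\n", NICE_TO_PRIO(0));
        return 0;
}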
-
-/*
- * Helpers for converting nanosecond timing to jiffy resolution
- */
-#define NS_TO_JIFFIES(TIME)    ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
-
-#define NICE_0_LOAD            SCHED_LOAD_SCALE
-#define NICE_0_SHIFT           SCHED_LOAD_SHIFT
-
-/*
- * These are the 'tuning knobs' of the scheduler:
- *
- * default timeslice is 100 msecs (used only for SCHED_RR tasks).
- * Timeslices get refilled after they expire.
- */
-#define DEF_TIMESLICE          (100 * HZ / 1000)
-
-/*
- * single value that denotes runtime == period, ie unlimited time.
- */
-#define RUNTIME_INF    ((u64)~0ULL)
-
-static inline int rt_policy(int policy)
-{
-       if (policy == SCHED_FIFO || policy == SCHED_RR)
-               return 1;
-       return 0;
-}
-
-static inline int task_has_rt_policy(struct task_struct *p)
-{
-       return rt_policy(p->policy);
-}
-
-/*
- * This is the priority-queue data structure of the RT scheduling class:
- */
-struct rt_prio_array {
-       DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
-       struct list_head queue[MAX_RT_PRIO];
-};
-
-struct rt_bandwidth {
-       /* nests inside the rq lock: */
-       raw_spinlock_t          rt_runtime_lock;
-       ktime_t                 rt_period;
-       u64                     rt_runtime;
-       struct hrtimer          rt_period_timer;
-};
-
-static struct rt_bandwidth def_rt_bandwidth;
-
-static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
-
-static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
-{
-       struct rt_bandwidth *rt_b =
-               container_of(timer, struct rt_bandwidth, rt_period_timer);
-       ktime_t now;
-       int overrun;
-       int idle = 0;
-
-       for (;;) {
-               now = hrtimer_cb_get_time(timer);
-               overrun = hrtimer_forward(timer, now, rt_b->rt_period);
-
-               if (!overrun)
-                       break;
-
-               idle = do_sched_rt_period_timer(rt_b, overrun);
-       }
-
-       return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
-}
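
The loop above relies on hrtimer_forward() advancing the timer's expiry by whole periods until it lies in the future and returning the number of periods skipped (the overrun count), so accounting continues until the timer is genuinely rearmed ahead of "now". A simplified integer model of that arithmetic, not the real ktime-based implementation and with names invented for the sketch:

#include <assert.h>

static unsigned long toy_forward(unsigned long *expires, unsigned long now,
                                 unsigned long period)
{
        unsigned long overrun = 0;

        while (*expires <= now) {       /* expiry still in the past */
                *expires += period;
                overrun++;
        }
        return overrun;
}

int main(void)
{
        unsigned long expires = 100;

        /* Timer armed for t=100; we are now at t=350 with a period of 100. */
        assert(toy_forward(&expires, 350, 100) == 3);
        assert(expires == 400);
        /* Called again without time passing: zero overrun, loop would stop. */
        assert(toy_forward(&expires, 350, 100) == 0);
        return 0;
}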
-
-static
-void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
-{
-       rt_b->rt_period = ns_to_ktime(period);
-       rt_b->rt_runtime = runtime;
-
-       raw_spin_lock_init(&rt_b->rt_runtime_lock);
-
-       hrtimer_init(&rt_b->rt_period_timer,
-                       CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-       rt_b->rt_period_timer.function = sched_rt_period_timer;
-}
-
-static inline int rt_bandwidth_enabled(void)
-{
-       return sysctl_sched_rt_runtime >= 0;
-}
-
-static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
-{
-       unsigned long delta;
-       ktime_t soft, hard, now;
-
-       for (;;) {
-               if (hrtimer_active(period_timer))
-                       break;
-
-               now = hrtimer_cb_get_time(period_timer);
-               hrtimer_forward(period_timer, now, period);
-
-               soft = hrtimer_get_softexpires(period_timer);
-               hard = hrtimer_get_expires(period_timer);
-               delta = ktime_to_ns(ktime_sub(hard, soft));
-               __hrtimer_start_range_ns(period_timer, soft, delta,
-                                        HRTIMER_MODE_ABS_PINNED, 0);
-       }
-}
-
-static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
-{
-       if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
-               return;
-
-       if (hrtimer_active(&rt_b->rt_period_timer))
-               return;
-
-       raw_spin_lock(&rt_b->rt_runtime_lock);
-       start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
-       raw_spin_unlock(&rt_b->rt_runtime_lock);
-}
-
-#ifdef CONFIG_RT_GROUP_SCHED
-static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
-{
-       hrtimer_cancel(&rt_b->rt_period_timer);
-}
-#endif
-
-/*
- * sched_domains_mutex serializes calls to init_sched_domains,
- * detach_destroy_domains and partition_sched_domains.
- */
-static DEFINE_MUTEX(sched_domains_mutex);
-
-#ifdef CONFIG_CGROUP_SCHED
-
-#include <linux/cgroup.h>
-
-struct cfs_rq;
-
-static LIST_HEAD(task_groups);
-
-struct cfs_bandwidth {
-#ifdef CONFIG_CFS_BANDWIDTH
-       raw_spinlock_t lock;
-       ktime_t period;
-       u64 quota, runtime;
-       s64 hierarchal_quota;
-       u64 runtime_expires;
-
-       int idle, timer_active;
-       struct hrtimer period_timer, slack_timer;
-       struct list_head throttled_cfs_rq;
-
-       /* statistics */
-       int nr_periods, nr_throttled;
-       u64 throttled_time;
-#endif
-};
-
-/* task group related information */
-struct task_group {
-       struct cgroup_subsys_state css;
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-       /* schedulable entities of this group on each cpu */
-       struct sched_entity **se;
-       /* runqueue "owned" by this group on each cpu */
-       struct cfs_rq **cfs_rq;
-       unsigned long shares;
-
-       atomic_t load_weight;
-#endif
-
-#ifdef CONFIG_RT_GROUP_SCHED
-       struct sched_rt_entity **rt_se;
-       struct rt_rq **rt_rq;
-
-       struct rt_bandwidth rt_bandwidth;
-#endif
-
-       struct rcu_head rcu;
-       struct list_head list;
-
-       struct task_group *parent;
-       struct list_head siblings;
-       struct list_head children;
-
-#ifdef CONFIG_SCHED_AUTOGROUP
-       struct autogroup *autogroup;
-#endif
-
-       struct cfs_bandwidth cfs_bandwidth;
-};
-
-/* task_group_lock serializes the addition/removal of task groups */
-static DEFINE_SPINLOCK(task_group_lock);
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-
-# define ROOT_TASK_GROUP_LOAD  NICE_0_LOAD
-
-/*
- * A weight of 0 or 1 can cause arithmetics problems.
- * A weight of a cfs_rq is the sum of weights of which entities
- * are queued on this cfs_rq, so a weight of a entity should not be
- * too large, so as the shares value of a task group.
- * (The default weight is 1024 - so there's no practical
- *  limitation from this.)
- */
-#define MIN_SHARES     (1UL <<  1)
-#define MAX_SHARES     (1UL << 18)
-
-static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
-#endif
-
-/* Default task group.
- *     Every task in system belong to this group at bootup.
- */
-struct task_group root_task_group;
-
-#endif /* CONFIG_CGROUP_SCHED */
-
-/* CFS-related fields in a runqueue */
-struct cfs_rq {
-       struct load_weight load;
-       unsigned long nr_running, h_nr_running;
-
-       u64 exec_clock;
-       u64 min_vruntime;
-#ifndef CONFIG_64BIT
-       u64 min_vruntime_copy;
-#endif
-
-       struct rb_root tasks_timeline;
-       struct rb_node *rb_leftmost;
-
-       struct list_head tasks;
-       struct list_head *balance_iterator;
-
-       /*
-        * 'curr' points to currently running entity on this cfs_rq.
-        * It is set to NULL otherwise (i.e when none are currently running).
-        */
-       struct sched_entity *curr, *next, *last, *skip;
-
-#ifdef CONFIG_SCHED_DEBUG
-       unsigned int nr_spread_over;
-#endif
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-       struct rq *rq;  /* cpu runqueue to which this cfs_rq is attached */
-
-       /*
-        * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
-        * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
-        * (like users, containers etc.)
-        *
-        * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
-        * list is used during load balance.
-        */
-       int on_list;
-       struct list_head leaf_cfs_rq_list;
-       struct task_group *tg;  /* group that "owns" this runqueue */
-
-#ifdef CONFIG_SMP
-       /*
-        * the part of load.weight contributed by tasks
-        */
-       unsigned long task_weight;
-
-       /*
-        *   h_load = weight * f(tg)
-        *
-        * Where f(tg) is the recursive weight fraction assigned to
-        * this group.
-        */
-       unsigned long h_load;
-
-       /*
-        * Maintaining per-cpu shares distribution for group scheduling
-        *
-        * load_stamp is the last time we updated the load average
-        * load_last is the last time we updated the load average and saw load
-        * load_unacc_exec_time is currently unaccounted execution time
-        */
-       u64 load_avg;
-       u64 load_period;
-       u64 load_stamp, load_last, load_unacc_exec_time;
-
-       unsigned long load_contribution;
-#endif
-#ifdef CONFIG_CFS_BANDWIDTH
-       int runtime_enabled;
-       u64 runtime_expires;
-       s64 runtime_remaining;
-
-       u64 throttled_timestamp;
-       int throttled, throttle_count;
-       struct list_head throttled_list;
-#endif
-#endif
-};
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-#ifdef CONFIG_CFS_BANDWIDTH
-static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
-{
-       return &tg->cfs_bandwidth;
-}
-
-static inline u64 default_cfs_period(void);
-static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
-static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
-
-static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
-{
-       struct cfs_bandwidth *cfs_b =
-               container_of(timer, struct cfs_bandwidth, slack_timer);
-       do_sched_cfs_slack_timer(cfs_b);
-
-       return HRTIMER_NORESTART;
-}
-
-static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
-{
-       struct cfs_bandwidth *cfs_b =
-               container_of(timer, struct cfs_bandwidth, period_timer);
-       ktime_t now;
-       int overrun;
-       int idle = 0;
-
-       for (;;) {
-               now = hrtimer_cb_get_time(timer);
-               overrun = hrtimer_forward(timer, now, cfs_b->period);
-
-               if (!overrun)
-                       break;
-
-               idle = do_sched_cfs_period_timer(cfs_b, overrun);
-       }
-
-       return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
-}
-
-static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
-{
-       raw_spin_lock_init(&cfs_b->lock);
-       cfs_b->runtime = 0;
-       cfs_b->quota = RUNTIME_INF;
-       cfs_b->period = ns_to_ktime(default_cfs_period());
-
-       INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
-       hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-       cfs_b->period_timer.function = sched_cfs_period_timer;
-       hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-       cfs_b->slack_timer.function = sched_cfs_slack_timer;
-}
-
-static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
-{
-       cfs_rq->runtime_enabled = 0;
-       INIT_LIST_HEAD(&cfs_rq->throttled_list);
-}
-
-/* requires cfs_b->lock, may release to reprogram timer */
-static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
-{
-       /*
-        * The timer may be active because we're trying to set a new bandwidth
-        * period or because we're racing with the tear-down path
-        * (timer_active==0 becomes visible before the hrtimer call-back
-        * terminates).  In either case we ensure that it's re-programmed
-        */
-       while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
-               raw_spin_unlock(&cfs_b->lock);
-               /* ensure cfs_b->lock is available while we wait */
-               hrtimer_cancel(&cfs_b->period_timer);
-
-               raw_spin_lock(&cfs_b->lock);
-               /* if someone else restarted the timer then we're done */
-               if (cfs_b->timer_active)
-                       return;
-       }
-
-       cfs_b->timer_active = 1;
-       start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
-}
-
-static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
-{
-       hrtimer_cancel(&cfs_b->period_timer);
-       hrtimer_cancel(&cfs_b->slack_timer);
-}
-#else
-static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
-static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
-static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
-
-static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
-{
-       return NULL;
-}
-#endif /* CONFIG_CFS_BANDWIDTH */
-#endif /* CONFIG_FAIR_GROUP_SCHED */
-
-/* Real-Time classes' related field in a runqueue: */
-struct rt_rq {
-       struct rt_prio_array active;
-       unsigned long rt_nr_running;
-#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-       struct {
-               int curr; /* highest queued rt task prio */
-#ifdef CONFIG_SMP
-               int next; /* next highest */
-#endif
-       } highest_prio;
-#endif
-#ifdef CONFIG_SMP
-       unsigned long rt_nr_migratory;
-       unsigned long rt_nr_total;
-       int overloaded;
-       struct plist_head pushable_tasks;
-#endif
-       int rt_throttled;
-       u64 rt_time;
-       u64 rt_runtime;
-       /* Nests inside the rq lock: */
-       raw_spinlock_t rt_runtime_lock;
-
-#ifdef CONFIG_RT_GROUP_SCHED
-       unsigned long rt_nr_boosted;
-
-       struct rq *rq;
-       struct list_head leaf_rt_rq_list;
-       struct task_group *tg;
-#endif
-};
-
-#ifdef CONFIG_SMP
-
-/*
- * We add the notion of a root-domain which will be used to define per-domain
- * variables. Each exclusive cpuset essentially defines an island domain by
- * fully partitioning the member cpus from any other cpuset. Whenever a new
- * exclusive cpuset is created, we also create and attach a new root-domain
- * object.
- *
- */
-struct root_domain {
-       atomic_t refcount;
-       atomic_t rto_count;
-       struct rcu_head rcu;
-       cpumask_var_t span;
-       cpumask_var_t online;
-
-       /*
-        * The "RT overload" flag: it gets set if a CPU has more than
-        * one runnable RT task.
-        */
-       cpumask_var_t rto_mask;
-       struct cpupri cpupri;
-};
-
-/*
- * By default the system creates a single root-domain with all cpus as
- * members (mimicking the global state we have today).
- */
-static struct root_domain def_root_domain;
-
-#endif /* CONFIG_SMP */
-
-/*
- * This is the main, per-CPU runqueue data structure.
- *
- * Locking rule: those places that want to lock multiple runqueues
- * (such as the load balancing or the thread migration code), lock
- * acquire operations must be ordered by ascending &runqueue.
- */
-struct rq {
-       /* runqueue lock: */
-       raw_spinlock_t lock;
-
-       /*
-        * nr_running and cpu_load should be in the same cacheline because
-        * remote CPUs use both these fields when doing load calculation.
-        */
-       unsigned long nr_running;
-       #define CPU_LOAD_IDX_MAX 5
-       unsigned long cpu_load[CPU_LOAD_IDX_MAX];
-       unsigned long last_load_update_tick;
-#ifdef CONFIG_NO_HZ
-       u64 nohz_stamp;
-       unsigned char nohz_balance_kick;
-#endif
-       int skip_clock_update;
-
-       /* capture load from *all* tasks on this cpu: */
-       struct load_weight load;
-       unsigned long nr_load_updates;
-       u64 nr_switches;
-
-       struct cfs_rq cfs;
-       struct rt_rq rt;
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-       /* list of leaf cfs_rq on this cpu: */
-       struct list_head leaf_cfs_rq_list;
-#endif
-#ifdef CONFIG_RT_GROUP_SCHED
-       struct list_head leaf_rt_rq_list;
-#endif
-
-       /*
-        * This is part of a global counter where only the total sum
-        * over all CPUs matters. A task can increase this counter on
-        * one CPU and if it got migrated afterwards it may decrease
-        * it on another CPU. Always updated under the runqueue lock:
-        */
-       unsigned long nr_uninterruptible;
-
-       struct task_struct *curr, *idle, *stop;
-       unsigned long next_balance;
-       struct mm_struct *prev_mm;
-
-       u64 clock;
-       u64 clock_task;
-
-       atomic_t nr_iowait;
-
-#ifdef CONFIG_SMP
-       struct root_domain *rd;
-       struct sched_domain *sd;
-
-       unsigned long cpu_power;
-
-       unsigned char idle_balance;
-       /* For active balancing */
-       int post_schedule;
-       int active_balance;
-       int push_cpu;
-       struct cpu_stop_work active_balance_work;
-       /* cpu of this runqueue: */
-       int cpu;
-       int online;
-
-       u64 rt_avg;
-       u64 age_stamp;
-       u64 idle_stamp;
-       u64 avg_idle;
-#endif
-
-#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-       u64 prev_irq_time;
-#endif
-#ifdef CONFIG_PARAVIRT
-       u64 prev_steal_time;
-#endif
-#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
-       u64 prev_steal_time_rq;
-#endif
-
-       /* calc_load related fields */
-       unsigned long calc_load_update;
-       long calc_load_active;
-
-#ifdef CONFIG_SCHED_HRTICK
-#ifdef CONFIG_SMP
-       int hrtick_csd_pending;
-       struct call_single_data hrtick_csd;
-#endif
-       struct hrtimer hrtick_timer;
-#endif
-
-#ifdef CONFIG_SCHEDSTATS
-       /* latency stats */
-       struct sched_info rq_sched_info;
-       unsigned long long rq_cpu_time;
-       /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
-
-       /* sys_sched_yield() stats */
-       unsigned int yld_count;
-
-       /* schedule() stats */
-       unsigned int sched_switch;
-       unsigned int sched_count;
-       unsigned int sched_goidle;
-
-       /* try_to_wake_up() stats */
-       unsigned int ttwu_count;
-       unsigned int ttwu_local;
-#endif
-
-#ifdef CONFIG_SMP
-       struct llist_head wake_list;
-#endif
-};
-
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
-
-
-static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
-
-static inline int cpu_of(struct rq *rq)
-{
-#ifdef CONFIG_SMP
-       return rq->cpu;
-#else
-       return 0;
-#endif
-}
-
-#define rcu_dereference_check_sched_domain(p) \
-       rcu_dereference_check((p), \
-                             lockdep_is_held(&sched_domains_mutex))
-
-/*
- * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
- * See detach_destroy_domains: synchronize_sched for details.
- *
- * The domain tree of any CPU may only be accessed from within
- * preempt-disabled sections.
- */
-#define for_each_domain(cpu, __sd) \
-       for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
-
-#define cpu_rq(cpu)            (&per_cpu(runqueues, (cpu)))
-#define this_rq()              (&__get_cpu_var(runqueues))
-#define task_rq(p)             cpu_rq(task_cpu(p))
-#define cpu_curr(cpu)          (cpu_rq(cpu)->curr)
-#define raw_rq()               (&__raw_get_cpu_var(runqueues))
-
-#ifdef CONFIG_CGROUP_SCHED
-
-/*
- * Return the group to which this tasks belongs.
- *
- * We use task_subsys_state_check() and extend the RCU verification with
- * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
- * task it moves into the cgroup. Therefore by holding either of those locks,
- * we pin the task to the current cgroup.
- */
-static inline struct task_group *task_group(struct task_struct *p)
-{
-       struct task_group *tg;
-       struct cgroup_subsys_state *css;
-
-       css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
-                       lockdep_is_held(&p->pi_lock) ||
-                       lockdep_is_held(&task_rq(p)->lock));
-       tg = container_of(css, struct task_group, css);
-
-       return autogroup_task_group(p, tg);
-}
-
-/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
-static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
-{
-#ifdef CONFIG_FAIR_GROUP_SCHED
-       p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
-       p->se.parent = task_group(p)->se[cpu];
-#endif
-
-#ifdef CONFIG_RT_GROUP_SCHED
-       p->rt.rt_rq  = task_group(p)->rt_rq[cpu];
-       p->rt.parent = task_group(p)->rt_se[cpu];
-#endif
-}
-
-#else /* CONFIG_CGROUP_SCHED */
-
-static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
-static inline struct task_group *task_group(struct task_struct *p)
-{
-       return NULL;
-}
-
-#endif /* CONFIG_CGROUP_SCHED */
-
-static void update_rq_clock_task(struct rq *rq, s64 delta);
-
-static void update_rq_clock(struct rq *rq)
-{
-       s64 delta;
-
-       if (rq->skip_clock_update > 0)
-               return;
-
-       delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
-       rq->clock += delta;
-       update_rq_clock_task(rq, delta);
-}
-
-/*
- * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
- */
-#ifdef CONFIG_SCHED_DEBUG
-# define const_debug __read_mostly
-#else
-# define const_debug static const
-#endif
-
-/**
- * runqueue_is_locked - Returns true if the current cpu runqueue is locked
- * @cpu: the processor in question.
- *
- * This interface allows printk to be called with the runqueue lock
- * held and know whether or not it is OK to wake up the klogd.
- */
-int runqueue_is_locked(int cpu)
-{
-       return raw_spin_is_locked(&cpu_rq(cpu)->lock);
-}
-
-/*
- * Debugging: various feature bits
- */
-
-#define SCHED_FEAT(name, enabled)      \
-       __SCHED_FEAT_##name ,
-
-enum {
-#include "sched_features.h"
-};
-
-#undef SCHED_FEAT
-
-#define SCHED_FEAT(name, enabled)      \
-       (1UL << __SCHED_FEAT_##name) * enabled |
-
-const_debug unsigned int sysctl_sched_features =
-#include "sched_features.h"
-       0;
-
-#undef SCHED_FEAT
-
-#ifdef CONFIG_SCHED_DEBUG
-#define SCHED_FEAT(name, enabled)      \
-       #name ,
-
-static __read_mostly char *sched_feat_names[] = {
-#include "sched_features.h"
-       NULL
-};
-
-#undef SCHED_FEAT
-
-static int sched_feat_show(struct seq_file *m, void *v)
-{
-       int i;
-
-       for (i = 0; sched_feat_names[i]; i++) {
-               if (!(sysctl_sched_features & (1UL << i)))
-                       seq_puts(m, "NO_");
-               seq_printf(m, "%s ", sched_feat_names[i]);
-       }
-       seq_puts(m, "\n");
-
-       return 0;
-}
-
-static ssize_t
-sched_feat_write(struct file *filp, const char __user *ubuf,
-               size_t cnt, loff_t *ppos)
-{
-       char buf[64];
-       char *cmp;
-       int neg = 0;
-       int i;
-
-       if (cnt > 63)
-               cnt = 63;
-
-       if (copy_from_user(&buf, ubuf, cnt))
-               return -EFAULT;
-
-       buf[cnt] = 0;
-       cmp = strstrip(buf);
-
-       if (strncmp(cmp, "NO_", 3) == 0) {
-               neg = 1;
-               cmp += 3;
-       }
-
-       for (i = 0; sched_feat_names[i]; i++) {
-               if (strcmp(cmp, sched_feat_names[i]) == 0) {
-                       if (neg)
-                               sysctl_sched_features &= ~(1UL << i);
-                       else
-                               sysctl_sched_features |= (1UL << i);
-                       break;
-               }
-       }
-
-       if (!sched_feat_names[i])
-               return -EINVAL;
-
-       *ppos += cnt;
-
-       return cnt;
-}
-
-static int sched_feat_open(struct inode *inode, struct file *filp)
-{
-       return single_open(filp, sched_feat_show, NULL);
-}
-
-static const struct file_operations sched_feat_fops = {
-       .open           = sched_feat_open,
-       .write          = sched_feat_write,
-       .read           = seq_read,
-       .llseek         = seq_lseek,
-       .release        = single_release,
-};
-
-static __init int sched_init_debug(void)
-{
-       debugfs_create_file("sched_features", 0644, NULL, NULL,
-                       &sched_feat_fops);
-
-       return 0;
-}
-late_initcall(sched_init_debug);
-
-#endif
-
-#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
-
-/*
- * Number of tasks to iterate in a single balance run.
- * Limited because this is done with IRQs disabled.
- */
-const_debug unsigned int sysctl_sched_nr_migrate = 32;
-
-/*
- * period over which we average the RT time consumption, measured
- * in ms.
- *
- * default: 1s
- */
-const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
-
-/*
- * period over which we measure -rt task cpu usage in us.
- * default: 1s
- */
-unsigned int sysctl_sched_rt_period = 1000000;
-
-static __read_mostly int scheduler_running;
-
-/*
- * part of the period that we allow rt tasks to run in us.
- * default: 0.95s
- */
-int sysctl_sched_rt_runtime = 950000;
-
-static inline u64 global_rt_period(void)
-{
-       return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
-}
-
-static inline u64 global_rt_runtime(void)
-{
-       if (sysctl_sched_rt_runtime < 0)
-               return RUNTIME_INF;
-
-       return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
-}
-
-#ifndef prepare_arch_switch
-# define prepare_arch_switch(next)     do { } while (0)
-#endif
-#ifndef finish_arch_switch
-# define finish_arch_switch(prev)      do { } while (0)
-#endif
-
-static inline int task_current(struct rq *rq, struct task_struct *p)
-{
-       return rq->curr == p;
-}
-
-static inline int task_running(struct rq *rq, struct task_struct *p)
-{
-#ifdef CONFIG_SMP
-       return p->on_cpu;
-#else
-       return task_current(rq, p);
-#endif
-}
-
-#ifndef __ARCH_WANT_UNLOCKED_CTXSW
-static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
-{
-#ifdef CONFIG_SMP
-       /*
-        * We can optimise this out completely for !SMP, because the
-        * SMP rebalancing from interrupt is the only thing that cares
-        * here.
-        */
-       next->on_cpu = 1;
-#endif
-}
-
-static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
-{
-#ifdef CONFIG_SMP
-       /*
-        * After ->on_cpu is cleared, the task can be moved to a different CPU.
-        * We must ensure this doesn't happen until the switch is completely
-        * finished.
-        */
-       smp_wmb();
-       prev->on_cpu = 0;
-#endif
-#ifdef CONFIG_DEBUG_SPINLOCK
-       /* this is a valid case when another task releases the spinlock */
-       rq->lock.owner = current;
-#endif
-       /*
-        * If we are tracking spinlock dependencies then we have to
-        * fix up the runqueue lock - which gets 'carried over' from
-        * prev into current:
-        */
-       spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
-
-       raw_spin_unlock_irq(&rq->lock);
-}
-
-#else /* __ARCH_WANT_UNLOCKED_CTXSW */
-static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
-{
-#ifdef CONFIG_SMP
-       /*
-        * We can optimise this out completely for !SMP, because the
-        * SMP rebalancing from interrupt is the only thing that cares
-        * here.
-        */
-       next->on_cpu = 1;
-#endif
-#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
-       raw_spin_unlock_irq(&rq->lock);
-#else
-       raw_spin_unlock(&rq->lock);
-#endif
-}
-
-static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
-{
-#ifdef CONFIG_SMP
-       /*
-        * After ->on_cpu is cleared, the task can be moved to a different CPU.
-        * We must ensure this doesn't happen until the switch is completely
-        * finished.
-        */
-       smp_wmb();
-       prev->on_cpu = 0;
-#endif
-#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
-       local_irq_enable();
-#endif
-}
-#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
-
-/*
- * __task_rq_lock - lock the rq @p resides on.
- */
-static inline struct rq *__task_rq_lock(struct task_struct *p)
-       __acquires(rq->lock)
-{
-       struct rq *rq;
-
-       lockdep_assert_held(&p->pi_lock);
-
-       for (;;) {
-               rq = task_rq(p);
-               raw_spin_lock(&rq->lock);
-               if (likely(rq == task_rq(p)))
-                       return rq;
-               raw_spin_unlock(&rq->lock);
-       }
-}
-
-/*
- * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
- */
-static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
-       __acquires(p->pi_lock)
-       __acquires(rq->lock)
-{
-       struct rq *rq;
-
-       for (;;) {
-               raw_spin_lock_irqsave(&p->pi_lock, *flags);
-               rq = task_rq(p);
-               raw_spin_lock(&rq->lock);
-               if (likely(rq == task_rq(p)))
-                       return rq;
-               raw_spin_unlock(&rq->lock);
-               raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
-       }
-}
-
-static void __task_rq_unlock(struct rq *rq)
-       __releases(rq->lock)
-{
-       raw_spin_unlock(&rq->lock);
-}
-
-static inline void
-task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
-       __releases(rq->lock)
-       __releases(p->pi_lock)
-{
-       raw_spin_unlock(&rq->lock);
-       raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
-}
-
-/*
- * this_rq_lock - lock this runqueue and disable interrupts.
- */
-static struct rq *this_rq_lock(void)
-       __acquires(rq->lock)
-{
-       struct rq *rq;
-
-       local_irq_disable();
-       rq = this_rq();
-       raw_spin_lock(&rq->lock);
-
-       return rq;
-}
-
-#ifdef CONFIG_SCHED_HRTICK
-/*
- * Use HR-timers to deliver accurate preemption points.
- *
- * It's all a bit involved since we cannot program an hrt while holding the
- * rq->lock. So what we do is store a state in rq->hrtick_* and ask for a
- * reschedule event.
- *
- * When we get rescheduled we reprogram the hrtick_timer outside of the
- * rq->lock.
- */
-
-/*
- * Use hrtick when:
- *  - enabled by features
- *  - hrtimer is actually high res
- */
-static inline int hrtick_enabled(struct rq *rq)
-{
-       if (!sched_feat(HRTICK))
-               return 0;
-       if (!cpu_active(cpu_of(rq)))
-               return 0;
-       return hrtimer_is_hres_active(&rq->hrtick_timer);
-}
-
-static void hrtick_clear(struct rq *rq)
-{
-       if (hrtimer_active(&rq->hrtick_timer))
-               hrtimer_cancel(&rq->hrtick_timer);
-}
-
-/*
- * High-resolution timer tick.
- * Runs from hardirq context with interrupts disabled.
- */
-static enum hrtimer_restart hrtick(struct hrtimer *timer)
-{
-       struct rq *rq = container_of(timer, struct rq, hrtick_timer);
-
-       WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
-
-       raw_spin_lock(&rq->lock);
-       update_rq_clock(rq);
-       rq->curr->sched_class->task_tick(rq, rq->curr, 1);
-       raw_spin_unlock(&rq->lock);
-
-       return HRTIMER_NORESTART;
-}
-
-#ifdef CONFIG_SMP
-/*
- * called from hardirq (IPI) context
- */
-static void __hrtick_start(void *arg)
-{
-       struct rq *rq = arg;
-
-       raw_spin_lock(&rq->lock);
-       hrtimer_restart(&rq->hrtick_timer);
-       rq->hrtick_csd_pending = 0;
-       raw_spin_unlock(&rq->lock);
-}
-
-/*
- * Called to set the hrtick timer state.
- *
- * called with rq->lock held and irqs disabled
- */
-static void hrtick_start(struct rq *rq, u64 delay)
-{
-       struct hrtimer *timer = &rq->hrtick_timer;
-       ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
-
-       hrtimer_set_expires(timer, time);
-
-       if (rq == this_rq()) {
-               hrtimer_restart(timer);
-       } else if (!rq->hrtick_csd_pending) {
-               __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
-               rq->hrtick_csd_pending = 1;
-       }
-}
-
-static int
-hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
-{
-       int cpu = (int)(long)hcpu;
-
-       switch (action) {
-       case CPU_UP_CANCELED:
-       case CPU_UP_CANCELED_FROZEN:
-       case CPU_DOWN_PREPARE:
-       case CPU_DOWN_PREPARE_FROZEN:
-       case CPU_DEAD:
-       case CPU_DEAD_FROZEN:
-               hrtick_clear(cpu_rq(cpu));
-               return NOTIFY_OK;
-       }
-
-       return NOTIFY_DONE;
-}
-
-static __init void init_hrtick(void)
-{
-       hotcpu_notifier(hotplug_hrtick, 0);
-}
-#else
-/*
- * Called to set the hrtick timer state.
- *
- * called with rq->lock held and irqs disabled
- */
-static void hrtick_start(struct rq *rq, u64 delay)
-{
-       __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
-                       HRTIMER_MODE_REL_PINNED, 0);
-}
-
-static inline void init_hrtick(void)
-{
-}
-#endif /* CONFIG_SMP */
-
-static void init_rq_hrtick(struct rq *rq)
-{
-#ifdef CONFIG_SMP
-       rq->hrtick_csd_pending = 0;
-
-       rq->hrtick_csd.flags = 0;
-       rq->hrtick_csd.func = __hrtick_start;
-       rq->hrtick_csd.info = rq;
-#endif
-
-       hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-       rq->hrtick_timer.function = hrtick;
-}
-#else  /* CONFIG_SCHED_HRTICK */
-static inline void hrtick_clear(struct rq *rq)
-{
-}
-
-static inline void init_rq_hrtick(struct rq *rq)
-{
-}
-
-static inline void init_hrtick(void)
-{
-}
-#endif /* CONFIG_SCHED_HRTICK */
-
-/*
- * resched_task - mark a task 'to be rescheduled now'.
- *
- * On UP this means the setting of the need_resched flag, on SMP it
- * might also involve a cross-CPU call to trigger the scheduler on
- * the target CPU.
- */
-#ifdef CONFIG_SMP
-
-#ifndef tsk_is_polling
-#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
-#endif
-
-static void resched_task(struct task_struct *p)
-{
-       int cpu;
-
-       assert_raw_spin_locked(&task_rq(p)->lock);
-
-       if (test_tsk_need_resched(p))
-               return;
-
-       set_tsk_need_resched(p);
-
-       cpu = task_cpu(p);
-       if (cpu == smp_processor_id())
-               return;
-
-       /* NEED_RESCHED must be visible before we test polling */
-       smp_mb();
-       if (!tsk_is_polling(p))
-               smp_send_reschedule(cpu);
-}
-
-static void resched_cpu(int cpu)
-{
-       struct rq *rq = cpu_rq(cpu);
-       unsigned long flags;
-
-       if (!raw_spin_trylock_irqsave(&rq->lock, flags))
-               return;
-       resched_task(cpu_curr(cpu));
-       raw_spin_unlock_irqrestore(&rq->lock, flags);
-}
-
-#ifdef CONFIG_NO_HZ
-/*
- * In the semi idle case, use the nearest busy cpu for migrating timers
- * from an idle cpu.  This is good for power-savings.
- *
- * We don't do similar optimization for completely idle system, as
- * selecting an idle cpu will add more delays to the timers than intended
- * (as that cpu's timer base may not be up to date w.r.t. jiffies etc.).
- */
-int get_nohz_timer_target(void)
-{
-       int cpu = smp_processor_id();
-       int i;
-       struct sched_domain *sd;
-
-       rcu_read_lock();
-       for_each_domain(cpu, sd) {
-               for_each_cpu(i, sched_domain_span(sd)) {
-                       if (!idle_cpu(i)) {
-                               cpu = i;
-                               goto unlock;
-                       }
-               }
-       }
-unlock:
-       rcu_read_unlock();
-       return cpu;
-}
-/*
- * When add_timer_on() enqueues a timer into the timer wheel of an
- * idle CPU then this timer might expire before the next timer event
- * which is scheduled to wake up that CPU. In case of a completely
- * idle system the next event might even be infinite time into the
- * future. wake_up_idle_cpu() ensures that the CPU is woken up and
- * leaves the inner idle loop so the newly added timer is taken into
- * account when the CPU goes back to idle and evaluates the timer
- * wheel for the next timer event.
- */
-void wake_up_idle_cpu(int cpu)
-{
-       struct rq *rq = cpu_rq(cpu);
-
-       if (cpu == smp_processor_id())
-               return;
-
-       /*
-        * This is safe, as this function is called with the timer
-        * wheel base lock of (cpu) held. When the CPU is on the way
-        * to idle and has not yet set rq->curr to idle then it will
-        * be serialized on the timer wheel base lock and take the new
-        * timer into account automatically.
-        */
-       if (rq->curr != rq->idle)
-               return;
-
-       /*
-        * We can set TIF_RESCHED on the idle task of the other CPU
-        * lockless. The worst case is that the other CPU runs the
-        * idle task through an additional NOOP schedule()
-        */
-       set_tsk_need_resched(rq->idle);
-
-       /* NEED_RESCHED must be visible before we test polling */
-       smp_mb();
-       if (!tsk_is_polling(rq->idle))
-               smp_send_reschedule(cpu);
-}
-
-static inline bool got_nohz_idle_kick(void)
-{
-       return idle_cpu(smp_processor_id()) && this_rq()->nohz_balance_kick;
-}
-
-#else /* CONFIG_NO_HZ */
-
-static inline bool got_nohz_idle_kick(void)
-{
-       return false;
-}
-
-#endif /* CONFIG_NO_HZ */
-
-static u64 sched_avg_period(void)
-{
-       return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
-}
-
-static void sched_avg_update(struct rq *rq)
-{
-       s64 period = sched_avg_period();
-
-       while ((s64)(rq->clock - rq->age_stamp) > period) {
-               /*
-                * Inline assembly required to prevent the compiler
-                * optimising this loop into a divmod call.
-                * See __iter_div_u64_rem() for another example of this.
-                */
-               asm("" : "+rm" (rq->age_stamp));
-               rq->age_stamp += period;
-               rq->rt_avg /= 2;
-       }
-}
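
sched_avg_update() above ages rq->rt_avg geometrically: every elapsed
half-averaging-period (500 ms with the default sysctl_sched_time_avg of one
second) the accumulated RT time is halved.  A standalone sketch of that aging
with made-up timestamps (not kernel code):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t period = 500000000ULL;      /* sched_avg_period(): 0.5s in ns */
	uint64_t age_stamp = 0;
	uint64_t clock = 2000000000ULL;      /* pretend 2s have elapsed        */
	uint64_t rt_avg = 400000000ULL;      /* 400ms of accumulated RT time   */

	while (clock - age_stamp > period) {
		age_stamp += period;
		rt_avg /= 2;                 /* 400ms -> 200 -> 100 -> 50ms    */
	}
	printf("aged rt_avg: %llu ns\n", (unsigned long long)rt_avg);
	return 0;
}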
-
-static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
-{
-       rq->rt_avg += rt_delta;
-       sched_avg_update(rq);
-}
-
-#else /* !CONFIG_SMP */
-static void resched_task(struct task_struct *p)
-{
-       assert_raw_spin_locked(&task_rq(p)->lock);
-       set_tsk_need_resched(p);
-}
-
-static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
-{
-}
-
-static void sched_avg_update(struct rq *rq)
-{
-}
-#endif /* CONFIG_SMP */
-
-#if BITS_PER_LONG == 32
-# define WMULT_CONST   (~0UL)
-#else
-# define WMULT_CONST   (1UL << 32)
-#endif
-
-#define WMULT_SHIFT    32
-
-/*
- * Shift right and round:
- */
-#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
-
-/*
- * delta *= weight / lw
- */
-static unsigned long
-calc_delta_mine(unsigned long delta_exec, unsigned long weight,
-               struct load_weight *lw)
-{
-       u64 tmp;
-
-       /*
-        * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
-        * entities since MIN_SHARES = 2. Treat weight as 1 if less than
-        * 2^SCHED_LOAD_RESOLUTION.
-        */
-       if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
-               tmp = (u64)delta_exec * scale_load_down(weight);
-       else
-               tmp = (u64)delta_exec;
-
-       if (!lw->inv_weight) {
-               unsigned long w = scale_load_down(lw->weight);
-
-               if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
-                       lw->inv_weight = 1;
-               else if (unlikely(!w))
-                       lw->inv_weight = WMULT_CONST;
-               else
-                       lw->inv_weight = WMULT_CONST / w;
-       }
-
-       /*
-        * Check whether we'd overflow the 64-bit multiplication:
-        */
-       if (unlikely(tmp > WMULT_CONST))
-               tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
-                       WMULT_SHIFT/2);
-       else
-               tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
-
-       return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
-}
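
calc_delta_mine() above computes delta_exec * weight / lw->weight without a
64-bit division on the hot path: the divide is replaced by a multiply with the
cached ~2^32/weight inverse followed by the rounding shift SRR().  A
standalone sketch of that arithmetic, with the overflow guard and the
SCHED_LOAD_RESOLUTION scaling left out for brevity (not kernel code):

#include <stdio.h>
#include <stdint.h>

#define WMULT_SHIFT	32
#define SRR(x, y)	(((x) + (1ULL << ((y) - 1))) >> (y))

int main(void)
{
	uint64_t delta_exec = 6000000;          /* a 6ms slice, in ns        */
	uint64_t weight = 1024;                 /* nice-0 entity weight      */
	uint64_t lw_weight = 1024 + 820;        /* total load on the queue   */
	uint64_t inv_weight = 0xffffffffULL / lw_weight;   /* ~2^32 / lw     */

	uint64_t tmp = delta_exec * weight;
	uint64_t scaled = SRR(tmp * inv_weight, WMULT_SHIFT);

	/* Both print ~3331887 ns, the nice-0 entity's fair share of 6ms. */
	printf("multiply+shift: %llu ns, exact divide: %llu ns\n",
	       (unsigned long long)scaled, (unsigned long long)(tmp / lw_weight));
	return 0;
}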
-
-static inline void update_load_add(struct load_weight *lw, unsigned long inc)
-{
-       lw->weight += inc;
-       lw->inv_weight = 0;
-}
-
-static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
-{
-       lw->weight -= dec;
-       lw->inv_weight = 0;
-}
-
-static inline void update_load_set(struct load_weight *lw, unsigned long w)
-{
-       lw->weight = w;
-       lw->inv_weight = 0;
-}
-
-/*
- * To aid in avoiding the subversion of "niceness" due to uneven distribution
- * of tasks with abnormal "nice" values across CPUs the contribution that
- * each task makes to its run queue's load is weighted according to its
- * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
- * scaled version of the new time slice allocation that they receive on time
- * slice expiry etc.
- */
-
-#define WEIGHT_IDLEPRIO                3
-#define WMULT_IDLEPRIO         1431655765
-
-/*
- * Nice levels are multiplicative, with a gentle 10% change for every
- * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
- * nice 1, it will get ~10% less CPU time than another CPU-bound task
- * that remained on nice 0.
- *
- * The "10% effect" is relative and cumulative: from _any_ nice level,
- * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
- * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
- * If a task goes up by ~10% and another task goes down by ~10% then
- * the relative distance between them is ~25%.)
- */
-static const int prio_to_weight[40] = {
- /* -20 */     88761,     71755,     56483,     46273,     36291,
- /* -15 */     29154,     23254,     18705,     14949,     11916,
- /* -10 */      9548,      7620,      6100,      4904,      3906,
- /*  -5 */      3121,      2501,      1991,      1586,      1277,
- /*   0 */      1024,       820,       655,       526,       423,
- /*   5 */       335,       272,       215,       172,       137,
- /*  10 */       110,        87,        70,        56,        45,
- /*  15 */        36,        29,        23,        18,        15,
-};
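
A quick check of the ~10% rule against the table above: the nice 0 and nice 1
weights are 1024 and 820, and 1024/820 ~= 1.25.  Two CPU-bound tasks sharing
one CPU at nice 0 and nice 1 therefore get 1024/1844 ~= 55.5% and
820/1844 ~= 44.5% of it, so the nicer task loses roughly ten percentage points
and the ratio between the two shares stays at the ~25% relative distance
described in the comment.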
-
-/*
- * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
- *
- * In cases where the weight does not change often, we can use the
- * precalculated inverse to speed up arithmetics by turning divisions
- * into multiplications:
- */
-static const u32 prio_to_wmult[40] = {
- /* -20 */     48388,     59856,     76040,     92818,    118348,
- /* -15 */    147320,    184698,    229616,    287308,    360437,
- /* -10 */    449829,    563644,    704093,    875809,   1099582,
- /*  -5 */   1376151,   1717300,   2157191,   2708050,   3363326,
- /*   0 */   4194304,   5237765,   6557202,   8165337,  10153587,
- /*   5 */  12820798,  15790321,  19976592,  24970740,  31350126,
- /*  10 */  39045157,  49367440,  61356676,  76695844,  95443717,
- /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
-};
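
For example, the nice 0 entry is 2^32/1024 = 4294967296/1024 = 4194304,
matching prio_to_weight[20] = 1024 above; with it, delta/1024 can be computed
as (delta * 4194304) >> 32, which is exactly the multiply-and-shift form that
calc_delta_mine() uses.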
-
-/* Time spent by the tasks of the cpu accounting group executing in ... */
-enum cpuacct_stat_index {
-       CPUACCT_STAT_USER,      /* ... user mode */
-       CPUACCT_STAT_SYSTEM,    /* ... kernel mode */
-
-       CPUACCT_STAT_NSTATS,
-};
-
-#ifdef CONFIG_CGROUP_CPUACCT
-static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
-static void cpuacct_update_stats(struct task_struct *tsk,
-               enum cpuacct_stat_index idx, cputime_t val);
-#else
-static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
-static inline void cpuacct_update_stats(struct task_struct *tsk,
-               enum cpuacct_stat_index idx, cputime_t val) {}
-#endif
-
-static inline void inc_cpu_load(struct rq *rq, unsigned long load)
-{
-       update_load_add(&rq->load, load);
-}
-
-static inline void dec_cpu_load(struct rq *rq, unsigned long load)
-{
-       update_load_sub(&rq->load, load);
-}
-
-#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
-                       (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
-typedef int (*tg_visitor)(struct task_group *, void *);
-
-/*
- * Iterate task_group tree rooted at *from, calling @down when first entering a
- * node and @up when leaving it for the final time.
- *
- * Caller must hold rcu_lock or sufficient equivalent.
- */
-static int walk_tg_tree_from(struct task_group *from,
-                            tg_visitor down, tg_visitor up, void *data)
-{
-       struct task_group *parent, *child;
-       int ret;
-
-       parent = from;
-
-down:
-       ret = (*down)(parent, data);
-       if (ret)
-               goto out;
-       list_for_each_entry_rcu(child, &parent->children, siblings) {
-               parent = child;
-               goto down;
-
-up:
-               continue;
-       }
-       ret = (*up)(parent, data);
-       if (ret || parent == from)
-               goto out;
-
-       child = parent;
-       parent = parent->parent;
-       if (parent)
-               goto up;
-out:
-       return ret;
-}
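
The goto-based loop above is an iterative rendering of a depth-first walk:
@down is called on first entry into a node and @up once all of that node's
children have been visited, and any non-zero return aborts the walk.  A
recursive sketch of the same traversal over a simplified tree type (the node
layout is a stand-in, not the kernel's task_group/list_head representation):

struct node {
	struct node **children;              /* array of child pointers */
	int nchildren;
};

typedef int (*visitor)(struct node *, void *);

static int walk_tree_from(struct node *from, visitor down, visitor up,
			  void *data)
{
	int i, ret;

	ret = down(from, data);              /* first entry into the node */
	if (ret)
		return ret;

	for (i = 0; i < from->nchildren; i++) {
		ret = walk_tree_from(from->children[i], down, up, data);
		if (ret)
			return ret;
	}

	return up(from, data);               /* leaving the node for good */
}

Flattening this recursion into the goto form keeps the walk's stack usage
constant regardless of tree depth.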
-
-/*
- * Iterate the full tree, calling @down when first entering a node and @up when
- * leaving it for the final time.
- *
- * Caller must hold rcu_lock or sufficient equivalent.
- */
-
-static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
-{
-       return walk_tg_tree_from(&root_task_group, down, up, data);
-}
-
-static int tg_nop(struct task_group *tg, void *data)
-{
-       return 0;
-}
-#endif
-
-#ifdef CONFIG_SMP
-/* Used instead of source_load when we know the type == 0 */
-static unsigned long weighted_cpuload(const int cpu)
-{
-       return cpu_rq(cpu)->load.weight;
-}
-
-/*
- * Return a low guess at the load of a migration-source cpu weighted
- * according to the scheduling class and "nice" value.
- *
- * We want to under-estimate the load of migration sources, to
- * balance conservatively.
- */
-static unsigned long source_load(int cpu, int type)
-{
-       struct rq *rq = cpu_rq(cpu);
-       unsigned long total = weighted_cpuload(cpu);
-
-       if (type == 0 || !sched_feat(LB_BIAS))
-               return total;
-
-       return min(rq->cpu_load[type-1], total);
-}
-
-/*
- * Return a high guess at the load of a migration-target cpu weighted
- * according to the scheduling class and "nice" value.
- */
-static unsigned long target_load(int cpu, int type)
-{
-       struct rq *rq = cpu_rq(cpu);
-       unsigned long total = weighted_cpuload(cpu);
-
-       if (type == 0 || !sched_feat(LB_BIAS))
-               return total;
-
-       return max(rq->cpu_load[type-1], total);
-}
-
-static unsigned long power_of(int cpu)
-{
-       return cpu_rq(cpu)->cpu_power;
-}
-
-static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
-
-static unsigned long cpu_avg_load_per_task(int cpu)
-{
-       struct rq *rq = cpu_rq(cpu);
-       unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
-
-       if (nr_running)
-               return rq->load.weight / nr_running;
-
-       return 0;
-}
-
-#ifdef CONFIG_PREEMPT
-
-static void double_rq_lock(struct rq *rq1, struct rq *rq2);
-
-/*
- * fair double_lock_balance: Safely acquires both rq->locks in a fair
- * way at the expense of forcing extra atomic operations in all
- * invocations.  This assures that the double_lock is acquired using the
- * same underlying policy as the spinlock_t on this architecture, which
- * reduces latency compared to the unfair variant below.  However, it
- * also adds more overhead and therefore may reduce throughput.
- */
-static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
-       __releases(this_rq->lock)
-       __acquires(busiest->lock)
-       __acquires(this_rq->lock)
-{
-       raw_spin_unlock(&this_rq->lock);
-       double_rq_lock(this_rq, busiest);
-
-       return 1;
-}
-
-#else
-/*
- * Unfair double_lock_balance: Optimizes throughput at the expense of
- * latency by eliminating extra atomic operations when the locks are
- * already in proper order on entry.  This favors lower cpu-ids and will
- * grant the double lock to lower cpus over higher ids under contention,
- * regardless of entry order into the function.
- */
-static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
-       __releases(this_rq->lock)
-       __acquires(busiest->lock)
-       __acquires(this_rq->lock)
-{
-       int ret = 0;
-
-       if (unlikely(!raw_spin_trylock(&busiest->lock))) {
-               if (busiest < this_rq) {
-                       raw_spin_unlock(&this_rq->lock);
-                       raw_spin_lock(&busiest->lock);
-                       raw_spin_lock_nested(&this_rq->lock,
-                                             SINGLE_DEPTH_NESTING);
-                       ret = 1;
-               } else
-                       raw_spin_lock_nested(&busiest->lock,
-                                             SINGLE_DEPTH_NESTING);
-       }
-       return ret;
-}
-
-#endif /* CONFIG_PREEMPT */
-
-/*
- * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
- */
-static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
-{
-       if (unlikely(!irqs_disabled())) {
-               /* printk() doesn't work well under rq->lock */
-               raw_spin_unlock(&this_rq->lock);
-               BUG_ON(1);
-       }
-
-       return _double_lock_balance(this_rq, busiest);
-}
-
-static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
-       __releases(busiest->lock)
-{
-       raw_spin_unlock(&busiest->lock);
-       lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
-}
-
-/*
- * double_rq_lock - safely lock two runqueues
- *
- * Note this does not disable interrupts like task_rq_lock,
- * you need to do so manually before calling.
- */
-static void double_rq_lock(struct rq *rq1, struct rq *rq2)
-       __acquires(rq1->lock)
-       __acquires(rq2->lock)
-{
-       BUG_ON(!irqs_disabled());
-       if (rq1 == rq2) {
-               raw_spin_lock(&rq1->lock);
-               __acquire(rq2->lock);   /* Fake it out ;) */
-       } else {
-               if (rq1 < rq2) {
-                       raw_spin_lock(&rq1->lock);
-                       raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
-               } else {
-                       raw_spin_lock(&rq2->lock);
-                       raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
-               }
-       }
-}
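
double_rq_lock() above avoids an AB/BA deadlock by always taking the
lower-addressed runqueue lock first, regardless of argument order.  The same
discipline sketched with plain pthread mutexes (lock_pair()/unlock_pair() are
illustrative helpers, not kernel code):

#include <pthread.h>

static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if (a == b) {
		pthread_mutex_lock(a);       /* same queue: one lock only  */
		return;
	}
	if (a < b) {                         /* order by address, as above */
		pthread_mutex_lock(a);
		pthread_mutex_lock(b);
	} else {
		pthread_mutex_lock(b);
		pthread_mutex_lock(a);
	}
}

static void unlock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	pthread_mutex_unlock(a);
	if (a != b)
		pthread_mutex_unlock(b);
}

Two threads locking the same pair concurrently therefore always contend on the
same first lock instead of each holding one half of the pair.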
-
-/*
- * double_rq_unlock - safely unlock two runqueues
- *
- * Note this does not restore interrupts like task_rq_unlock,
- * you need to do so manually after calling.
- */
-static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
-       __releases(rq1->lock)
-       __releases(rq2->lock)
-{
-       raw_spin_unlock(&rq1->lock);
-       if (rq1 != rq2)
-               raw_spin_unlock(&rq2->lock);
-       else
-               __release(rq2->lock);
-}
-
-#else /* CONFIG_SMP */
-
-/*
- * double_rq_lock - safely lock two runqueues
- *
- * Note this does not disable interrupts like task_rq_lock,
- * you need to do so manually before calling.
- */
-static void double_rq_lock(struct rq *rq1, struct rq *rq2)
-       __acquires(rq1->lock)
-       __acquires(rq2->lock)
-{
-       BUG_ON(!irqs_disabled());
-       BUG_ON(rq1 != rq2);
-       raw_spin_lock(&rq1->lock);
-       __acquire(rq2->lock);   /* Fake it out ;) */
-}
-
-/*
- * double_rq_unlock - safely unlock two runqueues
- *
- * Note this does not restore interrupts like task_rq_unlock,
- * you need to do so manually after calling.
- */
-static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
-       __releases(rq1->lock)
-       __releases(rq2->lock)
-{
-       BUG_ON(rq1 != rq2);
-       raw_spin_unlock(&rq1->lock);
-       __release(rq2->lock);
-}
-
-#endif
-
-static void calc_load_account_idle(struct rq *this_rq);
-static void update_sysctl(void);
-static int get_update_sysctl_factor(void);
-static void update_cpu_load(struct rq *this_rq);
-
-static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
-{
-       set_task_rq(p, cpu);
-#ifdef CONFIG_SMP
-       /*
-        * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
-        * successfully executed on another CPU. We must ensure that updates of
-        * per-task data have been completed by this moment.
-        */
-       smp_wmb();
-       task_thread_info(p)->cpu = cpu;
-#endif
-}
-
-static const struct sched_class rt_sched_class;
-
-#define sched_class_highest (&stop_sched_class)
-#define for_each_class(class) \
-   for (class = sched_class_highest; class; class = class->next)
-
-#include "sched_stats.h"
-
-static void inc_nr_running(struct rq *rq)
-{
-       rq->nr_running++;
-}
-
-static void dec_nr_running(struct rq *rq)
-{
-       rq->nr_running--;
-}
-
-static void set_load_weight(struct task_struct *p)
-{
-       int prio = p->static_prio - MAX_RT_PRIO;
-       struct load_weight *load = &p->se.load;
-
-       /*
-        * SCHED_IDLE tasks get minimal weight:
-        */
-       if (p->policy == SCHED_IDLE) {
-               load->weight = scale_load(WEIGHT_IDLEPRIO);
-               load->inv_weight = WMULT_IDLEPRIO;
-               return;
-       }
-
-       load->weight = scale_load(prio_to_weight[prio]);
-       load->inv_weight = prio_to_wmult[prio];
-}
-
-static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
-{
-       update_rq_clock(rq);
-       sched_info_queued(p);
-       p->sched_class->enqueue_task(rq, p, flags);
-}
-
-static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
-{
-       update_rq_clock(rq);
-       sched_info_dequeued(p);
-       p->sched_class->dequeue_task(rq, p, flags);
-}
-
-/*
- * activate_task - move a task to the runqueue.
- */
-static void activate_task(struct rq *rq, struct task_struct *p, int flags)
-{
-       if (task_contributes_to_load(p))
-               rq->nr_uninterruptible--;
-
-       enqueue_task(rq, p, flags);
-}
-
-/*
- * deactivate_task - remove a task from the runqueue.
- */
-static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
-{
-       if (task_contributes_to_load(p))
-               rq->nr_uninterruptible++;
-
-       dequeue_task(rq, p, flags);
-}
-
-#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-
-/*
- * There are no locks covering percpu hardirq/softirq time.
- * They are only modified in account_system_vtime, on corresponding CPU
- * with interrupts disabled. So, writes are safe.
- * They are read and saved off onto struct rq in update_rq_clock().
- * This may result in other CPU reading this CPU's irq time and can
- * race with irq/account_system_vtime on this CPU. We would either get old
- * or new value with a side effect of accounting a slice of irq time to wrong
- * task when irq is in progress while we read rq->clock. That is a worthy
- * compromise in place of having locks on each irq in account_system_time.
- */
-static DEFINE_PER_CPU(u64, cpu_hardirq_time);
-static DEFINE_PER_CPU(u64, cpu_softirq_time);
-
-static DEFINE_PER_CPU(u64, irq_start_time);
-static int sched_clock_irqtime;
-
-void enable_sched_clock_irqtime(void)
-{
-       sched_clock_irqtime = 1;
-}
-
-void disable_sched_clock_irqtime(void)
-{
-       sched_clock_irqtime = 0;
-}
-
-#ifndef CONFIG_64BIT
-static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
-
-static inline void irq_time_write_begin(void)
-{
-       __this_cpu_inc(irq_time_seq.sequence);
-       smp_wmb();
-}
-
-static inline void irq_time_write_end(void)
-{
-       smp_wmb();
-       __this_cpu_inc(irq_time_seq.sequence);
-}
-
-static inline u64 irq_time_read(int cpu)
-{
-       u64 irq_time;
-       unsigned seq;
-
-       do {
-               seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
-               irq_time = per_cpu(cpu_softirq_time, cpu) +
-                          per_cpu(cpu_hardirq_time, cpu);
-       } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
-
-       return irq_time;
-}
-#else /* CONFIG_64BIT */
-static inline void irq_time_write_begin(void)
-{
-}
-
-static inline void irq_time_write_end(void)
-{
-}
-
-static inline u64 irq_time_read(int cpu)
-{
-       return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
-}
-#endif /* CONFIG_64BIT */
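
On 32-bit the two 64-bit per-cpu counters cannot be read atomically, so the
reader above retries whenever the sequence count was odd (an update was in
flight) or changed between its two reads.  A standalone C11 sketch of that
retry protocol; it uses sequentially consistent atomics throughout for
simplicity, where the kernel's seqcount_t gets away with lighter barriers, and
the variable names are illustrative (not kernel code):

#include <stdatomic.h>
#include <stdint.h>

static atomic_uint seq;                       /* even = stable snapshot */
static _Atomic uint64_t soft_ns, hard_ns;     /* updated by one writer  */

static void write_times(uint64_t soft, uint64_t hard)
{
	atomic_fetch_add(&seq, 1);            /* odd: update in progress  */
	atomic_store(&soft_ns, soft);
	atomic_store(&hard_ns, hard);
	atomic_fetch_add(&seq, 1);            /* even again: stable       */
}

static uint64_t read_times(void)
{
	unsigned int s;
	uint64_t sum;

	do {
		s = atomic_load(&seq);
		sum = atomic_load(&soft_ns) + atomic_load(&hard_ns);
	} while ((s & 1) || s != atomic_load(&seq));

	return sum;
}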
-
-/*
- * Called before incrementing preempt_count on {soft,}irq_enter
- * and before decrementing preempt_count on {soft,}irq_exit.
- */
-void account_system_vtime(struct task_struct *curr)
-{
-       unsigned long flags;
-       s64 delta;
-       int cpu;
-
-       if (!sched_clock_irqtime)
-               return;
-
-       local_irq_save(flags);
-
-       cpu = smp_processor_id();
-       delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
-       __this_cpu_add(irq_start_time, delta);
-
-       irq_time_write_begin();
-       /*
-        * We do not account for softirq time from ksoftirqd here.
-        * We want to continue accounting softirq time to ksoftirqd thread
-        * in that case, so as not to confuse the scheduler with a special task
-        * that does not consume any time, but still wants to run.
-        */
-       if (hardirq_count())
-               __this_cpu_add(cpu_hardirq_time, delta);
-       else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
-               __this_cpu_add(cpu_softirq_time, delta);
-
-       irq_time_write_end();
-       local_irq_restore(flags);
-}
-EXPORT_SYMBOL_GPL(account_system_vtime);
-
-#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
-
-#ifdef CONFIG_PARAVIRT
-static inline u64 steal_ticks(u64 steal)
-{
-       if (unlikely(steal > NSEC_PER_SEC))
-               return div_u64(steal, TICK_NSEC);
-
-       return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
-}
-#endif
-
-static void update_rq_clock_task(struct rq *rq, s64 delta)
-{
-/*
- * In theory, the compiler should just see 0 here, and optimize out the call
- * to sched_rt_avg_update. But I don't trust it...
- */
-#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
-       s64 steal = 0, irq_delta = 0;
-#endif
-#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-       irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
-
-       /*
-        * Since irq_time is only updated on {soft,}irq_exit, we might run into
-        * this case when a previous update_rq_clock() happened inside a
-        * {soft,}irq region.
-        *
-        * When this happens, we stop ->clock_task and only update the
-        * prev_irq_time stamp to account for the part that fit, so that a next
-        * update will consume the rest. This ensures ->clock_task is
-        * monotonic.
-        *
-        * It does however cause some slight misattribution of {soft,}irq
-        * time; a more accurate solution would be to update the irq_time using
-        * the current rq->clock timestamp, except that would require using
-        * atomic ops.
-        */
-       if (irq_delta > delta)
-               irq_delta = delta;
-
-       rq->prev_irq_time += irq_delta;
-       delta -= irq_delta;
-#endif
-#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
-       if (static_branch((&paravirt_steal_rq_enabled))) {
-               u64 st;
-
-               steal = paravirt_steal_clock(cpu_of(rq));
-               steal -= rq->prev_steal_time_rq;
-
-               if (unlikely(steal > delta))
-                       steal = delta;
-
-               st = steal_ticks(steal);
-               steal = st * TICK_NSEC;
-
-               rq->prev_steal_time_rq += steal;
-
-               delta -= steal;
-       }
-#endif
-
-       rq->clock_task += delta;
-
-#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
-       if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
-               sched_rt_avg_update(rq, irq_delta + steal);
-#endif
-}
-
-#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-static int irqtime_account_hi_update(void)
-{
-       struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
-       unsigned long flags;
-       u64 latest_ns;
-       int ret = 0;
-
-       local_irq_save(flags);
-       latest_ns = this_cpu_read(cpu_hardirq_time);
-       if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq))
-               ret = 1;
-       local_irq_restore(flags);
-       return ret;
-}
-
-static int irqtime_account_si_update(void)
-{
-       struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
-       unsigned long flags;
-       u64 latest_ns;
-       int ret = 0;
-
-       local_irq_save(flags);
-       latest_ns = this_cpu_read(cpu_softirq_time);
-       if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq))
-               ret = 1;
-       local_irq_restore(flags);
-       return ret;
-}
-
-#else /* CONFIG_IRQ_TIME_ACCOUNTING */
-
-#define sched_clock_irqtime    (0)
-
-#endif
-
-#include "sched_idletask.c"
-#include "sched_fair.c"
-#include "sched_rt.c"
-#include "sched_autogroup.c"
-#include "sched_stoptask.c"
-#ifdef CONFIG_SCHED_DEBUG
-# include "sched_debug.c"
-#endif
-
-void sched_set_stop_task(int cpu, struct task_struct *stop)
-{
-       struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
-       struct task_struct *old_stop = cpu_rq(cpu)->stop;
-
-       if (stop) {
-               /*
-                * Make it appear like a SCHED_FIFO task, it's something
-                * userspace knows about and won't get confused about.
-                *
-                * Also, it will make PI more or less work without too
-                * much confusion -- but then, stop work should not
-                * rely on PI working anyway.
-                */
-               sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
-
-               stop->sched_class = &stop_sched_class;
-       }
-
-       cpu_rq(cpu)->stop = stop;
-
-       if (old_stop) {
-               /*
-                * Reset it back to a normal scheduling class so that
-                * it can die in pieces.
-                */
-               old_stop->sched_class = &rt_sched_class;
-       }
-}
-
-/*
- * __normal_prio - return the priority that is based on the static prio
- */
-static inline int __normal_prio(struct task_struct *p)
-{
-       return p->static_prio;
-}
-
-/*
- * Calculate the expected normal priority: i.e. priority
- * without taking RT-inheritance into account. Might be
- * boosted by interactivity modifiers. Changes upon fork,
- * setprio syscalls, and whenever the interactivity
- * estimator recalculates.
- */
-static inline int normal_prio(struct task_struct *p)
-{
-       int prio;
-
-       if (task_has_rt_policy(p))
-               prio = MAX_RT_PRIO-1 - p->rt_priority;
-       else
-               prio = __normal_prio(p);
-       return prio;
-}
-
-/*
- * Calculate the current priority, i.e. the priority
- * taken into account by the scheduler. This value might
- * be boosted by RT tasks, or might be boosted by
- * interactivity modifiers. Will be RT if the task got
- * RT-boosted. If not then it returns p->normal_prio.
- */
-static int effective_prio(struct task_struct *p)
-{
-       p->normal_prio = normal_prio(p);
-       /*
-        * If we are RT tasks or we were boosted to RT priority,
-        * keep the priority unchanged. Otherwise, update priority
-        * to the normal priority:
-        */
-       if (!rt_prio(p->prio))
-               return p->normal_prio;
-       return p->prio;
-}
-
-/**
- * task_curr - is this task currently executing on a CPU?
- * @p: the task in question.
- */
-inline int task_curr(const struct task_struct *p)
-{
-       return cpu_curr(task_cpu(p)) == p;
-}
-
-static inline void check_class_changed(struct rq *rq, struct task_struct *p,
-                                      const struct sched_class *prev_class,
-                                      int oldprio)
-{
-       if (prev_class != p->sched_class) {
-               if (prev_class->switched_from)
-                       prev_class->switched_from(rq, p);
-               p->sched_class->switched_to(rq, p);
-       } else if (oldprio != p->prio)
-               p->sched_class->prio_changed(rq, p, oldprio);
-}
-
-static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
-{
-       const struct sched_class *class;
-
-       if (p->sched_class == rq->curr->sched_class) {
-               rq->curr->sched_class->check_preempt_curr(rq, p, flags);
-       } else {
-               for_each_class(class) {
-                       if (class == rq->curr->sched_class)
-                               break;
-                       if (class == p->sched_class) {
-                               resched_task(rq->curr);
-                               break;
-                       }
-               }
-       }
-
-       /*
-        * A queue event has occurred, and we're going to schedule.  In
-        * this case, we can save a useless back to back clock update.
-        */
-       if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
-               rq->skip_clock_update = 1;
-}
-
-#ifdef CONFIG_SMP
-/*
- * Is this task likely cache-hot:
- */
-static int
-task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
-{
-       s64 delta;
-
-       if (p->sched_class != &fair_sched_class)
-               return 0;
-
-       if (unlikely(p->policy == SCHED_IDLE))
-               return 0;
-
-       /*
-        * Buddy candidates are cache hot:
-        */
-       if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
-                       (&p->se == cfs_rq_of(&p->se)->next ||
-                        &p->se == cfs_rq_of(&p->se)->last))
-               return 1;
-
-       if (sysctl_sched_migration_cost == -1)
-               return 1;
-       if (sysctl_sched_migration_cost == 0)
-               return 0;
-
-       delta = now - p->se.exec_start;
-
-       return delta < (s64)sysctl_sched_migration_cost;
-}
-
-void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
-{
-#ifdef CONFIG_SCHED_DEBUG
-       /*
-        * We should never call set_task_cpu() on a blocked task,
-        * ttwu() will sort out the placement.
-        */
-       WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
-                       !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
-
-#ifdef CONFIG_LOCKDEP
-       /*
-        * The caller should hold either p->pi_lock or rq->lock, when changing
-        * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
-        *
-        * sched_move_task() holds both and thus holding either pins the cgroup,
-        * see set_task_rq().
-        *
-        * Furthermore, all task_rq users should acquire both locks, see
-        * task_rq_lock().
-        */
-       WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
-                                     lockdep_is_held(&task_rq(p)->lock)));
-#endif
-#endif
-
-       trace_sched_migrate_task(p, new_cpu);
-
-       if (task_cpu(p) != new_cpu) {
-               p->se.nr_migrations++;
-               perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
-       }
-
-       __set_task_cpu(p, new_cpu);
-}
-
-struct migration_arg {
-       struct task_struct *task;
-       int dest_cpu;
-};
-
-static int migration_cpu_stop(void *data);
-
-/*
- * wait_task_inactive - wait for a thread to unschedule.
- *
- * If @match_state is nonzero, it's the @p->state value just checked and
- * not expected to change.  If it changes, i.e. @p might have woken up,
- * then return zero.  When we succeed in waiting for @p to be off its CPU,
- * we return a positive number (its total switch count).  If a second call
- * a short while later returns the same number, the caller can be sure that
- * @p has remained unscheduled the whole time.
- *
- * The caller must ensure that the task *will* unschedule sometime soon,
- * else this function might spin for a *long* time. This function can't
- * be called with interrupts off, or it may introduce deadlock with
- * smp_call_function() if an IPI is sent by the same process we are
- * waiting to become inactive.
- */
-unsigned long wait_task_inactive(struct task_struct *p, long match_state)
-{
-       unsigned long flags;
-       int running, on_rq;
-       unsigned long ncsw;
-       struct rq *rq;
-
-       for (;;) {
-               /*
-                * We do the initial early heuristics without holding
-                * any task-queue locks at all. We'll only try to get
-                * the runqueue lock when things look like they will
-                * work out!
-                */
-               rq = task_rq(p);
-
-               /*
-                * If the task is actively running on another CPU
-                * still, just relax and busy-wait without holding
-                * any locks.
-                *
-                * NOTE! Since we don't hold any locks, it's not
-                * even sure that "rq" stays as the right runqueue!
-                * But we don't care, since "task_running()" will
-                * return false if the runqueue has changed and p
-                * is actually now running somewhere else!
-                */
-               while (task_running(rq, p)) {
-                       if (match_state && unlikely(p->state != match_state))
-                               return 0;
-                       cpu_relax();
-               }
-
-               /*
-                * Ok, time to look more closely! We need the rq
-                * lock now, to be *sure*. If we're wrong, we'll
-                * just go back and repeat.
-                */
-               rq = task_rq_lock(p, &flags);
-               trace_sched_wait_task(p);
-               running = task_running(rq, p);
-               on_rq = p->on_rq;
-               ncsw = 0;
-               if (!match_state || p->state == match_state)
-                       ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
-               task_rq_unlock(rq, p, &flags);
-
-               /*
-                * If it changed from the expected state, bail out now.
-                */
-               if (unlikely(!ncsw))
-                       break;
-
-               /*
-                * Was it really running after all now that we
-                * checked with the proper locks actually held?
-                *
-                * Oops. Go back and try again..
-                */
-               if (unlikely(running)) {
-                       cpu_relax();
-                       continue;
-               }
-
-               /*
-                * It's not enough that it's not actively running,
-                * it must be off the runqueue _entirely_, and not
-                * preempted!
-                *
-                * So if it was still runnable (but just not actively
-                * running right now), it's preempted, and we should
-                * yield - it could be a while.
-                */
-               if (unlikely(on_rq)) {
-                       ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
-
-                       set_current_state(TASK_UNINTERRUPTIBLE);
-                       schedule_hrtimeout(&to, HRTIMER_MODE_REL);
-                       continue;
-               }
-
-               /*
-                * Ahh, all good. It wasn't running, and it wasn't
-                * runnable, which means that it will never become
-                * running in the future either. We're all done!
-                */
-               break;
-       }
-
-       return ncsw;
-}
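
The return convention documented above relies on ncsw never being zero on
success: OR-ing in LONG_MIN sets the sign bit, so even a task whose voluntary
switch count is still 0 produces a non-zero value, leaving zero unambiguous
for "the state changed".  A tiny standalone illustration (not kernel code):

#include <stdio.h>
#include <limits.h>

int main(void)
{
	unsigned long nvcsw = 0;                /* fresh task, no switches yet */
	unsigned long ncsw = nvcsw | LONG_MIN;  /* MSB set, so still non-zero  */

	printf("ncsw = %#lx (non-zero: %d)\n", ncsw, ncsw != 0);
	return 0;
}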
-
-/***
- * kick_process - kick a running thread to enter/exit the kernel
- * @p: the to-be-kicked thread
- *
- * Cause a process which is running on another CPU to enter
- * kernel-mode, without any delay. (to get signals handled.)
- *
- * NOTE: this function doesn't have to take the runqueue lock,
- * because all it wants to ensure is that the remote task enters
- * the kernel. If the IPI races and the task has been migrated
- * to another CPU then no harm is done and the purpose has been
- * achieved as well.
- */
-void kick_process(struct task_struct *p)
-{
-       int cpu;
-
-       preempt_disable();
-       cpu = task_cpu(p);
-       if ((cpu != smp_processor_id()) && task_curr(p))
-               smp_send_reschedule(cpu);
-       preempt_enable();
-}
-EXPORT_SYMBOL_GPL(kick_process);
-#endif /* CONFIG_SMP */
-
-#ifdef CONFIG_SMP
-/*
- * ->cpus_allowed is protected by both rq->lock and p->pi_lock
- */
-static int select_fallback_rq(int cpu, struct task_struct *p)
-{
-       int dest_cpu;
-       const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
-
-       /* Look for allowed, online CPU in same node. */
-       for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
-               if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
-                       return dest_cpu;
-
-       /* Any allowed, online CPU? */
-       dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask);
-       if (dest_cpu < nr_cpu_ids)
-               return dest_cpu;
-
-       /* No more Mr. Nice Guy. */
-       dest_cpu = cpuset_cpus_allowed_fallback(p);
-       /*
-        * Don't tell them about moving exiting tasks or
-        * kernel threads (both mm NULL), since they never
-        * leave the kernel.
-        */
-       if (p->mm && printk_ratelimit()) {
-               printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
-                               task_pid_nr(p), p->comm, cpu);
-       }
-
-       return dest_cpu;
-}
-
-/*
- * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
- */
-static inline
-int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
-{
-       int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
-
-       /*
-        * In order not to call set_task_cpu() on a blocking task we need
-        * to rely on ttwu() to place the task on a valid ->cpus_allowed
-        * cpu.
-        *
-        * Since this is common to all placement strategies, this lives here.
-        *
-        * [ this allows ->select_task() to simply return task_cpu(p) and
-        *   not worry about this generic constraint ]
-        */
-       if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
-                    !cpu_online(cpu)))
-               cpu = select_fallback_rq(task_cpu(p), p);
-
-       return cpu;
-}
-
-static void update_avg(u64 *avg, u64 sample)
-{
-       s64 diff = sample - *avg;
-       *avg += diff >> 3;
-}
-#endif
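
update_avg() above is an exponentially weighted moving average with weight
1/8: the stored average moves an eighth of the way toward each new sample.
For example, with *avg = 1000000 ns and a sample of 2600000 ns, diff is
1600000, diff >> 3 is 200000, and the new average becomes 1200000 ns; repeated
samples at the new level close the remaining gap geometrically.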
-
-static void
-ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
-{
-#ifdef CONFIG_SCHEDSTATS
-       struct rq *rq = this_rq();
-
-#ifdef CONFIG_SMP
-       int this_cpu = smp_processor_id();
-
-       if (cpu == this_cpu) {
-               schedstat_inc(rq, ttwu_local);
-               schedstat_inc(p, se.statistics.nr_wakeups_local);
-       } else {
-               struct sched_domain *sd;
-
-               schedstat_inc(p, se.statistics.nr_wakeups_remote);
-               rcu_read_lock();
-               for_each_domain(this_cpu, sd) {
-                       if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
-                               schedstat_inc(sd, ttwu_wake_remote);
-                               break;
-                       }
-               }
-               rcu_read_unlock();
-       }
-
-       if (wake_flags & WF_MIGRATED)
-               schedstat_inc(p, se.statistics.nr_wakeups_migrate);
-
-#endif /* CONFIG_SMP */
-
-       schedstat_inc(rq, ttwu_count);
-       schedstat_inc(p, se.statistics.nr_wakeups);
-
-       if (wake_flags & WF_SYNC)
-               schedstat_inc(p, se.statistics.nr_wakeups_sync);
-
-#endif /* CONFIG_SCHEDSTATS */
-}
-
-static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
-{
-       activate_task(rq, p, en_flags);
-       p->on_rq = 1;
-
-       /* if a worker is waking up, notify workqueue */
-       if (p->flags & PF_WQ_WORKER)
-               wq_worker_waking_up(p, cpu_of(rq));
-}
-
-/*
- * Mark the task runnable and perform wakeup-preemption.
- */
-static void
-ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
-{
-       trace_sched_wakeup(p, true);
-       check_preempt_curr(rq, p, wake_flags);
-
-       p->state = TASK_RUNNING;
-#ifdef CONFIG_SMP
-       if (p->sched_class->task_woken)
-               p->sched_class->task_woken(rq, p);
-
-       if (rq->idle_stamp) {
-               u64 delta = rq->clock - rq->idle_stamp;
-               u64 max = 2*sysctl_sched_migration_cost;
-
-               if (delta > max)
-                       rq->avg_idle = max;
-               else
-                       update_avg(&rq->avg_idle, delta);
-               rq->idle_stamp = 0;
-       }
-#endif
-}
-
-static void
-ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
-{
-#ifdef CONFIG_SMP
-       if (p->sched_contributes_to_load)
-               rq->nr_uninterruptible--;
-#endif
-
-       ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
-       ttwu_do_wakeup(rq, p, wake_flags);
-}
-
-/*
- * Called in case the task @p isn't fully descheduled from its runqueue,
- * in this case we must do a remote wakeup. It's a 'light' wakeup though,
- * since all we need to do is flip p->state to TASK_RUNNING, since
- * the task is still ->on_rq.
- */
-static int ttwu_remote(struct task_struct *p, int wake_flags)
-{
-       struct rq *rq;
-       int ret = 0;
-
-       rq = __task_rq_lock(p);
-       if (p->on_rq) {
-               ttwu_do_wakeup(rq, p, wake_flags);
-               ret = 1;
-       }
-       __task_rq_unlock(rq);
-
-       return ret;
-}
-
-#ifdef CONFIG_SMP
-static void sched_ttwu_pending(void)
-{
-       struct rq *rq = this_rq();
-       struct llist_node *llist = llist_del_all(&rq->wake_list);
-       struct task_struct *p;
-
-       raw_spin_lock(&rq->lock);
-
-       while (llist) {
-               p = llist_entry(llist, struct task_struct, wake_entry);
-               llist = llist_next(llist);
-               ttwu_do_activate(rq, p, 0);
-       }
-
-       raw_spin_unlock(&rq->lock);
-}
-
-void scheduler_ipi(void)
-{
-       if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
-               return;
-
-       /*
-        * Not all reschedule IPI handlers call irq_enter/irq_exit, since
-        * traditionally all their work was done from the interrupt return
-        * path. Now that we actually do some work, we need to make sure
-        * we do call them.
-        *
-        * Some archs already do call them, luckily irq_enter/exit nest
-        * properly.
-        *
-        * Arguably we should visit all archs and update all handlers,
-        * however a fair share of IPIs are still resched only so this would
-        * somewhat pessimize the simple resched case.
-        */
-       irq_enter();
-       sched_ttwu_pending();
-
-       /*
-        * Check if someone kicked us for doing the nohz idle load balance.
-        */
-       if (unlikely(got_nohz_idle_kick() && !need_resched())) {
-               this_rq()->idle_balance = 1;
-               raise_softirq_irqoff(SCHED_SOFTIRQ);
-       }
-       irq_exit();
-}
-
-static void ttwu_queue_remote(struct task_struct *p, int cpu)
-{
-       if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
-               smp_send_reschedule(cpu);
-}
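
ttwu_queue_remote() above sends the reschedule IPI only when llist_add()
reports that the wake list was empty, so when several wakers race, all of them
queue their task but exactly one kicks the remote CPU, which then splices the
whole list off in sched_ttwu_pending().  A standalone C11 sketch of a
lock-free push that reports whether it installed the first node (llist_push()
and wake_list are stand-ins, not the kernel's llist implementation; no ABA
issue arises because consumers only ever take the entire list):

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct lnode {
	struct lnode *next;
};

static struct lnode *_Atomic wake_list;

/* Returns true when this push made the list non-empty. */
static bool llist_push(struct lnode *node)
{
	struct lnode *old = atomic_load(&wake_list);

	do {
		node->next = old;            /* link ahead of the current head */
	} while (!atomic_compare_exchange_weak(&wake_list, &old, node));

	return old == NULL;
}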
-
-#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
-static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
-{
-       struct rq *rq;
-       int ret = 0;
-
-       rq = __task_rq_lock(p);
-       if (p->on_cpu) {
-               ttwu_activate(rq, p, ENQUEUE_WAKEUP);
-               ttwu_do_wakeup(rq, p, wake_flags);
-               ret = 1;
-       }
-       __task_rq_unlock(rq);
-
-       return ret;
-
-}
-#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
-#endif /* CONFIG_SMP */
-
-static void ttwu_queue(struct task_struct *p, int cpu)
-{
-       struct rq *rq = cpu_rq(cpu);
-
-#if defined(CONFIG_SMP)
-       if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
-               sched_clock_cpu(cpu); /* sync clocks x-cpu */
-               ttwu_queue_remote(p, cpu);
-               return;
-       }
-#endif
-
-       raw_spin_lock(&rq->lock);
-       ttwu_do_activate(rq, p, 0);
-       raw_spin_unlock(&rq->lock);
-}
-
-/**
- * try_to_wake_up - wake up a thread
- * @p: the thread to be awakened
- * @state: the mask of task states that can be woken
- * @wake_flags: wake modifier flags (WF_*)
- *
- * Put it on the run-queue if it's not already there. The "current"
- * thread is always on the run-queue (except when the actual
- * re-schedule is in progress), and as such you're allowed to do
- * the simpler "current->state = TASK_RUNNING" to mark yourself
- * runnable without the overhead of this.
- *
- * Returns %true if @p was woken up, %false if it was already running
- * or @state didn't match @p's state.
- */
-static int
-try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
-{
-       unsigned long flags;
-       int cpu, success = 0;
-
-       smp_wmb();
-       raw_spin_lock_irqsave(&p->pi_lock, flags);
-       if (!(p->state & state))
-               goto out;
-
-       success = 1; /* we're going to change ->state */
-       cpu = task_cpu(p);
-
-       if (p->on_rq && ttwu_remote(p, wake_flags))
-               goto stat;
-
-#ifdef CONFIG_SMP
-       /*
-        * If the owning (remote) cpu is still in the middle of schedule() with
-        * this task as prev, wait until it's done referencing the task.
-        */
-       while (p->on_cpu) {
-#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
-               /*
-                * In case the architecture enables interrupts in
-                * context_switch(), we cannot busy wait, since that
-                * would lead to deadlocks when an interrupt hits and
-                * tries to wake up @prev. So bail and do a complete
-                * remote wakeup.
-                */
-               if (ttwu_activate_remote(p, wake_flags))
-                       goto stat;
-#else
-               cpu_relax();
-#endif
-       }
-       /*
-        * Pairs with the smp_wmb() in finish_lock_switch().
-        */
-       smp_rmb();
-
-       p->sched_contributes_to_load = !!task_contributes_to_load(p);
-       p->state = TASK_WAKING;
-
-       if (p->sched_class->task_waking)
-               p->sched_class->task_waking(p);
-
-       cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
-       if (task_cpu(p) != cpu) {
-               wake_flags |= WF_MIGRATED;
-               set_task_cpu(p, cpu);
-       }
-#endif /* CONFIG_SMP */
-
-       ttwu_queue(p, cpu);
-stat:
-       ttwu_stat(p, cpu, wake_flags);
-out:
-       raw_spin_unlock_irqrestore(&p->pi_lock, flags);
-
-       return success;
-}
-
-/**
- * try_to_wake_up_local - try to wake up a local task with rq lock held
- * @p: the thread to be awakened
- *
- * Put @p on the run-queue if it's not already there. The caller must
- * ensure that this_rq() is locked, @p is bound to this_rq() and not
- * the current task.
- */
-static void try_to_wake_up_local(struct task_struct *p)
-{
-       struct rq *rq = task_rq(p);
-
-       BUG_ON(rq != this_rq());
-       BUG_ON(p == current);
-       lockdep_assert_held(&rq->lock);
-
-       if (!raw_spin_trylock(&p->pi_lock)) {
-               raw_spin_unlock(&rq->lock);
-               raw_spin_lock(&p->pi_lock);
-               raw_spin_lock(&rq->lock);
-       }
-
-       if (!(p->state & TASK_NORMAL))
-               goto out;
-
-       if (!p->on_rq)
-               ttwu_activate(rq, p, ENQUEUE_WAKEUP);
-
-       ttwu_do_wakeup(rq, p, 0);
-       ttwu_stat(p, smp_processor_id(), 0);
-out:
-       raw_spin_unlock(&p->pi_lock);
-}
-
-/**
- * wake_up_process - Wake up a specific process
- * @p: The process to be woken up.
- *
- * Attempt to wake up the nominated process and move it to the set of runnable
- * processes.  Returns 1 if the process was woken up, 0 if it was already
- * running.
- *
- * It may be assumed that this function implies a write memory barrier before
- * changing the task state if and only if any tasks are woken up.
- */
-int wake_up_process(struct task_struct *p)
-{
-       return try_to_wake_up(p, TASK_ALL, 0);
-}
-EXPORT_SYMBOL(wake_up_process);
-
-int wake_up_state(struct task_struct *p, unsigned int state)
-{
-       return try_to_wake_up(p, state, 0);
-}
-
-/*
- * Perform scheduler related setup for a newly forked process p.
- * p is forked by current.
- *
- * __sched_fork() is basic setup used by init_idle() too:
- */
-static void __sched_fork(struct task_struct *p)
-{
-       p->on_rq                        = 0;
-
-       p->se.on_rq                     = 0;
-       p->se.exec_start                = 0;
-       p->se.sum_exec_runtime          = 0;
-       p->se.prev_sum_exec_runtime     = 0;
-       p->se.nr_migrations             = 0;
-       p->se.vruntime                  = 0;
-       INIT_LIST_HEAD(&p->se.group_node);
-
-#ifdef CONFIG_SCHEDSTATS
-       memset(&p->se.statistics, 0, sizeof(p->se.statistics));
-#endif
-
-       INIT_LIST_HEAD(&p->rt.run_list);
-
-#ifdef CONFIG_PREEMPT_NOTIFIERS
-       INIT_HLIST_HEAD(&p->preempt_notifiers);
-#endif
-}
-
-/*
- * fork()/clone()-time setup:
- */
-void sched_fork(struct task_struct *p)
-{
-       unsigned long flags;
-       int cpu = get_cpu();
-
-       __sched_fork(p);
-       /*
-        * We mark the process as running here. This guarantees that
-        * nobody will actually run it, and a signal or other external
-        * event cannot wake it up and insert it on the runqueue either.
-        */
-       p->state = TASK_RUNNING;
-
-       /*
-        * Make sure we do not leak PI boosting priority to the child.
-        */
-       p->prio = current->normal_prio;
-
-       /*
-        * Revert to default priority/policy on fork if requested.
-        */
-       if (unlikely(p->sched_reset_on_fork)) {
-               if (task_has_rt_policy(p)) {
-                       p->policy = SCHED_NORMAL;
-                       p->static_prio = NICE_TO_PRIO(0);
-                       p->rt_priority = 0;
-               } else if (PRIO_TO_NICE(p->static_prio) < 0)
-                       p->static_prio = NICE_TO_PRIO(0);
-
-               p->prio = p->normal_prio = __normal_prio(p);
-               set_load_weight(p);
-
-               /*
-                * We don't need the reset flag anymore after the fork. It has
-                * fulfilled its duty:
-                */
-               p->sched_reset_on_fork = 0;
-       }
-
-       if (!rt_prio(p->prio))
-               p->sched_class = &fair_sched_class;
-
-       if (p->sched_class->task_fork)
-               p->sched_class->task_fork(p);
-
-       /*
-        * The child is not yet in the pid-hash so no cgroup attach races,
-        * and the cgroup is pinned to this child because cgroup_fork()
-        * is run before sched_fork().
-        *
-        * Silence PROVE_RCU.
-        */
-       raw_spin_lock_irqsave(&p->pi_lock, flags);
-       set_task_cpu(p, cpu);
-       raw_spin_unlock_irqrestore(&p->pi_lock, flags);
-
-#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
-       if (likely(sched_info_on()))
-               memset(&p->sched_info, 0, sizeof(p->sched_info));
-#endif
-#if defined(CONFIG_SMP)
-       p->on_cpu = 0;
-#endif
-#ifdef CONFIG_PREEMPT_COUNT
-       /* Want to start with kernel preemption disabled. */
-       task_thread_info(p)->preempt_count = 1;
-#endif
-#ifdef CONFIG_SMP
-       plist_node_init(&p->pushable_tasks, MAX_PRIO);
-#endif
-
-       put_cpu();
-}
-
-/*
- * wake_up_new_task - wake up a newly created task for the first time.
- *
- * This function will do some initial scheduler statistics housekeeping
- * that must be done for every newly created context, then puts the task
- * on the runqueue and wakes it.
- */
-void wake_up_new_task(struct task_struct *p)
-{
-       unsigned long flags;
-       struct rq *rq;
-
-       raw_spin_lock_irqsave(&p->pi_lock, flags);
-#ifdef CONFIG_SMP
-       /*
-        * Fork balancing, do it here and not earlier because:
-        *  - cpus_allowed can change in the fork path
-        *  - any previously selected cpu might disappear through hotplug
-        */
-       set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
-#endif
-
-       rq = __task_rq_lock(p);
-       activate_task(rq, p, 0);
-       p->on_rq = 1;
-       trace_sched_wakeup_new(p, true);
-       check_preempt_curr(rq, p, WF_FORK);
-#ifdef CONFIG_SMP
-       if (p->sched_class->task_woken)
-               p->sched_class->task_woken(rq, p);
-#endif
-       task_rq_unlock(rq, p, &flags);
-}
-
-#ifdef CONFIG_PREEMPT_NOTIFIERS
-
-/**
- * preempt_notifier_register - tell me when current is being preempted & rescheduled
- * @notifier: notifier struct to register
- */
-void preempt_notifier_register(struct preempt_notifier *notifier)
-{
-       hlist_add_head(&notifier->link, &current->preempt_notifiers);
-}
-EXPORT_SYMBOL_GPL(preempt_notifier_register);
-
-/**
- * preempt_notifier_unregister - no longer interested in preemption notifications
- * @notifier: notifier struct to unregister
- *
- * This is safe to call from within a preemption notifier.
- */
-void preempt_notifier_unregister(struct preempt_notifier *notifier)
-{
-       hlist_del(&notifier->link);
-}
-EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
-
-static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
-{
-       struct preempt_notifier *notifier;
-       struct hlist_node *node;
-
-       hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
-               notifier->ops->sched_in(notifier, raw_smp_processor_id());
-}
-
-static void
-fire_sched_out_preempt_notifiers(struct task_struct *curr,
-                                struct task_struct *next)
-{
-       struct preempt_notifier *notifier;
-       struct hlist_node *node;
-
-       hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
-               notifier->ops->sched_out(notifier, next);
-}
-
-#else /* !CONFIG_PREEMPT_NOTIFIERS */
-
-static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
-{
-}
-
-static void
-fire_sched_out_preempt_notifiers(struct task_struct *curr,
-                                struct task_struct *next)
-{
-}
-
-#endif /* CONFIG_PREEMPT_NOTIFIERS */
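A hedged sketch of how a client (KVM is the in-tree user) hooks into these notifiers; the my_* names are illustrative, and preempt_notifier_init() is assumed to come from <linux/preempt.h>:

#include <linux/preempt.h>
#include <linux/sched.h>

static void my_sched_in(struct preempt_notifier *pn, int cpu)
{
        /* current was just scheduled back in on @cpu */
}

static void my_sched_out(struct preempt_notifier *pn, struct task_struct *next)
{
        /* current is about to be preempted in favour of @next */
}

static struct preempt_notifier_ops my_preempt_ops = {
        .sched_in       = my_sched_in,
        .sched_out      = my_sched_out,
};

static struct preempt_notifier my_notifier;

static void my_attach_notifier(void)
{
        preempt_notifier_init(&my_notifier, &my_preempt_ops);
        preempt_notifier_register(&my_notifier);        /* registers for *current* */
}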
-
-/**
- * prepare_task_switch - prepare to switch tasks
- * @rq: the runqueue preparing to switch
- * @prev: the current task that is being switched out
- * @next: the task we are going to switch to.
- *
- * This is called with the rq lock held and interrupts off. It must
- * be paired with a subsequent finish_task_switch after the context
- * switch.
- *
- * prepare_task_switch sets up locking and calls architecture specific
- * hooks.
- */
-static inline void
-prepare_task_switch(struct rq *rq, struct task_struct *prev,
-                   struct task_struct *next)
-{
-       sched_info_switch(prev, next);
-       perf_event_task_sched_out(prev, next);
-       fire_sched_out_preempt_notifiers(prev, next);
-       prepare_lock_switch(rq, next);
-       prepare_arch_switch(next);
-       trace_sched_switch(prev, next);
-}
-
-/**
- * finish_task_switch - clean up after a task-switch
- * @rq: runqueue associated with task-switch
- * @prev: the thread we just switched away from.
- *
- * finish_task_switch must be called after the context switch, paired
- * with a prepare_task_switch call before the context switch.
- * finish_task_switch will reconcile locking set up by prepare_task_switch,
- * and do any other architecture-specific cleanup actions.
- *
- * Note that we may have delayed dropping an mm in context_switch(). If
- * so, we finish that here outside of the runqueue lock. (Doing it
- * with the lock held can cause deadlocks; see schedule() for
- * details.)
- */
-static void finish_task_switch(struct rq *rq, struct task_struct *prev)
-       __releases(rq->lock)
-{
-       struct mm_struct *mm = rq->prev_mm;
-       long prev_state;
-
-       rq->prev_mm = NULL;
-
-       /*
-        * A task struct has one reference for the use as "current".
-        * If a task dies, then it sets TASK_DEAD in tsk->state and calls
-        * schedule one last time. The schedule call will never return, and
-        * the scheduled task must drop that reference.
-        * The test for TASK_DEAD must occur while the runqueue locks are
-        * still held, otherwise prev could be scheduled on another cpu, die
-        * there before we look at prev->state, and then the reference would
-        * be dropped twice.
-        *              Manfred Spraul <manfred@colorfullife.com>
-        */
-       prev_state = prev->state;
-       finish_arch_switch(prev);
-#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
-       local_irq_disable();
-#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
-       perf_event_task_sched_in(prev, current);
-#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
-       local_irq_enable();
-#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
-       finish_lock_switch(rq, prev);
-
-       fire_sched_in_preempt_notifiers(current);
-       if (mm)
-               mmdrop(mm);
-       if (unlikely(prev_state == TASK_DEAD)) {
-               /*
-                * Remove function-return probe instances associated with this
-                * task and put them back on the free list.
-                */
-               kprobe_flush_task(prev);
-               put_task_struct(prev);
-       }
-}
-
-#ifdef CONFIG_SMP
-
-/* assumes rq->lock is held */
-static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
-{
-       if (prev->sched_class->pre_schedule)
-               prev->sched_class->pre_schedule(rq, prev);
-}
-
-/* rq->lock is NOT held, but preemption is disabled */
-static inline void post_schedule(struct rq *rq)
-{
-       if (rq->post_schedule) {
-               unsigned long flags;
-
-               raw_spin_lock_irqsave(&rq->lock, flags);
-               if (rq->curr->sched_class->post_schedule)
-                       rq->curr->sched_class->post_schedule(rq);
-               raw_spin_unlock_irqrestore(&rq->lock, flags);
-
-               rq->post_schedule = 0;
-       }
-}
-
-#else
-
-static inline void pre_schedule(struct rq *rq, struct task_struct *p)
-{
-}
-
-static inline void post_schedule(struct rq *rq)
-{
-}
-
-#endif
-
-/**
- * schedule_tail - first thing a freshly forked thread must call.
- * @prev: the thread we just switched away from.
- */
-asmlinkage void schedule_tail(struct task_struct *prev)
-       __releases(rq->lock)
-{
-       struct rq *rq = this_rq();
-
-       finish_task_switch(rq, prev);
-
-       /*
-        * FIXME: do we need to worry about rq being invalidated by the
-        * task_switch?
-        */
-       post_schedule(rq);
-
-#ifdef __ARCH_WANT_UNLOCKED_CTXSW
-       /* In this case, finish_task_switch does not reenable preemption */
-       preempt_enable();
-#endif
-       if (current->set_child_tid)
-               put_user(task_pid_vnr(current), current->set_child_tid);
-}
-
-/*
- * context_switch - switch to the new MM and the new
- * thread's register state.
- */
-static inline void
-context_switch(struct rq *rq, struct task_struct *prev,
-              struct task_struct *next)
-{
-       struct mm_struct *mm, *oldmm;
-
-       prepare_task_switch(rq, prev, next);
-
-       mm = next->mm;
-       oldmm = prev->active_mm;
-       /*
-        * For paravirt, this is coupled with an exit in switch_to to
-        * combine the page table reload and the switch backend into
-        * one hypercall.
-        */
-       arch_start_context_switch(prev);
-
-       if (!mm) {
-               next->active_mm = oldmm;
-               atomic_inc(&oldmm->mm_count);
-               enter_lazy_tlb(oldmm, next);
-       } else
-               switch_mm(oldmm, mm, next);
-
-       if (!prev->mm) {
-               prev->active_mm = NULL;
-               rq->prev_mm = oldmm;
-       }
-       /*
-        * The runqueue lock will be released by the next task
-        * (which is an invalid locking op, but in the case of
-        * the scheduler it's an obvious special case), so we
-        * do an early lockdep release here:
-        */
-#ifndef __ARCH_WANT_UNLOCKED_CTXSW
-       spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
-#endif
-
-       /* Here we just switch the register state and the stack. */
-       switch_to(prev, next, prev);
-
-       barrier();
-       /*
-        * this_rq must be evaluated again because prev may have moved
-        * CPUs since it called schedule(), thus the 'rq' on its stack
-        * frame will be invalid.
-        */
-       finish_task_switch(this_rq(), prev);
-}
-
-/*
- * nr_running, nr_uninterruptible and nr_context_switches:
- *
- * externally visible scheduler statistics: current number of runnable
- * threads, current number of uninterruptible-sleeping threads, total
- * number of context switches performed since bootup.
- */
-unsigned long nr_running(void)
-{
-       unsigned long i, sum = 0;
-
-       for_each_online_cpu(i)
-               sum += cpu_rq(i)->nr_running;
-
-       return sum;
-}
-
-unsigned long nr_uninterruptible(void)
-{
-       unsigned long i, sum = 0;
-
-       for_each_possible_cpu(i)
-               sum += cpu_rq(i)->nr_uninterruptible;
-
-       /*
-        * Since we read the counters lockless, it might be slightly
-        * inaccurate. Do not allow it to go below zero though:
-        */
-       if (unlikely((long)sum < 0))
-               sum = 0;
-
-       return sum;
-}
-
-unsigned long long nr_context_switches(void)
-{
-       int i;
-       unsigned long long sum = 0;
-
-       for_each_possible_cpu(i)
-               sum += cpu_rq(i)->nr_switches;
-
-       return sum;
-}
-
-unsigned long nr_iowait(void)
-{
-       unsigned long i, sum = 0;
-
-       for_each_possible_cpu(i)
-               sum += atomic_read(&cpu_rq(i)->nr_iowait);
-
-       return sum;
-}
-
-unsigned long nr_iowait_cpu(int cpu)
-{
-       struct rq *this = cpu_rq(cpu);
-       return atomic_read(&this->nr_iowait);
-}
-
-unsigned long this_cpu_load(void)
-{
-       struct rq *this = this_rq();
-       return this->cpu_load[0];
-}
-
-
-/* Variables and functions for calc_load */
-static atomic_long_t calc_load_tasks;
-static unsigned long calc_load_update;
-unsigned long avenrun[3];
-EXPORT_SYMBOL(avenrun);
-
-static long calc_load_fold_active(struct rq *this_rq)
-{
-       long nr_active, delta = 0;
-
-       nr_active = this_rq->nr_running;
-       nr_active += (long) this_rq->nr_uninterruptible;
-
-       if (nr_active != this_rq->calc_load_active) {
-               delta = nr_active - this_rq->calc_load_active;
-               this_rq->calc_load_active = nr_active;
-       }
-
-       return delta;
-}
-
-static unsigned long
-calc_load(unsigned long load, unsigned long exp, unsigned long active)
-{
-       load *= exp;
-       load += active * (FIXED_1 - exp);
-       load += 1UL << (FSHIFT - 1);
-       return load >> FSHIFT;
-}
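One worked instance of this fixed-point average, using the constants defined in <linux/sched.h> (FSHIFT = 11, hence FIXED_1 = 2048, and EXP_1 = 1884 for the 1-minute figure):

/*
 * Previous avenrun[0] == 0 and ten runnable tasks, so the caller passes
 * active = 10 * FIXED_1 = 20480:
 *
 *      calc_load(0, 1884, 20480)
 *              = (0 * 1884 + 20480 * (2048 - 1884) + 1024) >> 11
 *              = 3359744 >> 11
 *              = 1640          (prints as roughly 0.80 in /proc/loadavg)
 */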
-
-#ifdef CONFIG_NO_HZ
-/*
- * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
- *
- * When making the ILB scale, we should try to pull this in as well.
- */
-static atomic_long_t calc_load_tasks_idle;
-
-static void calc_load_account_idle(struct rq *this_rq)
-{
-       long delta;
-
-       delta = calc_load_fold_active(this_rq);
-       if (delta)
-               atomic_long_add(delta, &calc_load_tasks_idle);
-}
-
-static long calc_load_fold_idle(void)
-{
-       long delta = 0;
-
-       /*
-        * It's got a race; we don't care...
-        */
-       if (atomic_long_read(&calc_load_tasks_idle))
-               delta = atomic_long_xchg(&calc_load_tasks_idle, 0);
-
-       return delta;
-}
-
-/**
- * fixed_power_int - compute: x^n, in O(log n) time
- *
- * @x:         base of the power
- * @frac_bits: fractional bits of @x
- * @n:         power to raise @x to.
- *
- * By exploiting the relation between the definition of the natural power
- * function: x^n := x*x*...*x (x multiplied by itself for n times), and
- * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
- * (where: n_i \elem {0, 1}, the binary vector representing n),
- * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
- * of course trivially computable in O(log_2 n), the length of our binary
- * vector.
- */
-static unsigned long
-fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
-{
-       unsigned long result = 1UL << frac_bits;
-
-       if (n) for (;;) {
-               if (n & 1) {
-                       result *= x;
-                       result += 1UL << (frac_bits - 1);
-                       result >>= frac_bits;
-               }
-               n >>= 1;
-               if (!n)
-                       break;
-               x *= x;
-               x += 1UL << (frac_bits - 1);
-               x >>= frac_bits;
-       }
-
-       return result;
-}
-
-/*
- * a1 = a0 * e + a * (1 - e)
- *
- * a2 = a1 * e + a * (1 - e)
- *    = (a0 * e + a * (1 - e)) * e + a * (1 - e)
- *    = a0 * e^2 + a * (1 - e) * (1 + e)
- *
- * a3 = a2 * e + a * (1 - e)
- *    = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
- *    = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
- *
- *  ...
- *
- * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
- *    = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
- *    = a0 * e^n + a * (1 - e^n)
- *
- * [1] application of the geometric series:
- *
- *              n         1 - x^(n+1)
- *     S_n := \Sum x^i = -------------
- *             i=0          1 - x
- */
-static unsigned long
-calc_load_n(unsigned long load, unsigned long exp,
-           unsigned long active, unsigned int n)
-{
-
-       return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
-}
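To make the closed form concrete with the real constants: with active == 0 (a fully idle period) and n = 4 missed 5-second cycles, only the a0 * e^n term survives, and

/*
 *      fixed_power_int(EXP_1, FSHIFT, 4) == fixed_power_int(1884, 11, 4)
 *                                        == 1466   (1466/2048 ~= 0.716)
 *
 * so calc_load_n(load, EXP_1, 0, 4) decays an idle 1-minute average to
 * about 72% of its previous value in a single call, instead of iterating
 * calc_load() four times.
 */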
-
-/*
- * NO_HZ can leave us missing all per-cpu ticks calling
- * calc_load_account_active(), but since an idle CPU folds its delta into
- * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
- * in the pending idle delta if our idle period crossed a load cycle boundary.
- *
- * Once we've updated the global active value, we need to apply the exponential
- * weights adjusted to the number of cycles missed.
- */
-static void calc_global_nohz(unsigned long ticks)
-{
-       long delta, active, n;
-
-       if (time_before(jiffies, calc_load_update))
-               return;
-
-       /*
-        * If we crossed a calc_load_update boundary, make sure to fold
-        * any pending idle changes; the respective CPUs might have
-        * missed the tick driven calc_load_account_active() update
-        * due to NO_HZ.
-        */
-       delta = calc_load_fold_idle();
-       if (delta)
-               atomic_long_add(delta, &calc_load_tasks);
-
-       /*
-        * If we were idle for multiple load cycles, apply them.
-        */
-       if (ticks >= LOAD_FREQ) {
-               n = ticks / LOAD_FREQ;
-
-               active = atomic_long_read(&calc_load_tasks);
-               active = active > 0 ? active * FIXED_1 : 0;
-
-               avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
-               avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
-               avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
-
-               calc_load_update += n * LOAD_FREQ;
-       }
-
-       /*
-        * It's possible the remainder of the above division also crosses
-        * a LOAD_FREQ period, the regular check in calc_global_load()
-        * which comes after this will take care of that.
-        *
-        * Consider us being 11 ticks before a cycle completion, and us
-        * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will
-        * age us 4 cycles, and the test in calc_global_load() will
-        * pick up the final one.
-        */
-}
-#else
-static void calc_load_account_idle(struct rq *this_rq)
-{
-}
-
-static inline long calc_load_fold_idle(void)
-{
-       return 0;
-}
-
-static void calc_global_nohz(unsigned long ticks)
-{
-}
-#endif
-
-/**
- * get_avenrun - get the load average array
- * @loads:     pointer to dest load array
- * @offset:    offset to add
- * @shift:     shift count to shift the result left
- *
- * These values are estimates at best, so no need for locking.
- */
-void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
-{
-       loads[0] = (avenrun[0] + offset) << shift;
-       loads[1] = (avenrun[1] + offset) << shift;
-       loads[2] = (avenrun[2] + offset) << shift;
-}
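For reference, a sketch of the classic consumer of this helper, along the lines of the /proc/loadavg seq_file handler; my_loadavg_show is a made-up name, while LOAD_INT()/LOAD_FRAC() and the FIXED_1/200 rounding bias are assumed from <linux/sched.h> and the real proc code:

#include <linux/sched.h>
#include <linux/seq_file.h>

static int my_loadavg_show(struct seq_file *m, void *v)
{
        unsigned long avnrun[3];

        get_avenrun(avnrun, FIXED_1/200, 0);    /* +1/200 biases the truncation below */
        seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu\n",
                   LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
                   LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
                   LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]));
        return 0;
}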
-
-/*
- * calc_global_load - update the avenrun load estimates 10 ticks after the
- * CPUs have updated calc_load_tasks.
- */
-void calc_global_load(unsigned long ticks)
-{
-       long active;
-
-       calc_global_nohz(ticks);
-
-       if (time_before(jiffies, calc_load_update + 10))
-               return;
-
-       active = atomic_long_read(&calc_load_tasks);
-       active = active > 0 ? active * FIXED_1 : 0;
-
-       avenrun[0] = calc_load(avenrun[0], EXP_1, active);
-       avenrun[1] = calc_load(avenrun[1], EXP_5, active);
-       avenrun[2] = calc_load(avenrun[2], EXP_15, active);
-
-       calc_load_update += LOAD_FREQ;
-}
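As a cross-check on the constants being used here: the EXP_* weights in <linux/sched.h> are FIXED_1 * e^(-update interval / averaging window), with one update every LOAD_FREQ = 5*HZ+1 ticks, i.e. roughly every 5 seconds:

/*
 *      EXP_1  ~= 2048 * e^(-5/60)  ~= 1884.2  ->  1884
 *      EXP_5  ~= 2048 * e^(-5/300) ~= 2014.1  ->  2014
 *      EXP_15 ~= 2048 * e^(-5/900) ~= 2036.6  ->  2037
 */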
-
-/*
- * Called from update_cpu_load() to periodically update this CPU's
- * active count.
- */
-static void calc_load_account_active(struct rq *this_rq)
-{
-       long delta;
-
-       if (time_before(jiffies, this_rq->calc_load_update))
-               return;
-
-       delta  = calc_load_fold_active(this_rq);
-       delta += calc_load_fold_idle();
-       if (delta)
-               atomic_long_add(delta, &calc_load_tasks);
-
-       this_rq->calc_load_update += LOAD_FREQ;
-}
-
-/*
- * The exact cpuload at various idx values, calculated at every tick would be
- * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
- *
- * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
- * on nth tick when cpu may be busy, then we have:
- * load = ((2^idx - 1) / 2^idx)^(n-1) * load
- * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
- *
- * decay_load_missed() below does efficient calculation of
- * load = ((2^idx - 1) / 2^idx)^(n-1) * load
- * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
- *
- * The calculation is approximated on a 128 point scale.
- * degrade_zero_ticks is the number of ticks after which load at any
- * particular idx is approximated to be zero.
- * degrade_factor is a precomputed table, a row for each load idx.
- * Each column corresponds to degradation factor for a power of two ticks,
- * based on 128 point scale.
- * Example:
- * row 2, col 3 (=12) says that the degradation at load idx 2 after
- * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
- *
- * With this power of 2 load factors, we can degrade the load n times
- * by looking at 1 bits in n and doing as many mult/shift instead of
- * n mult/shifts needed by the exact degradation.
- */
-#define DEGRADE_SHIFT          7
-static const unsigned char
-               degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
-static const unsigned char
-               degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
-                                       {0, 0, 0, 0, 0, 0, 0, 0},
-                                       {64, 32, 8, 0, 0, 0, 0, 0},
-                                       {96, 72, 40, 12, 1, 0, 0},
-                                       {112, 98, 75, 43, 15, 1, 0},
-                                       {120, 112, 98, 76, 45, 16, 2} };
-
-/*
- * Update cpu_load for any missed ticks due to tickless idle. The backlog
- * accrues while the CPU is idle, so we just decay the old load without
- * adding any new load.
- */
-static unsigned long
-decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
-{
-       int j = 0;
-
-       if (!missed_updates)
-               return load;
-
-       if (missed_updates >= degrade_zero_ticks[idx])
-               return 0;
-
-       if (idx == 1)
-               return load >> missed_updates;
-
-       while (missed_updates) {
-               if (missed_updates % 2)
-                       load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
-
-               missed_updates >>= 1;
-               j++;
-       }
-       return load;
-}
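Tying this back to the table above: a CPU that was idle for 8 ticks has a single bit set in missed_updates (binary 1000), so exactly one column, column 3, gets applied:

/*
 *      decay_load_missed(load, 8, 2)
 *              == (load * degrade_factor[2][3]) >> DEGRADE_SHIFT
 *              == load * 12 / 128              (~ 0.094 * load)
 *
 * versus the exact factor (3/4)^8 = 6561/65536 ~= 0.100, i.e. one
 * multiply/shift instead of eight.
 */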
-
-/*
- * Update rq->cpu_load[] statistics. This function is usually called every
- * scheduler tick (TICK_NSEC). With tickless idle this will not be called
- * every tick. We fix it up based on jiffies.
- */
-static void update_cpu_load(struct rq *this_rq)
-{
-       unsigned long this_load = this_rq->load.weight;
-       unsigned long curr_jiffies = jiffies;
-       unsigned long pending_updates;
-       int i, scale;
-
-       this_rq->nr_load_updates++;
-
-       /* Avoid repeated calls on same jiffy, when moving in and out of idle */
-       if (curr_jiffies == this_rq->last_load_update_tick)
-               return;
-
-       pending_updates = curr_jiffies - this_rq->last_load_update_tick;
-       this_rq->last_load_update_tick = curr_jiffies;
-
-       /* Update our load: */
-       this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
-       for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
-               unsigned long old_load, new_load;
-
-               /* scale is effectively 1 << i now, and >> i divides by scale */
-
-               old_load = this_rq->cpu_load[i];
-               old_load = decay_load_missed(old_load, pending_updates - 1, i);
-               new_load = this_load;
-               /*
-                * Round up the averaging division if load is increasing. This
-                * prevents us from getting stuck on 9 if the load is 10, for
-                * example.
-                */
-               if (new_load > old_load)
-                       new_load += scale - 1;
-
-               this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
-       }
-
-       sched_avg_update(this_rq);
-}
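Written out for a single slot, the loop above keeps a per-index geometric average; for idx = 2 (scale = 4) a normal tick does, in effect:

/*
 *      cpu_load[2] = (3 * cpu_load[2] + this_load) >> 2
 *
 * with the "scale - 1" rounding term (here +3) added to this_load first
 * whenever the load is rising, so the division rounds up rather than
 * getting stuck just below the new value.
 */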
-
-static void update_cpu_load_active(struct rq *this_rq)
-{
-       update_cpu_load(this_rq);
-
-       calc_load_account_active(this_rq);
-}
-
-#ifdef CONFIG_SMP
-
-/*
- * sched_exec - execve() is a valuable balancing opportunity, because at
- * this point the task has the smallest effective memory and cache footprint.
- */
-void sched_exec(void)
-{
-       struct task_struct *p = current;
-       unsigned long flags;
-       int dest_cpu;
-
-       raw_spin_lock_irqsave(&p->pi_lock, flags);
-       dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
-       if (dest_cpu == smp_processor_id())
-               goto unlock;
-
-       if (likely(cpu_active(dest_cpu))) {
-               struct migration_arg arg = { p, dest_cpu };
-
-               raw_spin_unlock_irqrestore(&p->pi_lock, flags);
-               stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
-               return;
-       }
-unlock:
-       raw_spin_unlock_irqrestore(&p->pi_lock, flags);
-}
-
-#endif
-
-DEFINE_PER_CPU(struct kernel_stat, kstat);
-
-EXPORT_PER_CPU_SYMBOL(kstat);
-
-/*
- * Return any ns on the sched_clock that have not yet been accounted in
- * @p in case that task is currently running.
- *
- * Called with task_rq_lock() held on @rq.
- */
-static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
-{
-       u64 ns = 0;
-
-       if (task_current(rq, p)) {
-               update_rq_clock(rq);
-               ns = rq->clock_task - p->se.exec_start;
-               if ((s64)ns < 0)
-                       ns = 0;
-       }
-
-       return ns;
-}
-
-unsigned long long task_delta_exec(struct task_struct *p)
-{
-       unsigned long flags;
-       struct rq *rq;
-       u64 ns = 0;
-
-       rq = task_rq_lock(p, &flags);
-       ns = do_task_delta_exec(p, rq);
-       task_rq_unlock(rq, p, &flags);
-
-       return ns;
-}
-
-/*
- * Return accounted runtime for the task.
- * In case the task is currently running, return the runtime plus the
- * pending runtime that has not been accounted yet.
- */
-unsigned long long task_sched_runtime(struct task_struct *p)
-{
-       unsigned long flags;
-       struct rq *rq;
-       u64 ns = 0;
-
-       rq = task_rq_lock(p, &flags);
-       ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
-       task_rq_unlock(rq, p, &flags);
-
-       return ns;
-}
-
-/*
- * Account user cpu time to a process.
- * @p: the process that the cpu time gets accounted to
- * @cputime: the cpu time spent in user space since the last update
- * @cputime_scaled: cputime scaled by cpu frequency
- */
-void account_user_time(struct task_struct *p, cputime_t cputime,
-                      cputime_t cputime_scaled)
-{
-       struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
-       cputime64_t tmp;
-
-       /* Add user time to process. */
-       p->utime = cputime_add(p->utime, cputime);
-       p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
-       account_group_user_time(p, cputime);
-
-       /* Add user time to cpustat. */
-       tmp = cputime_to_cputime64(cputime);
-       if (TASK_NICE(p) > 0)
-               cpustat->nice = cputime64_add(cpustat->nice, tmp);
-       else
-               cpustat->user = cputime64_add(cpustat->user, tmp);
-
-       cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
-       /* Account for user time used */
-       acct_update_integrals(p);
-}
-
-/*
- * Account guest cpu time to a process.
- * @p: the process that the cpu time gets accounted to
- * @cputime: the cpu time spent in virtual machine since the last update
- * @cputime_scaled: cputime scaled by cpu frequency
- */
-static void account_guest_time(struct task_struct *p, cputime_t cputime,
-                              cputime_t cputime_scaled)
-{
-       cputime64_t tmp;
-       struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
-
-       tmp = cputime_to_cputime64(cputime);
-
-       /* Add guest time to process. */
-       p->utime = cputime_add(p->utime, cputime);
-       p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
-       account_group_user_time(p, cputime);
-       p->gtime = cputime_add(p->gtime, cputime);
-
-       /* Add guest time to cpustat. */
-       if (TASK_NICE(p) > 0) {
-               cpustat->nice = cputime64_add(cpustat->nice, tmp);
-               cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp);
-       } else {
-               cpustat->user = cputime64_add(cpustat->user, tmp);
-               cpustat->guest = cputime64_add(cpustat->guest, tmp);
-       }
-}
-
-/*
- * Account system cpu time to a process and desired cpustat field
- * @p: the process that the cpu time gets accounted to
- * @cputime: the cpu time spent in kernel space since the last update
- * @cputime_scaled: cputime scaled by cpu frequency
- * @target_cputime64: pointer to cpustat field that has to be updated
- */
-static inline
-void __account_system_time(struct task_struct *p, cputime_t cputime,
-                       cputime_t cputime_scaled, cputime64_t *target_cputime64)
-{
-       cputime64_t tmp = cputime_to_cputime64(cputime);
-
-       /* Add system time to process. */
-       p->stime = cputime_add(p->stime, cputime);
-       p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
-       account_group_system_time(p, cputime);
-
-       /* Add system time to cpustat. */
-       *target_cputime64 = cputime64_add(*target_cputime64, tmp);
-       cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
-
-       /* Account for system time used */
-       acct_update_integrals(p);
-}
-
-/*
- * Account system cpu time to a process.
- * @p: the process that the cpu time gets accounted to
- * @hardirq_offset: the offset to subtract from hardirq_count()
- * @cputime: the cpu time spent in kernel space since the last update
- * @cputime_scaled: cputime scaled by cpu frequency
- */
-void account_system_time(struct task_struct *p, int hardirq_offset,
-                        cputime_t cputime, cputime_t cputime_scaled)
-{
-       struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
-       cputime64_t *target_cputime64;
-
-       if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
-               account_guest_time(p, cputime, cputime_scaled);
-               return;
-       }
-
-       if (hardirq_count() - hardirq_offset)
-               target_cputime64 = &cpustat->irq;
-       else if (in_serving_softirq())
-               target_cputime64 = &cpustat->softirq;
-       else
-               target_cputime64 = &cpustat->system;
-
-       __account_system_time(p, cputime, cputime_scaled, target_cputime64);
-}
-
-/*
- * Account for involuntary wait time.
- * @cputime: the cpu time spent in involuntary wait
- */
-void account_steal_time(cputime_t cputime)
-{
-       struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
-       cputime64_t cputime64 = cputime_to_cputime64(cputime);
-
-       cpustat->steal = cputime64_add(cpustat->steal, cputime64);
-}
-
-/*
- * Account for idle time.
- * @cputime: the cpu time spent in idle wait
- */
-void account_idle_time(cputime_t cputime)
-{
-       struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
-       cputime64_t cputime64 = cputime_to_cputime64(cputime);
-       struct rq *rq = this_rq();
-
-       if (atomic_read(&rq->nr_iowait) > 0)
-               cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);
-       else
-               cpustat->idle = cputime64_add(cpustat->idle, cputime64);
-}
-
-static __always_inline bool steal_account_process_tick(void)
-{
-#ifdef CONFIG_PARAVIRT
-       if (static_branch(&paravirt_steal_enabled)) {
-               u64 steal, st = 0;
-
-               steal = paravirt_steal_clock(smp_processor_id());
-               steal -= this_rq()->prev_steal_time;
-
-               st = steal_ticks(steal);
-               this_rq()->prev_steal_time += st * TICK_NSEC;
-
-               account_steal_time(st);
-               return st;
-       }
-#endif
-       return false;
-}
-
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING
-
-#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-/*
- * Account a tick to a process and cpustat
- * @p: the process that the cpu time gets accounted to
- * @user_tick: is the tick from userspace
- * @rq: the pointer to rq
- *
- * Tick demultiplexing follows the order
- * - pending hardirq update
- * - pending softirq update
- * - user_time
- * - idle_time
- * - system time
- *   - check for guest_time
- *   - else account as system_time
- *
- * The check for hardirq is done for both system and user time, as there is
- * no timer interrupt going off while we are in a hardirq and hence we may
- * never get an opportunity to account it solely as system time.
- * p->stime and friends are only updated on system time and not on hardirq/
- * softirq time, as those no longer count towards the task's exec_runtime.
- */
-static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
-                                               struct rq *rq)
-{
-       cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
-       cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
-       struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
-
-       if (steal_account_process_tick())
-               return;
-
-       if (irqtime_account_hi_update()) {
-               cpustat->irq = cputime64_add(cpustat->irq, tmp);
-       } else if (irqtime_account_si_update()) {
-               cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
-       } else if (this_cpu_ksoftirqd() == p) {
-               /*
-                * ksoftirqd time does not get accounted in cpu_softirq_time.
-                * So, we have to handle it separately here.
-                * Also, p->stime needs to be updated for ksoftirqd.
-                */
-               __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
-                                       &cpustat->softirq);
-       } else if (user_tick) {
-               account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
-       } else if (p == rq->idle) {
-               account_idle_time(cputime_one_jiffy);
-       } else if (p->flags & PF_VCPU) { /* System time or guest time */
-               account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
-       } else {
-               __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
-                                       &cpustat->system);
-       }
-}
-
-static void irqtime_account_idle_ticks(int ticks)
-{
-       int i;
-       struct rq *rq = this_rq();
-
-       for (i = 0; i < ticks; i++)
-               irqtime_account_process_tick(current, 0, rq);
-}
-#else /* CONFIG_IRQ_TIME_ACCOUNTING */
-static void irqtime_account_idle_ticks(int ticks) {}
-static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
-                                               struct rq *rq) {}
-#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
-
-/*
- * Account a single tick of cpu time.
- * @p: the process that the cpu time gets accounted to
- * @user_tick: indicates if the tick is a user or a system tick
- */
-void account_process_tick(struct task_struct *p, int user_tick)
-{
-       cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
-       struct rq *rq = this_rq();
-
-       if (sched_clock_irqtime) {
-               irqtime_account_process_tick(p, user_tick, rq);
-               return;
-       }
-
-       if (steal_account_process_tick())
-               return;
-
-       if (user_tick)
-               account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
-       else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
-               account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
-                                   one_jiffy_scaled);
-       else
-               account_idle_time(cputime_one_jiffy);
-}
-
-/*
- * Account multiple ticks of steal time.
- * @ticks: number of stolen ticks
- */
-void account_steal_ticks(unsigned long ticks)
-{
-       account_steal_time(jiffies_to_cputime(ticks));
-}
-
-/*
- * Account multiple ticks of idle time.
- * @ticks: number of idle ticks
- */
-void account_idle_ticks(unsigned long ticks)
-{
-
-       if (sched_clock_irqtime) {
-               irqtime_account_idle_ticks(ticks);
-               return;
-       }
-
-       account_idle_time(jiffies_to_cputime(ticks));
-}
-
-#endif
-
-/*
- * Use precise platform statistics if available:
- */
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
-void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
-{
-       *ut = p->utime;
-       *st = p->stime;
-}
-
-void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
-{
-       struct task_cputime cputime;
-
-       thread_group_cputime(p, &cputime);
-
-       *ut = cputime.utime;
-       *st = cputime.stime;
-}
-#else
-
-#ifndef nsecs_to_cputime
-# define nsecs_to_cputime(__nsecs)     nsecs_to_jiffies(__nsecs)
-#endif
-
-void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
-{
-       cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime);
-
-       /*
-        * Use CFS's precise accounting:
-        */
-       rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
-
-       if (total) {
-               u64 temp = rtime;
-
-               temp *= utime;
-               do_div(temp, total);
-               utime = (cputime_t)temp;
-       } else
-               utime = rtime;
-
-       /*
-        * Compare with previous values, to keep monotonicity:
-        */
-       p->prev_utime = max(p->prev_utime, utime);
-       p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime));
-
-       *ut = p->prev_utime;
-       *st = p->prev_stime;
-}
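A small numeric instance of the scaling above, with made-up values: suppose CFS says the task really ran rtime = 100 cputime units, while the tick-sampled split is utime = 30 and stime = 10 (total = 40), and no larger previous values exist:

/*
 *      utime      = 100 * 30 / 40             = 75
 *      prev_utime = max(prev_utime, 75)       = 75
 *      prev_stime = max(prev_stime, 100 - 75) = 25
 *
 * i.e. the sampled 30:10 ratio is rescaled so that the reported
 * utime + stime matches the precise sum_exec_runtime.
 */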
-
-/*
- * Must be called with siglock held.
- */
-void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
-{
-       struct signal_struct *sig = p->signal;
-       struct task_cputime cputime;
-       cputime_t rtime, utime, total;
-
-       thread_group_cputime(p, &cputime);
-
-       total = cputime_add(cputime.utime, cputime.stime);
-       rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
-
-       if (total) {
-               u64 temp = rtime;
-
-               temp *= cputime.utime;
-               do_div(temp, total);
-               utime = (cputime_t)temp;
-       } else
-               utime = rtime;
-
-       sig->prev_utime = max(sig->prev_utime, utime);
-       sig->prev_stime = max(sig->prev_stime,
-                             cputime_sub(rtime, sig->prev_utime));
-
-       *ut = sig->prev_utime;
-       *st = sig->prev_stime;
-}
-#endif
-
-/*
- * This function gets called by the timer code, with HZ frequency.
- * We call it with interrupts disabled.
- */
-void scheduler_tick(void)
-{
-       int cpu = smp_processor_id();
-       struct rq *rq = cpu_rq(cpu);
-       struct task_struct *curr = rq->curr;
-
-       sched_clock_tick();
-
-       raw_spin_lock(&rq->lock);
-       update_rq_clock(rq);
-       update_cpu_load_active(rq);
-       curr->sched_class->task_tick(rq, curr, 0);
-       raw_spin_unlock(&rq->lock);
-
-       perf_event_task_tick();
-
-#ifdef CONFIG_SMP
-       rq->idle_balance = idle_cpu(cpu);
-       trigger_load_balance(rq, cpu);
-#endif
-}
-
-notrace unsigned long get_parent_ip(unsigned long addr)
-{
-       if (in_lock_functions(addr)) {
-               addr = CALLER_ADDR2;
-               if (in_lock_functions(addr))
-                       addr = CALLER_ADDR3;
-       }
-       return addr;
-}
-
-#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
-                               defined(CONFIG_PREEMPT_TRACER))
-
-void __kprobes add_preempt_count(int val)
-{
-#ifdef CONFIG_DEBUG_PREEMPT
-       /*
-        * Underflow?
-        */
-       if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
-               return;
-#endif
-       preempt_count() += val;
-#ifdef CONFIG_DEBUG_PREEMPT
-       /*
-        * Spinlock count overflowing soon?
-        */
-       DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
-                               PREEMPT_MASK - 10);
-#endif
-       if (preempt_count() == val)
-               trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
-}
-EXPORT_SYMBOL(add_preempt_count);
-
-void __kprobes sub_preempt_count(int val)
-{
-#ifdef CONFIG_DEBUG_PREEMPT
-       /*
-        * Underflow?
-        */
-       if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
-               return;
-       /*
-        * Is the spinlock portion underflowing?
-        */
-       if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
-                       !(preempt_count() & PREEMPT_MASK)))
-               return;
-#endif
-
-       if (preempt_count() == val)
-               trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
-       preempt_count() -= val;
-}
-EXPORT_SYMBOL(sub_preempt_count);
-
-#endif
-
-/*
- * Print scheduling while atomic bug:
- */
-static noinline void __schedule_bug(struct task_struct *prev)
-{
-       struct pt_regs *regs = get_irq_regs();
-
-       printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
-               prev->comm, prev->pid, preempt_count());
-
-       debug_show_held_locks(prev);
-       print_modules();
-       if (irqs_disabled())
-               print_irqtrace_events(prev);
-
-       if (regs)
-               show_regs(regs);
-       else
-               dump_stack();
-}
-
-/*
- * Various schedule()-time debugging checks and statistics:
- */
-static inline void schedule_debug(struct task_struct *prev)
-{
-       /*
-        * Test if we are atomic. Since do_exit() needs to call into
-        * schedule() atomically, we ignore that path for now.
-        * Otherwise, whine if we are scheduling when we should not be.
-        */
-       if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
-               __schedule_bug(prev);
-       rcu_sleep_check();
-
-       profile_hit(SCHED_PROFILING, __builtin_return_address(0));
-
-       schedstat_inc(this_rq(), sched_count);
-}
-
-static void put_prev_task(struct rq *rq, struct task_struct *prev)
-{
-       if (prev->on_rq || rq->skip_clock_update < 0)
-               update_rq_clock(rq);
-       prev->sched_class->put_prev_task(rq, prev);
-}
-
-/*
- * Pick up the highest-prio task:
- */
-static inline struct task_struct *
-pick_next_task(struct rq *rq)
-{
-       const struct sched_class *class;
-       struct task_struct *p;
-
-       /*
-        * Optimization: we know that if all tasks are in
-        * the fair class we can call that function directly:
-        */
-       if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
-               p = fair_sched_class.pick_next_task(rq);
-               if (likely(p))
-                       return p;
-       }
-
-       for_each_class(class) {
-               p = class->pick_next_task(rq);
-               if (p)
-                       return p;
-       }
-
-       BUG(); /* the idle class will always have a runnable task */
-}
-
-/*
- * __schedule() is the main scheduler function.
- */
-static void __sched __schedule(void)
-{
-       struct task_struct *prev, *next;
-       unsigned long *switch_count;
-       struct rq *rq;
-       int cpu;
-
-need_resched:
-       preempt_disable();
-       cpu = smp_processor_id();
-       rq = cpu_rq(cpu);
-       rcu_note_context_switch(cpu);
-       prev = rq->curr;
-
-       schedule_debug(prev);
-
-       if (sched_feat(HRTICK))
-               hrtick_clear(rq);
-
-       raw_spin_lock_irq(&rq->lock);
-
-       switch_count = &prev->nivcsw;
-       if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
-               if (unlikely(signal_pending_state(prev->state, prev))) {
-                       prev->state = TASK_RUNNING;
-               } else {
-                       deactivate_task(rq, prev, DEQUEUE_SLEEP);
-                       prev->on_rq = 0;
-
-                       /*
-                        * If a worker went to sleep, notify and ask workqueue
-                        * whether it wants to wake up a task to maintain
-                        * concurrency.
-                        */
-                       if (prev->flags & PF_WQ_WORKER) {
-                               struct task_struct *to_wakeup;
-
-                               to_wakeup = wq_worker_sleeping(prev, cpu);
-                               if (to_wakeup)
-                                       try_to_wake_up_local(to_wakeup);
-                       }
-               }
-               switch_count = &prev->nvcsw;
-       }
-
-       pre_schedule(rq, prev);
-
-       if (unlikely(!rq->nr_running))
-               idle_balance(cpu, rq);
-
-       put_prev_task(rq, prev);
-       next = pick_next_task(rq);
-       clear_tsk_need_resched(prev);
-       rq->skip_clock_update = 0;
-
-       if (likely(prev != next)) {
-               rq->nr_switches++;
-               rq->curr = next;
-               ++*switch_count;
-
-               context_switch(rq, prev, next); /* unlocks the rq */
-               /*
-                * The context switch has flipped the stack from under us
-                * and restored the local variables which were saved when
-                * this task called schedule() in the past. prev == current
-                * is still correct, but it can be moved to another cpu/rq.
-                */
-               cpu = smp_processor_id();
-               rq = cpu_rq(cpu);
-       } else
-               raw_spin_unlock_irq(&rq->lock);
-
-       post_schedule(rq);
-
-       preempt_enable_no_resched();
-       if (need_resched())
-               goto need_resched;
-}
-
-static inline void sched_submit_work(struct task_struct *tsk)
-{
-       if (!tsk->state)
-               return;
-       /*
-        * If we are going to sleep and we have plugged IO queued,
-        * make sure to submit it to avoid deadlocks.
-        */
-       if (blk_needs_flush_plug(tsk))
-               blk_schedule_flush_plug(tsk);
-}
-
-asmlinkage void __sched schedule(void)
-{
-       struct task_struct *tsk = current;
-
-       sched_submit_work(tsk);
-       __schedule();
-}
-EXPORT_SYMBOL(schedule);
-
-#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
-
-static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
-{
-       if (lock->owner != owner)
-               return false;
-
-       /*
-        * Ensure we emit the owner->on_cpu dereference _after_ checking
-        * that lock->owner still matches owner. If that fails, owner might
-        * point to free()d memory; if it still matches, the rcu_read_lock()
-        * ensures the memory stays valid.
-        */
-       barrier();
-
-       return owner->on_cpu;
-}
-
-/*
- * Look out! "owner" is an entirely speculative pointer
- * access and not reliable.
- */
-int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
-{
-       if (!sched_feat(OWNER_SPIN))
-               return 0;
-
-       rcu_read_lock();
-       while (owner_running(lock, owner)) {
-               if (need_resched())
-                       break;
-
-               arch_mutex_cpu_relax();
-       }
-       rcu_read_unlock();
-
-       /*
-        * We break out the loop above on need_resched() and when the
-        * owner changed, which is a sign for heavy contention. Return
-        * success only when lock->owner is NULL.
-        */
-       return lock->owner == NULL;
-}
-#endif
-
-#ifdef CONFIG_PREEMPT
-/*
- * This is the entry point to schedule() for in-kernel preemption coming
- * from preempt_enable(). Kernel preemptions triggered by a return from
- * interrupt are handled separately by preempt_schedule_irq() below.
- */
-asmlinkage void __sched notrace preempt_schedule(void)
-{
-       struct thread_info *ti = current_thread_info();
-
-       /*
-        * If there is a non-zero preempt_count or interrupts are disabled,
-        * we do not want to preempt the current task. Just return..
-        */
-       if (likely(ti->preempt_count || irqs_disabled()))
-               return;
-
-       do {
-               add_preempt_count_notrace(PREEMPT_ACTIVE);
-               __schedule();
-               sub_preempt_count_notrace(PREEMPT_ACTIVE);
-
-               /*
-                * Check again in case we missed a preemption opportunity
-                * between schedule and now.
-                */
-               barrier();
-       } while (need_resched());
-}
-EXPORT_SYMBOL(preempt_schedule);
-
-/*
- * this is the entry point to schedule() from kernel preemption
- * off of irq context.
- * Note that this is called and returns with irqs disabled. This
- * protects us against recursive calls from irq context.
- */
-asmlinkage void __sched preempt_schedule_irq(void)
-{
-       struct thread_info *ti = current_thread_info();
-
-       /* Catch callers which need to be fixed */
-       BUG_ON(ti->preempt_count || !irqs_disabled());
-
-       do {
-               add_preempt_count(PREEMPT_ACTIVE);
-               local_irq_enable();
-               __schedule();
-               local_irq_disable();
-               sub_preempt_count(PREEMPT_ACTIVE);
-
-               /*
-                * Check again in case we missed a preemption opportunity
-                * between schedule and now.
-                */
-               barrier();
-       } while (need_resched());
-}
-
-#endif /* CONFIG_PREEMPT */
-
-int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
-                         void *key)
-{
-       return try_to_wake_up(curr->private, mode, wake_flags);
-}
-EXPORT_SYMBOL(default_wake_function);
-
-/*
- * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
- * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
- * number) then we wake all the non-exclusive tasks and one exclusive task.
- *
- * There are circumstances in which we can try to wake a task which has already
- * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
- * zero in this (rare) case, and we handle it by continuing to scan the queue.
- */
-static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
-                       int nr_exclusive, int wake_flags, void *key)
-{
-       wait_queue_t *curr, *next;
-
-       list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
-               unsigned flags = curr->flags;
-
-               if (curr->func(curr, mode, wake_flags, key) &&
-                               (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
-                       break;
-       }
-}
-
-/**
- * __wake_up - wake up threads blocked on a waitqueue.
- * @q: the waitqueue
- * @mode: which threads
- * @nr_exclusive: how many wake-one or wake-many threads to wake up
- * @key: is directly passed to the wakeup function
- *
- * It may be assumed that this function implies a write memory barrier before
- * changing the task state if and only if any tasks are woken up.
- */
-void __wake_up(wait_queue_head_t *q, unsigned int mode,
-                       int nr_exclusive, void *key)
-{
-       unsigned long flags;
-
-       spin_lock_irqsave(&q->lock, flags);
-       __wake_up_common(q, mode, nr_exclusive, 0, key);
-       spin_unlock_irqrestore(&q->lock, flags);
-}
-EXPORT_SYMBOL(__wake_up);
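For reference, a sketch of the usual way this gets used via the wait_event()/wake_up() wrappers from <linux/wait.h>; wake_up() expands to __wake_up(q, TASK_NORMAL, 1, NULL), and the my_* names are illustrative only:

#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(my_waitq);
static int my_event_ready;

static void my_wait_side(void)
{
        /* sleeps (TASK_UNINTERRUPTIBLE) until the condition becomes true */
        wait_event(my_waitq, my_event_ready != 0);
}

static void my_wake_side(void)
{
        my_event_ready = 1;
        wake_up(&my_waitq);     /* wakes all non-exclusive waiters, plus one exclusive one */
}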
-
-/*
- * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
- */
-void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
-{
-       __wake_up_common(q, mode, 1, 0, NULL);
-}
-EXPORT_SYMBOL_GPL(__wake_up_locked);
-
-void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
-{
-       __wake_up_common(q, mode, 1, 0, key);
-}
-EXPORT_SYMBOL_GPL(__wake_up_locked_key);
-
-/**
- * __wake_up_sync_key - wake up threads blocked on a waitqueue.
- * @q: the waitqueue
- * @mode: which threads
- * @nr_exclusive: how many wake-one or wake-many threads to wake up
- * @key: opaque value to be passed to wakeup targets
- *
- * The sync wakeup differs in that the waker knows that it will schedule
- * away soon, so while the target thread will be woken up, it will not
- * be migrated to another CPU - i.e. the two threads are 'synchronized'
- * with each other. This can prevent needless bouncing between CPUs.
- *
- * On UP it can prevent extra preemption.
- *
- * It may be assumed that this function implies a write memory barrier before
- * changing the task state if and only if any tasks are woken up.
- */
-void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
-                       int nr_exclusive, void *key)
-{
-       unsigned long flags;
-       int wake_flags = WF_SYNC;
-
-       if (unlikely(!q))
-               return;
-
-       if (unlikely(!nr_exclusive))
-               wake_flags = 0;
-
-       spin_lock_irqsave(&q->lock, flags);
-       __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
-       spin_unlock_irqrestore(&q->lock, flags);
-}
-EXPORT_SYMBOL_GPL(__wake_up_sync_key);
-
-/*
- * __wake_up_sync - see __wake_up_sync_key()
- */
-void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
-{
-       __wake_up_sync_key(q, mode, nr_exclusive, NULL);
-}
-EXPORT_SYMBOL_GPL(__wake_up_sync);     /* For internal use only */
-
-/**
- * complete: - signals a single thread waiting on this completion
- * @x:  holds the state of this particular completion
- *
- * This will wake up a single thread waiting on this completion. Threads will be
- * awakened in the same order in which they were queued.
- *
- * See also complete_all(), wait_for_completion() and related routines.
- *
- * It may be assumed that this function implies a write memory barrier before
- * changing the task state if and only if any tasks are woken up.
- */
-void complete(struct completion *x)
-{
-       unsigned long flags;
-
-       spin_lock_irqsave(&x->wait.lock, flags);
-       x->done++;
-       __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
-       spin_unlock_irqrestore(&x->wait.lock, flags);
-}
-EXPORT_SYMBOL(complete);
-
-/**
- * complete_all: - signals all threads waiting on this completion
- * @x:  holds the state of this particular completion
- *
- * This will wake up all threads waiting on this particular completion event.
- *
- * It may be assumed that this function implies a write memory barrier before
- * changing the task state if and only if any tasks are woken up.
- */
-void complete_all(struct completion *x)
-{
-       unsigned long flags;
-
-       spin_lock_irqsave(&x->wait.lock, flags);
-       x->done += UINT_MAX/2;
-       __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
-       spin_unlock_irqrestore(&x->wait.lock, flags);
-}
-EXPORT_SYMBOL(complete_all);
-
-static inline long __sched
-do_wait_for_common(struct completion *x, long timeout, int state)
-{
-       if (!x->done) {
-               DECLARE_WAITQUEUE(wait, current);
-
-               __add_wait_queue_tail_exclusive(&x->wait, &wait);
-               do {
-                       if (signal_pending_state(state, current)) {
-                               timeout = -ERESTARTSYS;
-                               break;
-                       }
-                       __set_current_state(state);
-                       spin_unlock_irq(&x->wait.lock);
-                       timeout = schedule_timeout(timeout);
-                       spin_lock_irq(&x->wait.lock);
-               } while (!x->done && timeout);
-               __remove_wait_queue(&x->wait, &wait);
-               if (!x->done)
-                       return timeout;
-       }
-       x->done--;
-       return timeout ?: 1;
-}
-
-static long __sched
-wait_for_common(struct completion *x, long timeout, int state)
-{
-       might_sleep();
-
-       spin_lock_irq(&x->wait.lock);
-       timeout = do_wait_for_common(x, timeout, state);
-       spin_unlock_irq(&x->wait.lock);
-       return timeout;
-}
-
-/**
- * wait_for_completion: - waits for completion of a task
- * @x:  holds the state of this particular completion
- *
- * This waits to be signaled for completion of a specific task. It is NOT
- * interruptible and there is no timeout.
- *
- * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
- * and interrupt capability. Also see complete().
- */
-void __sched wait_for_completion(struct completion *x)
-{
-       wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
-}
-EXPORT_SYMBOL(wait_for_completion);
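
As a quick orientation for the completion API collected above, here is a minimal kernel-side sketch of the usual pairing of complete() and wait_for_completion(); the worker function and completion name are hypothetical, only the calls themselves are the real API:

#include <linux/completion.h>
#include <linux/kthread.h>

static DECLARE_COMPLETION(setup_done);          /* hypothetical completion */

static int example_worker(void *unused)
{
        /* ... perform the one-off setup work ... */
        complete(&setup_done);                  /* wake exactly one waiter */
        return 0;
}

static void example_wait(void)
{
        kthread_run(example_worker, NULL, "example-worker");
        wait_for_completion(&setup_done);       /* sleep (uninterruptibly) until complete() */
}
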
-
-/**
- * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
- * @x:  holds the state of this particular completion
- * @timeout:  timeout value in jiffies
- *
- * This waits for either a completion of a specific task to be signaled or for a
- * specified timeout to expire. The timeout is in jiffies. It is not
- * interruptible.
- *
- * The return value is 0 if timed out, and positive (at least 1, or number of
- * jiffies left till timeout) if completed.
- */
-unsigned long __sched
-wait_for_completion_timeout(struct completion *x, unsigned long timeout)
-{
-       return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
-}
-EXPORT_SYMBOL(wait_for_completion_timeout);
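
The _timeout variants return the remaining jiffies (0 on timeout), so callers typically convert from milliseconds and test the result; a hedged sketch with a hypothetical completion and message:

#include <linux/completion.h>
#include <linux/errno.h>
#include <linux/jiffies.h>
#include <linux/printk.h>

static DECLARE_COMPLETION(io_done);             /* hypothetical completion */

static int example_wait_with_timeout(void)
{
        unsigned long left;

        left = wait_for_completion_timeout(&io_done, msecs_to_jiffies(500));
        if (!left) {
                pr_warn("device did not answer within 500 ms\n");
                return -ETIMEDOUT;
        }
        return 0;                               /* completed with 'left' jiffies to spare */
}
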
-
-/**
- * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
- * @x:  holds the state of this particular completion
- *
- * This waits for completion of a specific task to be signaled. It is
- * interruptible.
- *
- * The return value is -ERESTARTSYS if interrupted, 0 if completed.
- */
-int __sched wait_for_completion_interruptible(struct completion *x)
-{
-       long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
-       if (t == -ERESTARTSYS)
-               return t;
-       return 0;
-}
-EXPORT_SYMBOL(wait_for_completion_interruptible);
-
-/**
- * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
- * @x:  holds the state of this particular completion
- * @timeout:  timeout value in jiffies
- *
- * This waits for either a completion of a specific task to be signaled or for a
- * specified timeout to expire. It is interruptible. The timeout is in jiffies.
- *
- * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
- * positive (at least 1, or number of jiffies left till timeout) if completed.
- */
-long __sched
-wait_for_completion_interruptible_timeout(struct completion *x,
-                                         unsigned long timeout)
-{
-       return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
-}
-EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
-
-/**
- * wait_for_completion_killable: - waits for completion of a task (killable)
- * @x:  holds the state of this particular completion
- *
- * This waits to be signaled for completion of a specific task. It can be
- * interrupted by a kill signal.
- *
- * The return value is -ERESTARTSYS if interrupted, 0 if completed.
- */
-int __sched wait_for_completion_killable(struct completion *x)
-{
-       long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
-       if (t == -ERESTARTSYS)
-               return t;
-       return 0;
-}
-EXPORT_SYMBOL(wait_for_completion_killable);
-
-/**
- * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
- * @x:  holds the state of this particular completion
- * @timeout:  timeout value in jiffies
- *
- * This waits for either a completion of a specific task to be
- * signaled or for a specified timeout to expire. It can be
- * interrupted by a kill signal. The timeout is in jiffies.
- *
- * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
- * positive (at least 1, or number of jiffies left till timeout) if completed.
- */
-long __sched
-wait_for_completion_killable_timeout(struct completion *x,
-                                    unsigned long timeout)
-{
-       return wait_for_common(x, timeout, TASK_KILLABLE);
-}
-EXPORT_SYMBOL(wait_for_completion_killable_timeout);
-
-/**
- *     try_wait_for_completion - try to decrement a completion without blocking
- *     @x:     completion structure
- *
- *     Returns: 0 if a decrement cannot be done without blocking
- *              1 if a decrement succeeded.
- *
- *     If a completion is being used as a counting completion,
- *     attempt to decrement the counter without blocking. This
- *     enables us to avoid waiting if the resource the completion
- *     is protecting is not available.
- */
-bool try_wait_for_completion(struct completion *x)
-{
-       unsigned long flags;
-       int ret = 1;
-
-       spin_lock_irqsave(&x->wait.lock, flags);
-       if (!x->done)
-               ret = 0;
-       else
-               x->done--;
-       spin_unlock_irqrestore(&x->wait.lock, flags);
-       return ret;
-}
-EXPORT_SYMBOL(try_wait_for_completion);
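
Where a completion is used as a counting resource, try_wait_for_completion() gives a non-blocking fast path; a small hypothetical sketch:

#include <linux/completion.h>
#include <linux/errno.h>

/* Hypothetical non-blocking consumer: take one available unit or bail out. */
static int claim_unit_nonblock(struct completion *units)
{
        if (!try_wait_for_completion(units))
                return -EAGAIN;         /* nothing available, caller must not sleep */
        return 0;                       /* one 'done' count consumed */
}
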
-
-/**
- *     completion_done - Test to see if a completion has any waiters
- *     @x:     completion structure
- *
- *     Returns: 0 if there are waiters (wait_for_completion() in progress)
- *              1 if there are no waiters.
- *
- */
-bool completion_done(struct completion *x)
-{
-       unsigned long flags;
-       int ret = 1;
-
-       spin_lock_irqsave(&x->wait.lock, flags);
-       if (!x->done)
-               ret = 0;
-       spin_unlock_irqrestore(&x->wait.lock, flags);
-       return ret;
-}
-EXPORT_SYMBOL(completion_done);
-
-static long __sched
-sleep_on_common(wait_queue_head_t *q, int state, long timeout)
-{
-       unsigned long flags;
-       wait_queue_t wait;
-
-       init_waitqueue_entry(&wait, current);
-
-       __set_current_state(state);
-
-       spin_lock_irqsave(&q->lock, flags);
-       __add_wait_queue(q, &wait);
-       spin_unlock(&q->lock);
-       timeout = schedule_timeout(timeout);
-       spin_lock_irq(&q->lock);
-       __remove_wait_queue(q, &wait);
-       spin_unlock_irqrestore(&q->lock, flags);
-
-       return timeout;
-}
-
-void __sched interruptible_sleep_on(wait_queue_head_t *q)
-{
-       sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
-}
-EXPORT_SYMBOL(interruptible_sleep_on);
-
-long __sched
-interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
-{
-       return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
-}
-EXPORT_SYMBOL(interruptible_sleep_on_timeout);
-
-void __sched sleep_on(wait_queue_head_t *q)
-{
-       sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
-}
-EXPORT_SYMBOL(sleep_on);
-
-long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
-{
-       return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
-}
-EXPORT_SYMBOL(sleep_on_timeout);
-
-#ifdef CONFIG_RT_MUTEXES
-
-/*
- * rt_mutex_setprio - set the current priority of a task
- * @p: task
- * @prio: prio value (kernel-internal form)
- *
- * This function changes the 'effective' priority of a task. It does
- * not touch ->normal_prio like __setscheduler().
- *
- * Used by the rt_mutex code to implement priority inheritance logic.
- */
-void rt_mutex_setprio(struct task_struct *p, int prio)
-{
-       int oldprio, on_rq, running;
-       struct rq *rq;
-       const struct sched_class *prev_class;
-
-       BUG_ON(prio < 0 || prio > MAX_PRIO);
-
-       rq = __task_rq_lock(p);
-
-       trace_sched_pi_setprio(p, prio);
-       oldprio = p->prio;
-       prev_class = p->sched_class;
-       on_rq = p->on_rq;
-       running = task_current(rq, p);
-       if (on_rq)
-               dequeue_task(rq, p, 0);
-       if (running)
-               p->sched_class->put_prev_task(rq, p);
-
-       if (rt_prio(prio))
-               p->sched_class = &rt_sched_class;
-       else
-               p->sched_class = &fair_sched_class;
-
-       p->prio = prio;
-
-       if (running)
-               p->sched_class->set_curr_task(rq);
-       if (on_rq)
-               enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
-
-       check_class_changed(rq, p, prev_class, oldprio);
-       __task_rq_unlock(rq);
-}
-
-#endif
-
-void set_user_nice(struct task_struct *p, long nice)
-{
-       int old_prio, delta, on_rq;
-       unsigned long flags;
-       struct rq *rq;
-
-       if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
-               return;
-       /*
-        * We have to be careful, if called from sys_setpriority(),
-        * the task might be in the middle of scheduling on another CPU.
-        */
-       rq = task_rq_lock(p, &flags);
-       /*
-        * The RT priorities are set via sched_setscheduler(), but we still
-        * allow the 'normal' nice value to be set - but as expected
-        * it won't have any effect on scheduling as long as the task
-        * is SCHED_FIFO/SCHED_RR:
-        */
-       if (task_has_rt_policy(p)) {
-               p->static_prio = NICE_TO_PRIO(nice);
-               goto out_unlock;
-       }
-       on_rq = p->on_rq;
-       if (on_rq)
-               dequeue_task(rq, p, 0);
-
-       p->static_prio = NICE_TO_PRIO(nice);
-       set_load_weight(p);
-       old_prio = p->prio;
-       p->prio = effective_prio(p);
-       delta = p->prio - old_prio;
-
-       if (on_rq) {
-               enqueue_task(rq, p, 0);
-               /*
-                * If the task increased its priority or is running and
-                * lowered its priority, then reschedule its CPU:
-                */
-               if (delta < 0 || (delta > 0 && task_running(rq, p)))
-                       resched_task(rq->curr);
-       }
-out_unlock:
-       task_rq_unlock(rq, p, &flags);
-}
-EXPORT_SYMBOL(set_user_nice);
-
-/*
- * can_nice - check if a task can reduce its nice value
- * @p: task
- * @nice: nice value
- */
-int can_nice(const struct task_struct *p, const int nice)
-{
-       /* convert nice value [19,-20] to rlimit style value [1,40] */
-       int nice_rlim = 20 - nice;
-
-       return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
-               capable(CAP_SYS_NICE));
-}
-
-#ifdef __ARCH_WANT_SYS_NICE
-
-/*
- * sys_nice - change the priority of the current process.
- * @increment: priority increment
- *
- * sys_setpriority is a more generic, but much slower function that
- * does similar things.
- */
-SYSCALL_DEFINE1(nice, int, increment)
-{
-       long nice, retval;
-
-       /*
-        * Setpriority might change our priority at the same moment.
-        * We don't have to worry. Conceptually one call occurs first
-        * and we have a single winner.
-        */
-       if (increment < -40)
-               increment = -40;
-       if (increment > 40)
-               increment = 40;
-
-       nice = TASK_NICE(current) + increment;
-       if (nice < -20)
-               nice = -20;
-       if (nice > 19)
-               nice = 19;
-
-       if (increment < 0 && !can_nice(current, nice))
-               return -EPERM;
-
-       retval = security_task_setnice(current, nice);
-       if (retval)
-               return retval;
-
-       set_user_nice(current, nice);
-       return 0;
-}
-
-#endif
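
For reference, the userspace counterpart of the syscall above; nice(2) returns the new nice value, so errno must be cleared first to distinguish a legitimate -1 from an error (the file name in the build comment is illustrative):

/* cc -o be_nice be_nice.c */
#include <errno.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int prio;

        errno = 0;
        prio = nice(5);                 /* increment is clamped as in sys_nice() above */
        if (prio == -1 && errno)
                perror("nice");
        else
                printf("new nice value: %d\n", prio);
        return 0;
}
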
-
-/**
- * task_prio - return the priority value of a given task.
- * @p: the task in question.
- *
- * This is the priority value as seen by users in /proc.
- * RT tasks are offset by -200. Normal tasks are centered
- * around 0, value goes from -16 to +15.
- */
-int task_prio(const struct task_struct *p)
-{
-       return p->prio - MAX_RT_PRIO;
-}
-
-/**
- * task_nice - return the nice value of a given task.
- * @p: the task in question.
- */
-int task_nice(const struct task_struct *p)
-{
-       return TASK_NICE(p);
-}
-EXPORT_SYMBOL(task_nice);
-
-/**
- * idle_cpu - is a given cpu idle currently?
- * @cpu: the processor in question.
- */
-int idle_cpu(int cpu)
-{
-       struct rq *rq = cpu_rq(cpu);
-
-       if (rq->curr != rq->idle)
-               return 0;
-
-       if (rq->nr_running)
-               return 0;
-
-#ifdef CONFIG_SMP
-       if (!llist_empty(&rq->wake_list))
-               return 0;
-#endif
-
-       return 1;
-}
-
-/**
- * idle_task - return the idle task for a given cpu.
- * @cpu: the processor in question.
- */
-struct task_struct *idle_task(int cpu)
-{
-       return cpu_rq(cpu)->idle;
-}
-
-/**
- * find_process_by_pid - find a process with a matching PID value.
- * @pid: the pid in question.
- */
-static struct task_struct *find_process_by_pid(pid_t pid)
-{
-       return pid ? find_task_by_vpid(pid) : current;
-}
-
-/* Actually do priority change: must hold rq lock. */
-static void
-__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
-{
-       p->policy = policy;
-       p->rt_priority = prio;
-       p->normal_prio = normal_prio(p);
-       /* we are holding p->pi_lock already */
-       p->prio = rt_mutex_getprio(p);
-       if (rt_prio(p->prio))
-               p->sched_class = &rt_sched_class;
-       else
-               p->sched_class = &fair_sched_class;
-       set_load_weight(p);
-}
-
-/*
- * Check whether the target process has a UID that matches the current
- * process's.
- */
-static bool check_same_owner(struct task_struct *p)
-{
-       const struct cred *cred = current_cred(), *pcred;
-       bool match;
-
-       rcu_read_lock();
-       pcred = __task_cred(p);
-       if (cred->user->user_ns == pcred->user->user_ns)
-               match = (cred->euid == pcred->euid ||
-                        cred->euid == pcred->uid);
-       else
-               match = false;
-       rcu_read_unlock();
-       return match;
-}
-
-static int __sched_setscheduler(struct task_struct *p, int policy,
-                               const struct sched_param *param, bool user)
-{
-       int retval, oldprio, oldpolicy = -1, on_rq, running;
-       unsigned long flags;
-       const struct sched_class *prev_class;
-       struct rq *rq;
-       int reset_on_fork;
-
-       /* may grab non-irq protected spin_locks */
-       BUG_ON(in_interrupt());
-recheck:
-       /* double check policy once rq lock held */
-       if (policy < 0) {
-               reset_on_fork = p->sched_reset_on_fork;
-               policy = oldpolicy = p->policy;
-       } else {
-               reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
-               policy &= ~SCHED_RESET_ON_FORK;
-
-               if (policy != SCHED_FIFO && policy != SCHED_RR &&
-                               policy != SCHED_NORMAL && policy != SCHED_BATCH &&
-                               policy != SCHED_IDLE)
-                       return -EINVAL;
-       }
-
-       /*
-        * Valid priorities for SCHED_FIFO and SCHED_RR are
-        * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
-        * SCHED_BATCH and SCHED_IDLE is 0.
-        */
-       if (param->sched_priority < 0 ||
-           (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
-           (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
-               return -EINVAL;
-       if (rt_policy(policy) != (param->sched_priority != 0))
-               return -EINVAL;
-
-       /*
-        * Allow unprivileged RT tasks to decrease priority:
-        */
-       if (user && !capable(CAP_SYS_NICE)) {
-               if (rt_policy(policy)) {
-                       unsigned long rlim_rtprio =
-                                       task_rlimit(p, RLIMIT_RTPRIO);
-
-                       /* can't set/change the rt policy */
-                       if (policy != p->policy && !rlim_rtprio)
-                               return -EPERM;
-
-                       /* can't increase priority */
-                       if (param->sched_priority > p->rt_priority &&
-                           param->sched_priority > rlim_rtprio)
-                               return -EPERM;
-               }
-
-               /*
-                * Treat SCHED_IDLE as nice 20. Only allow a switch to
-                * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
-                */
-               if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
-                       if (!can_nice(p, TASK_NICE(p)))
-                               return -EPERM;
-               }
-
-               /* can't change other user's priorities */
-               if (!check_same_owner(p))
-                       return -EPERM;
-
-               /* Normal users shall not reset the sched_reset_on_fork flag */
-               if (p->sched_reset_on_fork && !reset_on_fork)
-                       return -EPERM;
-       }
-
-       if (user) {
-               retval = security_task_setscheduler(p);
-               if (retval)
-                       return retval;
-       }
-
-       /*
-        * make sure no PI-waiters arrive (or leave) while we are
-        * changing the priority of the task:
-        *
-        * To be able to change p->policy safely, the appropriate
-        * runqueue lock must be held.
-        */
-       rq = task_rq_lock(p, &flags);
-
-       /*
-        * Changing the policy of the stop threads is a very bad idea
-        */
-       if (p == rq->stop) {
-               task_rq_unlock(rq, p, &flags);
-               return -EINVAL;
-       }
-
-       /*
-        * If not changing anything there's no need to proceed further:
-        */
-       if (unlikely(policy == p->policy && (!rt_policy(policy) ||
-                       param->sched_priority == p->rt_priority))) {
-
-               __task_rq_unlock(rq);
-               raw_spin_unlock_irqrestore(&p->pi_lock, flags);
-               return 0;
-       }
-
-#ifdef CONFIG_RT_GROUP_SCHED
-       if (user) {
-               /*
-                * Do not allow realtime tasks into groups that have no runtime
-                * assigned.
-                */
-               if (rt_bandwidth_enabled() && rt_policy(policy) &&
-                               task_group(p)->rt_bandwidth.rt_runtime == 0 &&
-                               !task_group_is_autogroup(task_group(p))) {
-                       task_rq_unlock(rq, p, &flags);
-                       return -EPERM;
-               }
-       }
-#endif
-
-       /* recheck policy now with rq lock held */
-       if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
-               policy = oldpolicy = -1;
-               task_rq_unlock(rq, p, &flags);
-               goto recheck;
-       }
-       on_rq = p->on_rq;
-       running = task_current(rq, p);
-       if (on_rq)
-               deactivate_task(rq, p, 0);
-       if (running)
-               p->sched_class->put_prev_task(rq, p);
-
-       p->sched_reset_on_fork = reset_on_fork;
-
-       oldprio = p->prio;
-       prev_class = p->sched_class;
-       __setscheduler(rq, p, policy, param->sched_priority);
-
-       if (running)
-               p->sched_class->set_curr_task(rq);
-       if (on_rq)
-               activate_task(rq, p, 0);
-
-       check_class_changed(rq, p, prev_class, oldprio);
-       task_rq_unlock(rq, p, &flags);
-
-       rt_mutex_adjust_pi(p);
-
-       return 0;
-}
-
-/**
- * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
- * @p: the task in question.
- * @policy: new policy.
- * @param: structure containing the new RT priority.
- *
- * NOTE that the task may be already dead.
- */
-int sched_setscheduler(struct task_struct *p, int policy,
-                      const struct sched_param *param)
-{
-       return __sched_setscheduler(p, policy, param, true);
-}
-EXPORT_SYMBOL_GPL(sched_setscheduler);
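
A userspace sketch of driving this path through sched_setscheduler(2); it needs CAP_SYS_NICE or a sufficient RLIMIT_RTPRIO, matching what __sched_setscheduler() enforces for unprivileged callers:

#include <sched.h>
#include <stdio.h>

int main(void)
{
        struct sched_param sp = { .sched_priority = 10 };

        if (sched_setscheduler(0, SCHED_FIFO, &sp) == -1) {    /* pid 0 == calling thread */
                perror("sched_setscheduler");
                return 1;
        }
        printf("policy is now %d (SCHED_FIFO)\n", sched_getscheduler(0));
        return 0;
}
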
-
-/**
- * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
- * @p: the task in question.
- * @policy: new policy.
- * @param: structure containing the new RT priority.
- *
- * Just like sched_setscheduler, only don't bother checking if the
- * current context has permission.  For example, this is needed in
- * stop_machine(): we create temporary high priority worker threads,
- * but our caller might not have that capability.
- */
-int sched_setscheduler_nocheck(struct task_struct *p, int policy,
-                              const struct sched_param *param)
-{
-       return __sched_setscheduler(p, policy, param, false);
-}
-
-static int
-do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
-{
-       struct sched_param lparam;
-       struct task_struct *p;
-       int retval;
-
-       if (!param || pid < 0)
-               return -EINVAL;
-       if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
-               return -EFAULT;
-
-       rcu_read_lock();
-       retval = -ESRCH;
-       p = find_process_by_pid(pid);
-       if (p != NULL)
-               retval = sched_setscheduler(p, policy, &lparam);
-       rcu_read_unlock();
-
-       return retval;
-}
-
-/**
- * sys_sched_setscheduler - set/change the scheduler policy and RT priority
- * @pid: the pid in question.
- * @policy: new policy.
- * @param: structure containing the new RT priority.
- */
-SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
-               struct sched_param __user *, param)
-{
-       /* negative values for policy are not valid */
-       if (policy < 0)
-               return -EINVAL;
-
-       return do_sched_setscheduler(pid, policy, param);
-}
-
-/**
- * sys_sched_setparam - set/change the RT priority of a thread
- * @pid: the pid in question.
- * @param: structure containing the new RT priority.
- */
-SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
-{
-       return do_sched_setscheduler(pid, -1, param);
-}
-
-/**
- * sys_sched_getscheduler - get the policy (scheduling class) of a thread
- * @pid: the pid in question.
- */
-SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
-{
-       struct task_struct *p;
-       int retval;
-
-       if (pid < 0)
-               return -EINVAL;
-
-       retval = -ESRCH;
-       rcu_read_lock();
-       p = find_process_by_pid(pid);
-       if (p) {
-               retval = security_task_getscheduler(p);
-               if (!retval)
-                       retval = p->policy
-                               | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
-       }
-       rcu_read_unlock();
-       return retval;
-}
-
-/**
- * sys_sched_getparam - get the RT priority of a thread
- * @pid: the pid in question.
- * @param: structure containing the RT priority.
- */
-SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
-{
-       struct sched_param lp;
-       struct task_struct *p;
-       int retval;
-
-       if (!param || pid < 0)
-               return -EINVAL;
-
-       rcu_read_lock();
-       p = find_process_by_pid(pid);
-       retval = -ESRCH;
-       if (!p)
-               goto out_unlock;
-
-       retval = security_task_getscheduler(p);
-       if (retval)
-               goto out_unlock;
-
-       lp.sched_priority = p->rt_priority;
-       rcu_read_unlock();
-
-       /*
-        * This one might sleep, we cannot do it with a spinlock held ...
-        */
-       retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
-
-       return retval;
-
-out_unlock:
-       rcu_read_unlock();
-       return retval;
-}
-
-long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
-{
-       cpumask_var_t cpus_allowed, new_mask;
-       struct task_struct *p;
-       int retval;
-
-       get_online_cpus();
-       rcu_read_lock();
-
-       p = find_process_by_pid(pid);
-       if (!p) {
-               rcu_read_unlock();
-               put_online_cpus();
-               return -ESRCH;
-       }
-
-       /* Prevent p going away */
-       get_task_struct(p);
-       rcu_read_unlock();
-
-       if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
-               retval = -ENOMEM;
-               goto out_put_task;
-       }
-       if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
-               retval = -ENOMEM;
-               goto out_free_cpus_allowed;
-       }
-       retval = -EPERM;
-       if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE))
-               goto out_unlock;
-
-       retval = security_task_setscheduler(p);
-       if (retval)
-               goto out_unlock;
-
-       cpuset_cpus_allowed(p, cpus_allowed);
-       cpumask_and(new_mask, in_mask, cpus_allowed);
-again:
-       retval = set_cpus_allowed_ptr(p, new_mask);
-
-       if (!retval) {
-               cpuset_cpus_allowed(p, cpus_allowed);
-               if (!cpumask_subset(new_mask, cpus_allowed)) {
-                       /*
-                        * We must have raced with a concurrent cpuset
-                        * update. Just reset the cpus_allowed to the
-                        * cpuset's cpus_allowed
-                        */
-                       cpumask_copy(new_mask, cpus_allowed);
-                       goto again;
-               }
-       }
-out_unlock:
-       free_cpumask_var(new_mask);
-out_free_cpus_allowed:
-       free_cpumask_var(cpus_allowed);
-out_put_task:
-       put_task_struct(p);
-       put_online_cpus();
-       return retval;
-}
-
-static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
-                            struct cpumask *new_mask)
-{
-       if (len < cpumask_size())
-               cpumask_clear(new_mask);
-       else if (len > cpumask_size())
-               len = cpumask_size();
-
-       return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
-}
-
-/**
- * sys_sched_setaffinity - set the cpu affinity of a process
- * @pid: pid of the process
- * @len: length in bytes of the bitmask pointed to by user_mask_ptr
- * @user_mask_ptr: user-space pointer to the new cpu mask
- */
-SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
-               unsigned long __user *, user_mask_ptr)
-{
-       cpumask_var_t new_mask;
-       int retval;
-
-       if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
-               return -ENOMEM;
-
-       retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
-       if (retval == 0)
-               retval = sched_setaffinity(pid, new_mask);
-       free_cpumask_var(new_mask);
-       return retval;
-}
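
The matching userspace call goes through the glibc wrapper of sched_setaffinity(2); a minimal example that pins the calling thread to CPU 0 (the CPU number is chosen arbitrarily):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
        cpu_set_t set;

        CPU_ZERO(&set);
        CPU_SET(0, &set);                       /* allow CPU 0 only */

        if (sched_setaffinity(0, sizeof(set), &set) == -1) {
                perror("sched_setaffinity");
                return 1;
        }
        printf("pinned to CPU 0\n");
        return 0;
}
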
-
-long sched_getaffinity(pid_t pid, struct cpumask *mask)
-{
-       struct task_struct *p;
-       unsigned long flags;
-       int retval;
-
-       get_online_cpus();
-       rcu_read_lock();
-
-       retval = -ESRCH;
-       p = find_process_by_pid(pid);
-       if (!p)
-               goto out_unlock;
-
-       retval = security_task_getscheduler(p);
-       if (retval)
-               goto out_unlock;
-
-       raw_spin_lock_irqsave(&p->pi_lock, flags);
-       cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
-       raw_spin_unlock_irqrestore(&p->pi_lock, flags);
-
-out_unlock:
-       rcu_read_unlock();
-       put_online_cpus();
-
-       return retval;
-}
-
-/**
- * sys_sched_getaffinity - get the cpu affinity of a process
- * @pid: pid of the process
- * @len: length in bytes of the bitmask pointed to by user_mask_ptr
- * @user_mask_ptr: user-space pointer to hold the current cpu mask
- */
-SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
-               unsigned long __user *, user_mask_ptr)
-{
-       int ret;
-       cpumask_var_t mask;
-
-       if ((len * BITS_PER_BYTE) < nr_cpu_ids)
-               return -EINVAL;
-       if (len & (sizeof(unsigned long)-1))
-               return -EINVAL;
-
-       if (!alloc_cpumask_var(&mask, GFP_KERNEL))
-               return -ENOMEM;
-
-       ret = sched_getaffinity(pid, mask);
-       if (ret == 0) {
-               size_t retlen = min_t(size_t, len, cpumask_size());
-
-               if (copy_to_user(user_mask_ptr, mask, retlen))
-                       ret = -EFAULT;
-               else
-                       ret = retlen;
-       }
-       free_cpumask_var(mask);
-
-       return ret;
-}
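
Note that the raw syscall above returns the number of mask bytes copied (retlen), while the glibc wrapper maps success to 0; a small example reading the mask back:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
        cpu_set_t set;

        if (sched_getaffinity(0, sizeof(set), &set) == -1) {
                perror("sched_getaffinity");
                return 1;
        }
        printf("allowed to run on %d CPU(s)\n", CPU_COUNT(&set));
        return 0;
}
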
-
-/**
- * sys_sched_yield - yield the current processor to other threads.
- *
- * This function yields the current CPU to other tasks. If there are no
- * other threads running on this CPU then this function will return.
- */
-SYSCALL_DEFINE0(sched_yield)
-{
-       struct rq *rq = this_rq_lock();
-
-       schedstat_inc(rq, yld_count);
-       current->sched_class->yield_task(rq);
-
-       /*
-        * Since we are going to call schedule() anyway, there's
-        * no need to preempt or enable interrupts:
-        */
-       __release(rq->lock);
-       spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
-       do_raw_spin_unlock(&rq->lock);
-       preempt_enable_no_resched();
-
-       schedule();
-
-       return 0;
-}
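
From userspace this is reached via sched_yield(2); a hedged sketch of the classic "spin politely" pattern (the flag and helper are hypothetical):

#include <sched.h>
#include <stdatomic.h>

/* Hypothetical wait loop: give up the CPU instead of burning it while
 * another thread sets *flag. */
static void wait_until_set(atomic_int *flag)
{
        while (!atomic_load(flag))
                sched_yield();          /* lands in sys_sched_yield() above */
}
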
-
-static inline int should_resched(void)
-{
-       return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
-}
-
-static void __cond_resched(void)
-{
-       add_preempt_count(PREEMPT_ACTIVE);
-       __schedule();
-       sub_preempt_count(PREEMPT_ACTIVE);
-}
-
-int __sched _cond_resched(void)
-{
-       if (should_resched()) {
-               __cond_resched();
-               return 1;
-       }
-       return 0;
-}
-EXPORT_SYMBOL(_cond_resched);
-
-/*
- * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
- * call schedule, and on return reacquire the lock.
- *
- * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
- * operations here to prevent schedule() from being called twice (once via
- * spin_unlock(), once by hand).
- */
-int __cond_resched_lock(spinlock_t *lock)
-{
-       int resched = should_resched();
-       int ret = 0;
-
-       lockdep_assert_held(lock);
-
-       if (spin_needbreak(lock) || resched) {
-               spin_unlock(lock);
-               if (resched)
-                       __cond_resched();
-               else
-                       cpu_relax();
-               ret = 1;
-               spin_lock(lock);
-       }
-       return ret;
-}
-EXPORT_SYMBOL(__cond_resched_lock);
-
-int __sched __cond_resched_softirq(void)
-{
-       BUG_ON(!in_softirq());
-
-       if (should_resched()) {
-               local_bh_enable();
-               __cond_resched();
-               local_bh_disable();
-               return 1;
-       }
-       return 0;
-}
-EXPORT_SYMBOL(__cond_resched_softirq);
-
-/**
- * yield - yield the current processor to other threads.
- *
- * This is a shortcut for kernel-space yielding - it marks the
- * thread runnable and calls sys_sched_yield().
- */
-void __sched yield(void)
-{
-       set_current_state(TASK_RUNNING);
-       sys_sched_yield();
-}
-EXPORT_SYMBOL(yield);
-
-/**
- * yield_to - yield the current processor to another thread in
- * your thread group, or accelerate that thread toward the
- * processor it's on.
- * @p: target task
- * @preempt: whether task preemption is allowed or not
- *
- * It's the caller's job to ensure that the target task struct
- * can't go away on us before we can do any checks.
- *
- * Returns true if we indeed boosted the target task.
- */
-bool __sched yield_to(struct task_struct *p, bool preempt)
-{
-       struct task_struct *curr = current;
-       struct rq *rq, *p_rq;
-       unsigned long flags;
-       bool yielded = 0;
-
-       local_irq_save(flags);
-       rq = this_rq();
-
-again:
-       p_rq = task_rq(p);
-       double_rq_lock(rq, p_rq);
-       while (task_rq(p) != p_rq) {
-               double_rq_unlock(rq, p_rq);
-               goto again;
-       }
-
-       if (!curr->sched_class->yield_to_task)
-               goto out;
-
-       if (curr->sched_class != p->sched_class)
-               goto out;
-
-       if (task_running(p_rq, p) || p->state)
-               goto out;
-
-       yielded = curr->sched_class->yield_to_task(rq, p, preempt);
-       if (yielded) {
-               schedstat_inc(rq, yld_count);
-               /*
-                * Make p's CPU reschedule; pick_next_entity takes care of
-                * fairness.
-                */
-               if (preempt && rq != p_rq)
-                       resched_task(p_rq->curr);
-       }
-
-out:
-       double_rq_unlock(rq, p_rq);
-       local_irq_restore(flags);
-
-       if (yielded)
-               schedule();
-
-       return yielded;
-}
-EXPORT_SYMBOL_GPL(yield_to);
-
-/*
- * This task is about to go to sleep on IO. Increment rq->nr_iowait so
- * that process accounting knows that this is a task in IO wait state.
- */
-void __sched io_schedule(void)
-{
-       struct rq *rq = raw_rq();
-
-       delayacct_blkio_start();
-       atomic_inc(&rq->nr_iowait);
-       blk_flush_plug(current);
-       current->in_iowait = 1;
-       schedule();
-       current->in_iowait = 0;
-       atomic_dec(&rq->nr_iowait);
-       delayacct_blkio_end();
-}
-EXPORT_SYMBOL(io_schedule);
-
-long __sched io_schedule_timeout(long timeout)
-{
-       struct rq *rq = raw_rq();
-       long ret;
-
-       delayacct_blkio_start();
-       atomic_inc(&rq->nr_iowait);
-       blk_flush_plug(current);
-       current->in_iowait = 1;
-       ret = schedule_timeout(timeout);
-       current->in_iowait = 0;
-       atomic_dec(&rq->nr_iowait);
-       delayacct_blkio_end();
-       return ret;
-}
-
-/**
- * sys_sched_get_priority_max - return maximum RT priority.
- * @policy: scheduling class.
- *
- * this syscall returns the maximum rt_priority that can be used
- * by a given scheduling class.
- */
-SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
-{
-       int ret = -EINVAL;
-
-       switch (policy) {
-       case SCHED_FIFO:
-       case SCHED_RR:
-               ret = MAX_USER_RT_PRIO-1;
-               break;
-       case SCHED_NORMAL:
-       case SCHED_BATCH:
-       case SCHED_IDLE:
-               ret = 0;
-               break;
-       }
-       return ret;
-}
-
-/**
- * sys_sched_get_priority_min - return minimum RT priority.
- * @policy: scheduling class.
- *
- * this syscall returns the minimum rt_priority that can be used
- * by a given scheduling class.
- */
-SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
-{
-       int ret = -EINVAL;
-
-       switch (policy) {
-       case SCHED_FIFO:
-       case SCHED_RR:
-               ret = 1;
-               break;
-       case SCHED_NORMAL:
-       case SCHED_BATCH:
-       case SCHED_IDLE:
-               ret = 0;
-       }
-       return ret;
-}
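
From userspace the two syscalls above are typically used together to discover the valid range before calling sched_setscheduler(); on Linux this prints 1..99 for the realtime classes and 0..0 for the others:

#include <sched.h>
#include <stdio.h>

int main(void)
{
        printf("SCHED_FIFO : %d..%d\n",
               sched_get_priority_min(SCHED_FIFO),
               sched_get_priority_max(SCHED_FIFO));
        printf("SCHED_OTHER: %d..%d\n",
               sched_get_priority_min(SCHED_OTHER),
               sched_get_priority_max(SCHED_OTHER));
        return 0;
}
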
-
-/**
- * sys_sched_rr_get_interval - return the default timeslice of a process.
- * @pid: pid of the process.
- * @interval: userspace pointer to the timeslice value.
- *
- * this syscall writes the default timeslice value of a given process
- * into the user-space timespec buffer. A value of '0' means infinity.
- */
-SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
-               struct timespec __user *, interval)
-{
-       struct task_struct *p;
-       unsigned int time_slice;
-       unsigned long flags;
-       struct rq *rq;
-       int retval;
-       struct timespec t;
-
-       if (pid < 0)
-               return -EINVAL;
-
-       retval = -ESRCH;
-       rcu_read_lock();
-       p = find_process_by_pid(pid);
-       if (!p)
-               goto out_unlock;
-
-       retval = security_task_getscheduler(p);
-       if (retval)
-               goto out_unlock;
-
-       rq = task_rq_lock(p, &flags);
-       time_slice = p->sched_class->get_rr_interval(rq, p);
-       task_rq_unlock(rq, p, &flags);
-
-       rcu_read_unlock();
-       jiffies_to_timespec(time_slice, &t);
-       retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
-       return retval;
-
-out_unlock:
-       rcu_read_unlock();
-       return retval;
-}
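
A userspace example of reading the timeslice through sched_rr_get_interval(2); the kernel-doc above notes that 0 means an infinite timeslice:

#include <sched.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
        struct timespec ts;

        if (sched_rr_get_interval(0, &ts) == -1) {
                perror("sched_rr_get_interval");
                return 1;
        }
        printf("timeslice: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
        return 0;
}
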
-
-static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
-
-void sched_show_task(struct task_struct *p)
-{
-       unsigned long free = 0;
-       unsigned state;
-
-       state = p->state ? __ffs(p->state) + 1 : 0;
-       printk(KERN_INFO "%-15.15s %c", p->comm,
-               state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
-#if BITS_PER_LONG == 32
-       if (state == TASK_RUNNING)
-               printk(KERN_CONT " running  ");
-       else
-               printk(KERN_CONT " %08lx ", thread_saved_pc(p));
-#else
-       if (state == TASK_RUNNING)
-               printk(KERN_CONT "  running task    ");
-       else
-               printk(KERN_CONT " %016lx ", thread_saved_pc(p));
-#endif
-#ifdef CONFIG_DEBUG_STACK_USAGE
-       free = stack_not_used(p);
-#endif
-       printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
-               task_pid_nr(p), task_pid_nr(p->real_parent),
-               (unsigned long)task_thread_info(p)->flags);
-
-       show_stack(p, NULL);
-}
-
-void show_state_filter(unsigned long state_filter)
-{
-       struct task_struct *g, *p;
-
-#if BITS_PER_LONG == 32
-       printk(KERN_INFO
-               "  task                PC stack   pid father\n");
-#else
-       printk(KERN_INFO
-               "  task                        PC stack   pid father\n");
-#endif
-       rcu_read_lock();
-       do_each_thread(g, p) {
-               /*
-                * reset the NMI-timeout; listing all tasks on a slow
-                * console might take a lot of time:
-                */
-               touch_nmi_watchdog();
-               if (!state_filter || (p->state & state_filter))
-                       sched_show_task(p);
-       } while_each_thread(g, p);
-
-       touch_all_softlockup_watchdogs();
-
-#ifdef CONFIG_SCHED_DEBUG
-       sysrq_sched_debug_show();
-#endif
-       rcu_read_unlock();
-       /*
-        * Only show locks if all tasks are dumped:
-        */
-       if (!state_filter)
-               debug_show_all_locks();
-}
-
-void __cpuinit init_idle_bootup_task(struct task_struct *idle)
-{
-       idle->sched_class = &idle_sched_class;
-}
-
-/**
- * init_idle - set up an idle thread for a given CPU
- * @idle: task in question
- * @cpu: cpu the idle task belongs to
- *
- * NOTE: this function does not set the idle thread's NEED_RESCHED
- * flag, to make booting more robust.
- */
-void __cpuinit init_idle(struct task_struct *idle, int cpu)
-{
-       struct rq *rq = cpu_rq(cpu);
-       unsigned long flags;
-
-       raw_spin_lock_irqsave(&rq->lock, flags);
-
-       __sched_fork(idle);
-       idle->state = TASK_RUNNING;
-       idle->se.exec_start = sched_clock();
-
-       do_set_cpus_allowed(idle, cpumask_of(cpu));
-       /*
-        * We have a chicken-and-egg problem: even though we are
-        * holding rq->lock, the cpu isn't yet set to this cpu, so the
-        * lockdep check in task_group() will fail.
-        *
-        * Similar case to sched_fork(). / Alternatively we could
-        * use task_rq_lock() here and obtain the other rq->lock.
-        *
-        * Silence PROVE_RCU
-        */
-       rcu_read_lock();
-       __set_task_cpu(idle, cpu);
-       rcu_read_unlock();
-
-       rq->curr = rq->idle = idle;
-#if defined(CONFIG_SMP)
-       idle->on_cpu = 1;
-#endif
-       raw_spin_unlock_irqrestore(&rq->lock, flags);
-
-       /* Set the preempt count _outside_ the spinlocks! */
-       task_thread_info(idle)->preempt_count = 0;
-
-       /*
-        * The idle tasks have their own, simple scheduling class:
-        */
-       idle->sched_class = &idle_sched_class;
-       ftrace_graph_init_idle_task(idle, cpu);
-#if defined(CONFIG_SMP)
-       sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
-#endif
-}
-
-/*
- * Increase the granularity value when there are more CPUs,
- * because with more CPUs the 'effective latency' as visible
- * to users decreases. But the relationship is not linear,
- * so pick a second-best guess by going with the log2 of the
- * number of CPUs.
- *
- * This idea comes from the SD scheduler of Con Kolivas:
- */
-static int get_update_sysctl_factor(void)
-{
-       unsigned int cpus = min_t(int, num_online_cpus(), 8);
-       unsigned int factor;
-
-       switch (sysctl_sched_tunable_scaling) {
-       case SCHED_TUNABLESCALING_NONE:
-               factor = 1;
-               break;
-       case SCHED_TUNABLESCALING_LINEAR:
-               factor = cpus;
-               break;
-       case SCHED_TUNABLESCALING_LOG:
-       default:
-               factor = 1 + ilog2(cpus);
-               break;
-       }
-
-       return factor;
-}
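
To make the log2 scaling above concrete: with the default SCHED_TUNABLESCALING_LOG policy the online CPU count is clamped to 8, so the factor is 1 for one CPU, 2 for two, 3 for four and 4 for eight or more. A tiny standalone reproduction (the ilog2 stand-in is illustrative, not the kernel helper):

#include <stdio.h>

static unsigned int ilog2_u(unsigned int v)     /* minimal ilog2() stand-in */
{
        unsigned int r = 0;

        while (v >>= 1)
                r++;
        return r;
}

int main(void)
{
        for (unsigned int online = 1; online <= 16; online *= 2) {
                unsigned int cpus = online < 8 ? online : 8;    /* min_t(int, ..., 8) */

                printf("%2u CPUs -> factor %u\n", online, 1 + ilog2_u(cpus));
        }
        return 0;
}
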
-
-static void update_sysctl(void)
-{
-       unsigned int factor = get_update_sysctl_factor();
-
-#define SET_SYSCTL(name) \
-       (sysctl_##name = (factor) * normalized_sysctl_##name)
-       SET_SYSCTL(sched_min_granularity);
-       SET_SYSCTL(sched_latency);
-       SET_SYSCTL(sched_wakeup_granularity);
-#undef SET_SYSCTL
-}
-
-static inline void sched_init_granularity(void)
-{
-       update_sysctl();
-}
-
-#ifdef CONFIG_SMP
-void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
-{
-       if (p->sched_class && p->sched_class->set_cpus_allowed)
-               p->sched_class->set_cpus_allowed(p, new_mask);
-
-       cpumask_copy(&p->cpus_allowed, new_mask);
-       p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
-}
-
-/*
- * This is how migration works:
- *
- * 1) we invoke migration_cpu_stop() on the target CPU using
- *    stop_one_cpu().
- * 2) stopper starts to run (implicitly forcing the migrated thread
- *    off the CPU)
- * 3) it checks whether the migrated task is still in the wrong runqueue.
- * 4) if it's in the wrong runqueue then the migration thread removes
- *    it and puts it into the right queue.
- * 5) stopper completes and stop_one_cpu() returns and the migration
- *    is done.
- */
-
-/*
- * Change a given task's CPU affinity. Migrate the thread to a
- * proper CPU and schedule it away if the CPU it's executing on
- * is removed from the allowed bitmask.
- *
- * NOTE: the caller must have a valid reference to the task, the
- * task must not exit() & deallocate itself prematurely. The
- * call is not atomic; no spinlocks may be held.
- */
-int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
-{
-       unsigned long flags;
-       struct rq *rq;
-       unsigned int dest_cpu;
-       int ret = 0;
-
-       rq = task_rq_lock(p, &flags);
-
-       if (cpumask_equal(&p->cpus_allowed, new_mask))
-               goto out;
-
-       if (!cpumask_intersects(new_mask, cpu_active_mask)) {
-               ret = -EINVAL;
-               goto out;
-       }
-
-       if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
-               ret = -EINVAL;
-               goto out;
-       }
-
-       do_set_cpus_allowed(p, new_mask);
-
-       /* Can the task run on the task's current CPU? If so, we're done */
-       if (cpumask_test_cpu(task_cpu(p), new_mask))
-               goto out;
-
-       dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
-       if (p->on_rq) {
-               struct migration_arg arg = { p, dest_cpu };
-               /* Need help from migration thread: drop lock and wait. */
-               task_rq_unlock(rq, p, &flags);
-               stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
-               tlb_migrate_finish(p->mm);
-               return 0;
-       }
-out:
-       task_rq_unlock(rq, p, &flags);
-
-       return ret;
-}
-EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
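
A hedged kernel-side sketch of a typical caller: create a kthread, restrict it to one CPU with set_cpus_allowed_ptr(), then wake it (the helper name and minimal error handling are illustrative):

#include <linux/cpumask.h>
#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/sched.h>

/* Illustrative helper: start a worker kthread allowed to run on one CPU only. */
static struct task_struct *start_pinned_worker(int (*fn)(void *), int cpu)
{
        struct task_struct *tsk = kthread_create(fn, NULL, "pinned/%d", cpu);

        if (!IS_ERR(tsk)) {
                set_cpus_allowed_ptr(tsk, cpumask_of(cpu));
                wake_up_process(tsk);
        }
        return tsk;
}
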
-
-/*
- * Move a task (other than the current task) off this cpu, onto the dest
- * cpu. We do this either because it can no longer run here
- * (set_cpus_allowed() moved it away from this CPU, or the CPU is going
- * down), or because we're attempting to rebalance this task on exec
- * (sched_exec).
- *
- * So we race with normal scheduler movements, but that's OK, as long
- * as the task is no longer on this CPU.
- *
- * Returns non-zero if task was successfully migrated.
- */
-static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
-{
-       struct rq *rq_dest, *rq_src;
-       int ret = 0;
-
-       if (unlikely(!cpu_active(dest_cpu)))
-               return ret;
-
-       rq_src = cpu_rq(src_cpu);
-       rq_dest = cpu_rq(dest_cpu);
-
-       raw_spin_lock(&p->pi_lock);
-       double_rq_lock(rq_src, rq_dest);
-       /* Already moved. */
-       if (task_cpu(p) != src_cpu)
-               goto done;
-       /* Affinity changed (again). */
-       if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
-               goto fail;
-
-       /*
-        * If we're not on a rq, the next wake-up will ensure we're
-        * placed properly.
-        */
-       if (p->on_rq) {
-               deactivate_task(rq_src, p, 0);
-               set_task_cpu(p, dest_cpu);
-               activate_task(rq_dest, p, 0);
-               check_preempt_curr(rq_dest, p, 0);
-       }
-done:
-       ret = 1;
-fail:
-       double_rq_unlock(rq_src, rq_dest);
-       raw_spin_unlock(&p->pi_lock);
-       return ret;
-}
-
-/*
- * migration_cpu_stop - this is executed by a high-priority stopper thread
- * and performs the migration by bumping the thread off its CPU and then
- * 'pushing' it onto another runqueue.
- */
-static int migration_cpu_stop(void *data)
-{
-       struct migration_arg *arg = data;
-
-       /*
-        * The original target cpu might have gone down and we might
-        * be on another cpu but it doesn't matter.
-        */
-       local_irq_disable();
-       __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
-       local_irq_enable();
-       return 0;
-}
-
-#ifdef CONFIG_HOTPLUG_CPU
-
-/*
- * Ensures that the idle task is using init_mm right before its cpu goes
- * offline.
- */
-void idle_task_exit(void)
-{
-       struct mm_struct *mm = current->active_mm;
-
-       BUG_ON(cpu_online(smp_processor_id()));
-
-       if (mm != &init_mm)
-               switch_mm(mm, &init_mm, current);
-       mmdrop(mm);
-}
-
-/*
- * While a dead CPU has no uninterruptible tasks queued at this point,
- * it might still have a nonzero ->nr_uninterruptible counter, because
- * for performance reasons the counter is not strictly tracking tasks to
- * their home CPUs. So we just add the counter to another CPU's counter,
- * to keep the global sum constant after CPU-down:
- */
-static void migrate_nr_uninterruptible(struct rq *rq_src)
-{
-       struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
-
-       rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
-       rq_src->nr_uninterruptible = 0;
-}
-
-/*
- * remove the tasks which were accounted by rq from calc_load_tasks.
- */
-static void calc_global_load_remove(struct rq *rq)
-{
-       atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
-       rq->calc_load_active = 0;
-}
-
-#ifdef CONFIG_CFS_BANDWIDTH
-static void unthrottle_offline_cfs_rqs(struct rq *rq)
-{
-       struct cfs_rq *cfs_rq;
-
-       for_each_leaf_cfs_rq(rq, cfs_rq) {
-               struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
-
-               if (!cfs_rq->runtime_enabled)
-                       continue;
-
-               /*
-                * clock_task is not advancing so we just need to make sure
-                * there's some valid quota amount
-                */
-               cfs_rq->runtime_remaining = cfs_b->quota;
-               if (cfs_rq_throttled(cfs_rq))
-                       unthrottle_cfs_rq(cfs_rq);
-       }
-}
-#else
-static void unthrottle_offline_cfs_rqs(struct rq *rq) {}
-#endif
-
-/*
- * Migrate all tasks from the rq, sleeping tasks will be migrated by
- * try_to_wake_up()->select_task_rq().
- *
- * Called with rq->lock held even though we're in stop_machine() and
- * there's no concurrency possible, we hold the required locks anyway
- * because of lock validation efforts.
- */
-static void migrate_tasks(unsigned int dead_cpu)
-{
-       struct rq *rq = cpu_rq(dead_cpu);
-       struct task_struct *next, *stop = rq->stop;
-       int dest_cpu;
-
-       /*
-        * Fudge the rq selection such that the below task selection loop
-        * doesn't get stuck on the currently eligible stop task.
-        *
-        * We're currently inside stop_machine() and the rq is either stuck
-        * in the stop_machine_cpu_stop() loop, or we're executing this code,
-        * either way we should never end up calling schedule() until we're
-        * done here.
-        */
-       rq->stop = NULL;
-
-       /* Ensure any throttled groups are reachable by pick_next_task */
-       unthrottle_offline_cfs_rqs(rq);
-
-       for ( ; ; ) {
-               /*
-                * There's this thread running, bail when that's the only
-                * remaining thread.
-                */
-               if (rq->nr_running == 1)
-                       break;
-
-               next = pick_next_task(rq);
-               BUG_ON(!next);
-               next->sched_class->put_prev_task(rq, next);
-
-               /* Find suitable destination for @next, with force if needed. */
-               dest_cpu = select_fallback_rq(dead_cpu, next);
-               raw_spin_unlock(&rq->lock);
-
-               __migrate_task(next, dead_cpu, dest_cpu);
-
-               raw_spin_lock(&rq->lock);
-       }
-
-       rq->stop = stop;
-}
-
-#endif /* CONFIG_HOTPLUG_CPU */
-
-#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
-
-static struct ctl_table sd_ctl_dir[] = {
-       {
-               .procname       = "sched_domain",
-               .mode           = 0555,
-       },
-       {}
-};
-
-static struct ctl_table sd_ctl_root[] = {
-       {
-               .procname       = "kernel",
-               .mode           = 0555,
-               .child          = sd_ctl_dir,
-       },
-       {}
-};
-
-static struct ctl_table *sd_alloc_ctl_entry(int n)
-{
-       struct ctl_table *entry =
-               kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
-
-       return entry;
-}
-
-static void sd_free_ctl_entry(struct ctl_table **tablep)
-{
-       struct ctl_table *entry;
-
-       /*
-        * In the intermediate directories, both the child directory and
-        * procname are dynamically allocated and could fail but the mode
-        * will always be set. In the lowest directory the names are
-        * static strings and all have proc handlers.
-        */
-       for (entry = *tablep; entry->mode; entry++) {
-               if (entry->child)
-                       sd_free_ctl_entry(&entry->child);
-               if (entry->proc_handler == NULL)
-                       kfree(entry->procname);
-       }
-
-       kfree(*tablep);
-       *tablep = NULL;
-}
-
-static void
-set_table_entry(struct ctl_table *entry,
-               const char *procname, void *data, int maxlen,
-               mode_t mode, proc_handler *proc_handler)
-{
-       entry->procname = procname;
-       entry->data = data;
-       entry->maxlen = maxlen;
-       entry->mode = mode;
-       entry->proc_handler = proc_handler;
-}
-
-static struct ctl_table *
-sd_alloc_ctl_domain_table(struct sched_domain *sd)
-{
-       struct ctl_table *table = sd_alloc_ctl_entry(13);
-
-       if (table == NULL)
-               return NULL;
-
-       set_table_entry(&table[0], "min_interval", &sd->min_interval,
-               sizeof(long), 0644, proc_doulongvec_minmax);
-       set_table_entry(&table[1], "max_interval", &sd->max_interval,
-               sizeof(long), 0644, proc_doulongvec_minmax);
-       set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
-               sizeof(int), 0644, proc_dointvec_minmax);
-       set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
-               sizeof(int), 0644, proc_dointvec_minmax);
-       set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
-               sizeof(int), 0644, proc_dointvec_minmax);
-       set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
-               sizeof(int), 0644, proc_dointvec_minmax);
-       set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
-               sizeof(int), 0644, proc_dointvec_minmax);
-       set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
-               sizeof(int), 0644, proc_dointvec_minmax);
-       set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
-               sizeof(int), 0644, proc_dointvec_minmax);
-       set_table_entry(&table[9], "cache_nice_tries",
-               &sd->cache_nice_tries,
-               sizeof(int), 0644, proc_dointvec_minmax);
-       set_table_entry(&table[10], "flags", &sd->flags,
-               sizeof(int), 0644, proc_dointvec_minmax);
-       set_table_entry(&table[11], "name", sd->name,
-               CORENAME_MAX_SIZE, 0444, proc_dostring);
-       /* &table[12] is terminator */
-
-       return table;
-}
-
-static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
-{
-       struct ctl_table *entry, *table;
-       struct sched_domain *sd;
-       int domain_num = 0, i;
-       char buf[32];
-
-       for_each_domain(cpu, sd)
-               domain_num++;
-       entry = table = sd_alloc_ctl_entry(domain_num + 1);
-       if (table == NULL)
-               return NULL;
-
-       i = 0;
-       for_each_domain(cpu, sd) {
-               snprintf(buf, 32, "domain%d", i);
-               entry->procname = kstrdup(buf, GFP_KERNEL);
-               entry->mode = 0555;
-               entry->child = sd_alloc_ctl_domain_table(sd);
-               entry++;
-               i++;
-       }
-       return table;
-}
-
-static struct ctl_table_header *sd_sysctl_header;
-static void register_sched_domain_sysctl(void)
-{
-       int i, cpu_num = num_possible_cpus();
-       struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
-       char buf[32];
-
-       WARN_ON(sd_ctl_dir[0].child);
-       sd_ctl_dir[0].child = entry;
-
-       if (entry == NULL)
-               return;
-
-       for_each_possible_cpu(i) {
-               snprintf(buf, 32, "cpu%d", i);
-               entry->procname = kstrdup(buf, GFP_KERNEL);
-               entry->mode = 0555;
-               entry->child = sd_alloc_ctl_cpu_table(i);
-               entry++;
-       }
-
-       WARN_ON(sd_sysctl_header);
-       sd_sysctl_header = register_sysctl_table(sd_ctl_root);
-}
-
-/* may be called multiple times per register */
-static void unregister_sched_domain_sysctl(void)
-{
-       if (sd_sysctl_header)
-               unregister_sysctl_table(sd_sysctl_header);
-       sd_sysctl_header = NULL;
-       if (sd_ctl_dir[0].child)
-               sd_free_ctl_entry(&sd_ctl_dir[0].child);
-}
-#else
-static void register_sched_domain_sysctl(void)
-{
-}
-static void unregister_sched_domain_sysctl(void)
-{
-}
-#endif
-
-static void set_rq_online(struct rq *rq)
-{
-       if (!rq->online) {
-               const struct sched_class *class;
-
-               cpumask_set_cpu(rq->cpu, rq->rd->online);
-               rq->online = 1;
-
-               for_each_class(class) {
-                       if (class->rq_online)
-                               class->rq_online(rq);
-               }
-       }
-}
-
-static void set_rq_offline(struct rq *rq)
-{
-       if (rq->online) {
-               const struct sched_class *class;
-
-               for_each_class(class) {
-                       if (class->rq_offline)
-                               class->rq_offline(rq);
-               }
-
-               cpumask_clear_cpu(rq->cpu, rq->rd->online);
-               rq->online = 0;
-       }
-}
-
-/*
- * migration_call - callback that gets triggered when a CPU is added.
- * Here we can start up the necessary migration thread for the new CPU.
- */
-static int __cpuinit
-migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
-{
-       int cpu = (long)hcpu;
-       unsigned long flags;
-       struct rq *rq = cpu_rq(cpu);
-
-       switch (action & ~CPU_TASKS_FROZEN) {
-
-       case CPU_UP_PREPARE:
-               rq->calc_load_update = calc_load_update;
-               break;
-
-       case CPU_ONLINE:
-               /* Update our root-domain */
-               raw_spin_lock_irqsave(&rq->lock, flags);
-               if (rq->rd) {
-                       BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
-
-                       set_rq_online(rq);
-               }
-               raw_spin_unlock_irqrestore(&rq->lock, flags);
-               break;
-
-#ifdef CONFIG_HOTPLUG_CPU
-       case CPU_DYING:
-               sched_ttwu_pending();
-               /* Update our root-domain */
-               raw_spin_lock_irqsave(&rq->lock, flags);
-               if (rq->rd) {
-                       BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
-                       set_rq_offline(rq);
-               }
-               migrate_tasks(cpu);
-               BUG_ON(rq->nr_running != 1); /* the migration thread */
-               raw_spin_unlock_irqrestore(&rq->lock, flags);
-
-               migrate_nr_uninterruptible(rq);
-               calc_global_load_remove(rq);
-               break;
-#endif
-       }
-
-       update_max_interval();
-
-       return NOTIFY_OK;
-}
-
-/*
- * Register at high priority so that task migration (migrate_all_tasks)
- * happens before everything else.  This has to be lower priority than
- * the notifier in the perf_event subsystem, though.
- */
-static struct notifier_block __cpuinitdata migration_notifier = {
-       .notifier_call = migration_call,
-       .priority = CPU_PRI_MIGRATION,
-};
-
-static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
-                                     unsigned long action, void *hcpu)
-{
-       switch (action & ~CPU_TASKS_FROZEN) {
-       case CPU_ONLINE:
-       case CPU_DOWN_FAILED:
-               set_cpu_active((long)hcpu, true);
-               return NOTIFY_OK;
-       default:
-               return NOTIFY_DONE;
-       }
-}
-
-static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
-                                       unsigned long action, void *hcpu)
-{
-       switch (action & ~CPU_TASKS_FROZEN) {
-       case CPU_DOWN_PREPARE:
-               set_cpu_active((long)hcpu, false);
-               return NOTIFY_OK;
-       default:
-               return NOTIFY_DONE;
-       }
-}
-
-static int __init migration_init(void)
-{
-       void *cpu = (void *)(long)smp_processor_id();
-       int err;
-
-       /* Initialize migration for the boot CPU */
-       err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
-       BUG_ON(err == NOTIFY_BAD);
-       migration_call(&migration_notifier, CPU_ONLINE, cpu);
-       register_cpu_notifier(&migration_notifier);
-
-       /* Register cpu active notifiers */
-       cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
-       cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
-
-       return 0;
-}
-early_initcall(migration_init);
-#endif
-
-#ifdef CONFIG_SMP
-
-static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
-
-#ifdef CONFIG_SCHED_DEBUG
-
-static __read_mostly int sched_domain_debug_enabled;
-
-static int __init sched_domain_debug_setup(char *str)
-{
-       sched_domain_debug_enabled = 1;
-
-       return 0;
-}
-early_param("sched_debug", sched_domain_debug_setup);
-
-static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
-                                 struct cpumask *groupmask)
-{
-       struct sched_group *group = sd->groups;
-       char str[256];
-
-       cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
-       cpumask_clear(groupmask);
-
-       printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
-
-       if (!(sd->flags & SD_LOAD_BALANCE)) {
-               printk("does not load-balance\n");
-               if (sd->parent)
-                       printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
-                                       " has parent");
-               return -1;
-       }
-
-       printk(KERN_CONT "span %s level %s\n", str, sd->name);
-
-       if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
-               printk(KERN_ERR "ERROR: domain->span does not contain "
-                               "CPU%d\n", cpu);
-       }
-       if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
-               printk(KERN_ERR "ERROR: domain->groups does not contain"
-                               " CPU%d\n", cpu);
-       }
-
-       printk(KERN_DEBUG "%*s groups:", level + 1, "");
-       do {
-               if (!group) {
-                       printk("\n");
-                       printk(KERN_ERR "ERROR: group is NULL\n");
-                       break;
-               }
-
-               if (!group->sgp->power) {
-                       printk(KERN_CONT "\n");
-                       printk(KERN_ERR "ERROR: domain->cpu_power not "
-                                       "set\n");
-                       break;
-               }
-
-               if (!cpumask_weight(sched_group_cpus(group))) {
-                       printk(KERN_CONT "\n");
-                       printk(KERN_ERR "ERROR: empty group\n");
-                       break;
-               }
-
-               if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
-                       printk(KERN_CONT "\n");
-                       printk(KERN_ERR "ERROR: repeated CPUs\n");
-                       break;
-               }
-
-               cpumask_or(groupmask, groupmask, sched_group_cpus(group));
-
-               cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
-
-               printk(KERN_CONT " %s", str);
-               if (group->sgp->power != SCHED_POWER_SCALE) {
-                       printk(KERN_CONT " (cpu_power = %d)",
-                               group->sgp->power);
-               }
-
-               group = group->next;
-       } while (group != sd->groups);
-       printk(KERN_CONT "\n");
-
-       if (!cpumask_equal(sched_domain_span(sd), groupmask))
-               printk(KERN_ERR "ERROR: groups don't span domain->span\n");
-
-       if (sd->parent &&
-           !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
-               printk(KERN_ERR "ERROR: parent span is not a superset "
-                       "of domain->span\n");
-       return 0;
-}
-
-static void sched_domain_debug(struct sched_domain *sd, int cpu)
-{
-       int level = 0;
-
-       if (!sched_domain_debug_enabled)
-               return;
-
-       if (!sd) {
-               printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
-               return;
-       }
-
-       printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
-
-       for (;;) {
-               if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
-                       break;
-               level++;
-               sd = sd->parent;
-               if (!sd)
-                       break;
-       }
-}
-#else /* !CONFIG_SCHED_DEBUG */
-# define sched_domain_debug(sd, cpu) do { } while (0)
-#endif /* CONFIG_SCHED_DEBUG */
-
-static int sd_degenerate(struct sched_domain *sd)
-{
-       if (cpumask_weight(sched_domain_span(sd)) == 1)
-               return 1;
-
-       /* Following flags need at least 2 groups */
-       if (sd->flags & (SD_LOAD_BALANCE |
-                        SD_BALANCE_NEWIDLE |
-                        SD_BALANCE_FORK |
-                        SD_BALANCE_EXEC |
-                        SD_SHARE_CPUPOWER |
-                        SD_SHARE_PKG_RESOURCES)) {
-               if (sd->groups != sd->groups->next)
-                       return 0;
-       }
-
-       /* Following flags don't use groups */
-       if (sd->flags & (SD_WAKE_AFFINE))
-               return 0;
-
-       return 1;
-}
-
-static int
-sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
-{
-       unsigned long cflags = sd->flags, pflags = parent->flags;
-
-       if (sd_degenerate(parent))
-               return 1;
-
-       if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
-               return 0;
-
-       /* Flags needing groups don't count if only 1 group in parent */
-       if (parent->groups == parent->groups->next) {
-               pflags &= ~(SD_LOAD_BALANCE |
-                               SD_BALANCE_NEWIDLE |
-                               SD_BALANCE_FORK |
-                               SD_BALANCE_EXEC |
-                               SD_SHARE_CPUPOWER |
-                               SD_SHARE_PKG_RESOURCES);
-               if (nr_node_ids == 1)
-                       pflags &= ~SD_SERIALIZE;
-       }
-       if (~cflags & pflags)
-               return 0;
-
-       return 1;
-}
-
-static void free_rootdomain(struct rcu_head *rcu)
-{
-       struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
-
-       cpupri_cleanup(&rd->cpupri);
-       free_cpumask_var(rd->rto_mask);
-       free_cpumask_var(rd->online);
-       free_cpumask_var(rd->span);
-       kfree(rd);
-}
-
-static void rq_attach_root(struct rq *rq, struct root_domain *rd)
-{
-       struct root_domain *old_rd = NULL;
-       unsigned long flags;
-
-       raw_spin_lock_irqsave(&rq->lock, flags);
-
-       if (rq->rd) {
-               old_rd = rq->rd;
-
-               if (cpumask_test_cpu(rq->cpu, old_rd->online))
-                       set_rq_offline(rq);
-
-               cpumask_clear_cpu(rq->cpu, old_rd->span);
-
-               /*
-                * If we don't want to free the old_rd yet, then set
-                * old_rd to NULL to skip the freeing later in this
-                * function:
-                */
-               if (!atomic_dec_and_test(&old_rd->refcount))
-                       old_rd = NULL;
-       }
-
-       atomic_inc(&rd->refcount);
-       rq->rd = rd;
-
-       cpumask_set_cpu(rq->cpu, rd->span);
-       if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
-               set_rq_online(rq);
-
-       raw_spin_unlock_irqrestore(&rq->lock, flags);
-
-       if (old_rd)
-               call_rcu_sched(&old_rd->rcu, free_rootdomain);
-}
-
-static int init_rootdomain(struct root_domain *rd)
-{
-       memset(rd, 0, sizeof(*rd));
-
-       if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
-               goto out;
-       if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
-               goto free_span;
-       if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
-               goto free_online;
-
-       if (cpupri_init(&rd->cpupri) != 0)
-               goto free_rto_mask;
-       return 0;
-
-free_rto_mask:
-       free_cpumask_var(rd->rto_mask);
-free_online:
-       free_cpumask_var(rd->online);
-free_span:
-       free_cpumask_var(rd->span);
-out:
-       return -ENOMEM;
-}
-
-static void init_defrootdomain(void)
-{
-       init_rootdomain(&def_root_domain);
-
-       atomic_set(&def_root_domain.refcount, 1);
-}
-
-static struct root_domain *alloc_rootdomain(void)
-{
-       struct root_domain *rd;
-
-       rd = kmalloc(sizeof(*rd), GFP_KERNEL);
-       if (!rd)
-               return NULL;
-
-       if (init_rootdomain(rd) != 0) {
-               kfree(rd);
-               return NULL;
-       }
-
-       return rd;
-}
-
-static void free_sched_groups(struct sched_group *sg, int free_sgp)
-{
-       struct sched_group *tmp, *first;
-
-       if (!sg)
-               return;
-
-       first = sg;
-       do {
-               tmp = sg->next;
-
-               if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
-                       kfree(sg->sgp);
-
-               kfree(sg);
-               sg = tmp;
-       } while (sg != first);
-}
-
-static void free_sched_domain(struct rcu_head *rcu)
-{
-       struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
-
-       /*
-        * If it's an overlapping domain it has private groups; iterate and
-        * nuke them all.
-        */
-       if (sd->flags & SD_OVERLAP) {
-               free_sched_groups(sd->groups, 1);
-       } else if (atomic_dec_and_test(&sd->groups->ref)) {
-               kfree(sd->groups->sgp);
-               kfree(sd->groups);
-       }
-       kfree(sd);
-}
-
-static void destroy_sched_domain(struct sched_domain *sd, int cpu)
-{
-       call_rcu(&sd->rcu, free_sched_domain);
-}
-
-static void destroy_sched_domains(struct sched_domain *sd, int cpu)
-{
-       for (; sd; sd = sd->parent)
-               destroy_sched_domain(sd, cpu);
-}
-
-/*
- * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
- * hold the hotplug lock.
- */
-static void
-cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
-{
-       struct rq *rq = cpu_rq(cpu);
-       struct sched_domain *tmp;
-
-       /* Remove the sched domains which do not contribute to scheduling. */
-       for (tmp = sd; tmp; ) {
-               struct sched_domain *parent = tmp->parent;
-               if (!parent)
-                       break;
-
-               if (sd_parent_degenerate(tmp, parent)) {
-                       tmp->parent = parent->parent;
-                       if (parent->parent)
-                               parent->parent->child = tmp;
-                       destroy_sched_domain(parent, cpu);
-               } else
-                       tmp = tmp->parent;
-       }
-
-       if (sd && sd_degenerate(sd)) {
-               tmp = sd;
-               sd = sd->parent;
-               destroy_sched_domain(tmp, cpu);
-               if (sd)
-                       sd->child = NULL;
-       }
-
-       sched_domain_debug(sd, cpu);
-
-       rq_attach_root(rq, rd);
-       tmp = rq->sd;
-       rcu_assign_pointer(rq->sd, sd);
-       destroy_sched_domains(tmp, cpu);
-}
-
-/* cpus with isolated domains */
-static cpumask_var_t cpu_isolated_map;
-
-/* Setup the mask of cpus configured for isolated domains */
-static int __init isolated_cpu_setup(char *str)
-{
-       alloc_bootmem_cpumask_var(&cpu_isolated_map);
-       cpulist_parse(str, cpu_isolated_map);
-       return 1;
-}
-
-__setup("isolcpus=", isolated_cpu_setup);
-
-#ifdef CONFIG_NUMA
-
-/**
- * find_next_best_node - find the next node to include in a sched_domain
- * @node: node whose sched_domain we're building
- * @used_nodes: nodes already in the sched_domain
- *
- * Find the next node to include in a given scheduling domain. Simply
- * finds the closest node not already in the @used_nodes map.
- *
- * Should use nodemask_t.
- */
-static int find_next_best_node(int node, nodemask_t *used_nodes)
-{
-       int i, n, val, min_val, best_node = -1;
-
-       min_val = INT_MAX;
-
-       for (i = 0; i < nr_node_ids; i++) {
-               /* Start at @node */
-               n = (node + i) % nr_node_ids;
-
-               if (!nr_cpus_node(n))
-                       continue;
-
-               /* Skip already used nodes */
-               if (node_isset(n, *used_nodes))
-                       continue;
-
-               /* Simple min distance search */
-               val = node_distance(node, n);
-
-               if (val < min_val) {
-                       min_val = val;
-                       best_node = n;
-               }
-       }
-
-       if (best_node != -1)
-               node_set(best_node, *used_nodes);
-       return best_node;
-}
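-
-/*
- * A worked example (hypothetical topology, for illustration only): on a
- * 4-node machine where node_distance(0, n) is 10, 20, 30 and 40 for
- * n = 0..3, starting from node 0 with *used_nodes = {0}, the first call
- * returns node 1 (distance 20 is the smallest among unused nodes that
- * have CPUs), the next call returns node 2, and so on; each chosen node
- * is added to @used_nodes before returning.
- */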
-
-/**
- * sched_domain_node_span - get a cpumask for a node's sched_domain
- * @node: node whose cpumask we're constructing
- * @span: resulting cpumask
- *
- * Given a node, construct a good cpumask for its sched_domain to span. It
- * should be one that prevents unnecessary balancing, but also spreads tasks
- * out optimally.
- */
-static void sched_domain_node_span(int node, struct cpumask *span)
-{
-       nodemask_t used_nodes;
-       int i;
-
-       cpumask_clear(span);
-       nodes_clear(used_nodes);
-
-       cpumask_or(span, span, cpumask_of_node(node));
-       node_set(node, used_nodes);
-
-       for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
-               int next_node = find_next_best_node(node, &used_nodes);
-               if (next_node < 0)
-                       break;
-               cpumask_or(span, span, cpumask_of_node(next_node));
-       }
-}
-
-static const struct cpumask *cpu_node_mask(int cpu)
-{
-       lockdep_assert_held(&sched_domains_mutex);
-
-       sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
-
-       return sched_domains_tmpmask;
-}
-
-static const struct cpumask *cpu_allnodes_mask(int cpu)
-{
-       return cpu_possible_mask;
-}
-#endif /* CONFIG_NUMA */
-
-static const struct cpumask *cpu_cpu_mask(int cpu)
-{
-       return cpumask_of_node(cpu_to_node(cpu));
-}
-
-int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
-
-struct sd_data {
-       struct sched_domain **__percpu sd;
-       struct sched_group **__percpu sg;
-       struct sched_group_power **__percpu sgp;
-};
-
-struct s_data {
-       struct sched_domain ** __percpu sd;
-       struct root_domain      *rd;
-};
-
-enum s_alloc {
-       sa_rootdomain,
-       sa_sd,
-       sa_sd_storage,
-       sa_none,
-};
-
-struct sched_domain_topology_level;
-
-typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
-typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
-
-#define SDTL_OVERLAP   0x01
-
-struct sched_domain_topology_level {
-       sched_domain_init_f init;
-       sched_domain_mask_f mask;
-       int                 flags;
-       struct sd_data      data;
-};
-
-static int
-build_overlap_sched_groups(struct sched_domain *sd, int cpu)
-{
-       struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
-       const struct cpumask *span = sched_domain_span(sd);
-       struct cpumask *covered = sched_domains_tmpmask;
-       struct sd_data *sdd = sd->private;
-       struct sched_domain *child;
-       int i;
-
-       cpumask_clear(covered);
-
-       for_each_cpu(i, span) {
-               struct cpumask *sg_span;
-
-               if (cpumask_test_cpu(i, covered))
-                       continue;
-
-               sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
-                               GFP_KERNEL, cpu_to_node(i));
-
-               if (!sg)
-                       goto fail;
-
-               sg_span = sched_group_cpus(sg);
-
-               child = *per_cpu_ptr(sdd->sd, i);
-               if (child->child) {
-                       child = child->child;
-                       cpumask_copy(sg_span, sched_domain_span(child));
-               } else
-                       cpumask_set_cpu(i, sg_span);
-
-               cpumask_or(covered, covered, sg_span);
-
-               sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span));
-               atomic_inc(&sg->sgp->ref);
-
-               if (cpumask_test_cpu(cpu, sg_span))
-                       groups = sg;
-
-               if (!first)
-                       first = sg;
-               if (last)
-                       last->next = sg;
-               last = sg;
-               last->next = first;
-       }
-       sd->groups = groups;
-
-       return 0;
-
-fail:
-       free_sched_groups(first, 0);
-
-       return -ENOMEM;
-}
-
-static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
-{
-       struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
-       struct sched_domain *child = sd->child;
-
-       if (child)
-               cpu = cpumask_first(sched_domain_span(child));
-
-       if (sg) {
-               *sg = *per_cpu_ptr(sdd->sg, cpu);
-               (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
-               atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */
-       }
-
-       return cpu;
-}
-
-/*
- * build_sched_groups will build a circular linked list of the groups
- * covered by the given span, and will set each group's ->cpumask correctly
- * and its ->cpu_power to 0.
- *
- * Assumes the sched_domain tree is fully constructed
- */
-static int
-build_sched_groups(struct sched_domain *sd, int cpu)
-{
-       struct sched_group *first = NULL, *last = NULL;
-       struct sd_data *sdd = sd->private;
-       const struct cpumask *span = sched_domain_span(sd);
-       struct cpumask *covered;
-       int i;
-
-       get_group(cpu, sdd, &sd->groups);
-       atomic_inc(&sd->groups->ref);
-
-       if (cpu != cpumask_first(sched_domain_span(sd)))
-               return 0;
-
-       lockdep_assert_held(&sched_domains_mutex);
-       covered = sched_domains_tmpmask;
-
-       cpumask_clear(covered);
-
-       for_each_cpu(i, span) {
-               struct sched_group *sg;
-               int group = get_group(i, sdd, &sg);
-               int j;
-
-               if (cpumask_test_cpu(i, covered))
-                       continue;
-
-               cpumask_clear(sched_group_cpus(sg));
-               sg->sgp->power = 0;
-
-               for_each_cpu(j, span) {
-                       if (get_group(j, sdd, NULL) != group)
-                               continue;
-
-                       cpumask_set_cpu(j, covered);
-                       cpumask_set_cpu(j, sched_group_cpus(sg));
-               }
-
-               if (!first)
-                       first = sg;
-               if (last)
-                       last->next = sg;
-               last = sg;
-       }
-       last->next = first;
-
-       return 0;
-}
-
-/*
- * Initialize sched groups cpu_power.
- *
- * cpu_power indicates the capacity of a sched group, and is used when
- * distributing load between the different sched groups in a sched domain.
- * Typically cpu_power is the same for all groups in a sched domain unless
- * there are asymmetries in the topology. If there are asymmetries, the
- * group with more cpu_power will pick up more load than the group with
- * less cpu_power.
- */
-static void init_sched_groups_power(int cpu, struct sched_domain *sd)
-{
-       struct sched_group *sg = sd->groups;
-
-       WARN_ON(!sd || !sg);
-
-       do {
-               sg->group_weight = cpumask_weight(sched_group_cpus(sg));
-               sg = sg->next;
-       } while (sg != sd->groups);
-
-       if (cpu != group_first_cpu(sg))
-               return;
-
-       update_group_power(sd, cpu);
-}
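-
-/*
- * For example (illustrative numbers only, assuming SCHED_POWER_SCALE is
- * 1024): a group whose update_group_power() result comes out at 2048 is
- * treated as having twice the capacity of a group left at 1024, so the
- * load balancer will aim to place roughly twice as much load on it.
- */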
-
-/*
- * Initializers for schedule domains
- * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
- */
-
-#ifdef CONFIG_SCHED_DEBUG
-# define SD_INIT_NAME(sd, type)                sd->name = #type
-#else
-# define SD_INIT_NAME(sd, type)                do { } while (0)
-#endif
-
-#define SD_INIT_FUNC(type)                                             \
-static noinline struct sched_domain *                                  \
-sd_init_##type(struct sched_domain_topology_level *tl, int cpu)        \
-{                                                                      \
-       struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);       \
-       *sd = SD_##type##_INIT;                                         \
-       SD_INIT_NAME(sd, type);                                         \
-       sd->private = &tl->data;                                        \
-       return sd;                                                      \
-}
-
-SD_INIT_FUNC(CPU)
-#ifdef CONFIG_NUMA
- SD_INIT_FUNC(ALLNODES)
- SD_INIT_FUNC(NODE)
-#endif
-#ifdef CONFIG_SCHED_SMT
- SD_INIT_FUNC(SIBLING)
-#endif
-#ifdef CONFIG_SCHED_MC
- SD_INIT_FUNC(MC)
-#endif
-#ifdef CONFIG_SCHED_BOOK
- SD_INIT_FUNC(BOOK)
-#endif
-
-static int default_relax_domain_level = -1;
-int sched_domain_level_max;
-
-static int __init setup_relax_domain_level(char *str)
-{
-       unsigned long val;
-
-       val = simple_strtoul(str, NULL, 0);
-       if (val < sched_domain_level_max)
-               default_relax_domain_level = val;
-
-       return 1;
-}
-__setup("relax_domain_level=", setup_relax_domain_level);
-
-static void set_domain_attribute(struct sched_domain *sd,
-                                struct sched_domain_attr *attr)
-{
-       int request;
-
-       if (!attr || attr->relax_domain_level < 0) {
-               if (default_relax_domain_level < 0)
-                       return;
-               else
-                       request = default_relax_domain_level;
-       } else
-               request = attr->relax_domain_level;
-       if (request < sd->level) {
-               /* turn off idle balance on this domain */
-               sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
-       } else {
-               /* turn on idle balance on this domain */
-               sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
-       }
-}
-
-static void __sdt_free(const struct cpumask *cpu_map);
-static int __sdt_alloc(const struct cpumask *cpu_map);
-
-static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
-                                const struct cpumask *cpu_map)
-{
-       switch (what) {
-       case sa_rootdomain:
-               if (!atomic_read(&d->rd->refcount))
-                       free_rootdomain(&d->rd->rcu); /* fall through */
-       case sa_sd:
-               free_percpu(d->sd); /* fall through */
-       case sa_sd_storage:
-               __sdt_free(cpu_map); /* fall through */
-       case sa_none:
-               break;
-       }
-}
-
-static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
-                                                  const struct cpumask *cpu_map)
-{
-       memset(d, 0, sizeof(*d));
-
-       if (__sdt_alloc(cpu_map))
-               return sa_sd_storage;
-       d->sd = alloc_percpu(struct sched_domain *);
-       if (!d->sd)
-               return sa_sd_storage;
-       d->rd = alloc_rootdomain();
-       if (!d->rd)
-               return sa_sd;
-       return sa_rootdomain;
-}
-
-/*
- * NULL the sd_data elements we've used to build the sched_domain and
- * sched_group structure so that the subsequent __free_domain_allocs()
- * will not free the data we're using.
- */
-static void claim_allocations(int cpu, struct sched_domain *sd)
-{
-       struct sd_data *sdd = sd->private;
-
-       WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
-       *per_cpu_ptr(sdd->sd, cpu) = NULL;
-
-       if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
-               *per_cpu_ptr(sdd->sg, cpu) = NULL;
-
-       if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
-               *per_cpu_ptr(sdd->sgp, cpu) = NULL;
-}
-
-#ifdef CONFIG_SCHED_SMT
-static const struct cpumask *cpu_smt_mask(int cpu)
-{
-       return topology_thread_cpumask(cpu);
-}
-#endif
-
-/*
- * Topology list, bottom-up.
- */
-static struct sched_domain_topology_level default_topology[] = {
-#ifdef CONFIG_SCHED_SMT
-       { sd_init_SIBLING, cpu_smt_mask, },
-#endif
-#ifdef CONFIG_SCHED_MC
-       { sd_init_MC, cpu_coregroup_mask, },
-#endif
-#ifdef CONFIG_SCHED_BOOK
-       { sd_init_BOOK, cpu_book_mask, },
-#endif
-       { sd_init_CPU, cpu_cpu_mask, },
-#ifdef CONFIG_NUMA
-       { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
-       { sd_init_ALLNODES, cpu_allnodes_mask, },
-#endif
-       { NULL, },
-};
-
-static struct sched_domain_topology_level *sched_domain_topology = default_topology;
-
-static int __sdt_alloc(const struct cpumask *cpu_map)
-{
-       struct sched_domain_topology_level *tl;
-       int j;
-
-       for (tl = sched_domain_topology; tl->init; tl++) {
-               struct sd_data *sdd = &tl->data;
-
-               sdd->sd = alloc_percpu(struct sched_domain *);
-               if (!sdd->sd)
-                       return -ENOMEM;
-
-               sdd->sg = alloc_percpu(struct sched_group *);
-               if (!sdd->sg)
-                       return -ENOMEM;
-
-               sdd->sgp = alloc_percpu(struct sched_group_power *);
-               if (!sdd->sgp)
-                       return -ENOMEM;
-
-               for_each_cpu(j, cpu_map) {
-                       struct sched_domain *sd;
-                       struct sched_group *sg;
-                       struct sched_group_power *sgp;
-
-                       sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
-                                       GFP_KERNEL, cpu_to_node(j));
-                       if (!sd)
-                               return -ENOMEM;
-
-                       *per_cpu_ptr(sdd->sd, j) = sd;
-
-                       sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
-                                       GFP_KERNEL, cpu_to_node(j));
-                       if (!sg)
-                               return -ENOMEM;
-
-                       *per_cpu_ptr(sdd->sg, j) = sg;
-
-                       sgp = kzalloc_node(sizeof(struct sched_group_power),
-                                       GFP_KERNEL, cpu_to_node(j));
-                       if (!sgp)
-                               return -ENOMEM;
-
-                       *per_cpu_ptr(sdd->sgp, j) = sgp;
-               }
-       }
-
-       return 0;
-}
-
-static void __sdt_free(const struct cpumask *cpu_map)
-{
-       struct sched_domain_topology_level *tl;
-       int j;
-
-       for (tl = sched_domain_topology; tl->init; tl++) {
-               struct sd_data *sdd = &tl->data;
-
-               for_each_cpu(j, cpu_map) {
-                       struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j);
-                       if (sd && (sd->flags & SD_OVERLAP))
-                               free_sched_groups(sd->groups, 0);
-                       kfree(*per_cpu_ptr(sdd->sd, j));
-                       kfree(*per_cpu_ptr(sdd->sg, j));
-                       kfree(*per_cpu_ptr(sdd->sgp, j));
-               }
-               free_percpu(sdd->sd);
-               free_percpu(sdd->sg);
-               free_percpu(sdd->sgp);
-       }
-}
-
-struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
-               struct s_data *d, const struct cpumask *cpu_map,
-               struct sched_domain_attr *attr, struct sched_domain *child,
-               int cpu)
-{
-       struct sched_domain *sd = tl->init(tl, cpu);
-       if (!sd)
-               return child;
-
-       set_domain_attribute(sd, attr);
-       cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
-       if (child) {
-               sd->level = child->level + 1;
-               sched_domain_level_max = max(sched_domain_level_max, sd->level);
-               child->parent = sd;
-       }
-       sd->child = child;
-
-       return sd;
-}
-
-/*
- * Build sched domains for a given set of cpus and attach the sched domains
- * to the individual cpus
- */
-static int build_sched_domains(const struct cpumask *cpu_map,
-                              struct sched_domain_attr *attr)
-{
-       enum s_alloc alloc_state = sa_none;
-       struct sched_domain *sd;
-       struct s_data d;
-       int i, ret = -ENOMEM;
-
-       alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
-       if (alloc_state != sa_rootdomain)
-               goto error;
-
-       /* Set up domains for cpus specified by the cpu_map. */
-       for_each_cpu(i, cpu_map) {
-               struct sched_domain_topology_level *tl;
-
-               sd = NULL;
-               for (tl = sched_domain_topology; tl->init; tl++) {
-                       sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
-                       if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
-                               sd->flags |= SD_OVERLAP;
-                       if (cpumask_equal(cpu_map, sched_domain_span(sd)))
-                               break;
-               }
-
-               while (sd->child)
-                       sd = sd->child;
-
-               *per_cpu_ptr(d.sd, i) = sd;
-       }
-
-       /* Build the groups for the domains */
-       for_each_cpu(i, cpu_map) {
-               for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
-                       sd->span_weight = cpumask_weight(sched_domain_span(sd));
-                       if (sd->flags & SD_OVERLAP) {
-                               if (build_overlap_sched_groups(sd, i))
-                                       goto error;
-                       } else {
-                               if (build_sched_groups(sd, i))
-                                       goto error;
-                       }
-               }
-       }
-
-       /* Calculate CPU power for physical packages and nodes */
-       for (i = nr_cpumask_bits-1; i >= 0; i--) {
-               if (!cpumask_test_cpu(i, cpu_map))
-                       continue;
-
-               for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
-                       claim_allocations(i, sd);
-                       init_sched_groups_power(i, sd);
-               }
-       }
-
-       /* Attach the domains */
-       rcu_read_lock();
-       for_each_cpu(i, cpu_map) {
-               sd = *per_cpu_ptr(d.sd, i);
-               cpu_attach_domain(sd, d.rd, i);
-       }
-       rcu_read_unlock();
-
-       ret = 0;
-error:
-       __free_domain_allocs(&d, alloc_state, cpu_map);
-       return ret;
-}
-
-static cpumask_var_t *doms_cur;        /* current sched domains */
-static int ndoms_cur;          /* number of sched domains in 'doms_cur' */
-static struct sched_domain_attr *dattr_cur;
-                               /* attributes of custom domains in 'doms_cur' */
-
-/*
- * Special case: If a kmalloc of a doms_cur partition (array of
- * cpumask) fails, then fallback to a single sched domain,
- * as determined by the single cpumask fallback_doms.
- */
-static cpumask_var_t fallback_doms;
-
-/*
- * arch_update_cpu_topology lets virtualized architectures update the
- * cpu core maps. It is supposed to return 1 if the topology changed
- * or 0 if it stayed the same.
- */
-int __attribute__((weak)) arch_update_cpu_topology(void)
-{
-       return 0;
-}
-
-cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
-{
-       int i;
-       cpumask_var_t *doms;
-
-       doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
-       if (!doms)
-               return NULL;
-       for (i = 0; i < ndoms; i++) {
-               if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
-                       free_sched_domains(doms, i);
-                       return NULL;
-               }
-       }
-       return doms;
-}
-
-void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
-{
-       unsigned int i;
-       for (i = 0; i < ndoms; i++)
-               free_cpumask_var(doms[i]);
-       kfree(doms);
-}
-
-/*
- * Set up scheduler domains and groups. Callers must hold the hotplug lock.
- * For now this just excludes isolated cpus, but could be used to
- * exclude other special cases in the future.
- */
-static int init_sched_domains(const struct cpumask *cpu_map)
-{
-       int err;
-
-       arch_update_cpu_topology();
-       ndoms_cur = 1;
-       doms_cur = alloc_sched_domains(ndoms_cur);
-       if (!doms_cur)
-               doms_cur = &fallback_doms;
-       cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
-       dattr_cur = NULL;
-       err = build_sched_domains(doms_cur[0], NULL);
-       register_sched_domain_sysctl();
-
-       return err;
-}
-
-/*
- * Detach sched domains from a group of cpus specified in cpu_map.
- * These cpus will now be attached to the NULL domain.
- */
-static void detach_destroy_domains(const struct cpumask *cpu_map)
-{
-       int i;
-
-       rcu_read_lock();
-       for_each_cpu(i, cpu_map)
-               cpu_attach_domain(NULL, &def_root_domain, i);
-       rcu_read_unlock();
-}
-
-/* handle null as "default" */
-static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
-                       struct sched_domain_attr *new, int idx_new)
-{
-       struct sched_domain_attr tmp;
-
-       /* fast path */
-       if (!new && !cur)
-               return 1;
-
-       tmp = SD_ATTR_INIT;
-       return !memcmp(cur ? (cur + idx_cur) : &tmp,
-                       new ? (new + idx_new) : &tmp,
-                       sizeof(struct sched_domain_attr));
-}
-
-/*
- * Partition sched domains as specified by the 'ndoms_new'
- * cpumasks in the array doms_new[] of cpumasks. This compares
- * doms_new[] to the current sched domain partitioning, doms_cur[].
- * It destroys each deleted domain and builds each new domain.
- *
- * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
- * The masks don't intersect (don't overlap). We should set up one
- * sched domain for each mask. CPUs not in any of the cpumasks will
- * not be load balanced. If the same cpumask appears both in the
- * current 'doms_cur' domains and in the new 'doms_new', we can leave
- * it as it is.
- *
- * The passed-in 'doms_new' should be allocated using
- * alloc_sched_domains().  This routine takes ownership of it and will
- * free it with free_sched_domains() when done. If the caller's alloc
- * call failed, it can pass in doms_new == NULL && ndoms_new == 1, and
- * partition_sched_domains() will fall back to the single partition
- * 'fallback_doms'; this also forces the domains to be rebuilt.
- *
- * If doms_new == NULL it will be replaced with cpu_online_mask.
- * ndoms_new == 0 is a special case for destroying existing domains,
- * and it will not create the default domain.
- *
- * Call with hotplug lock held
- */
-void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
-                            struct sched_domain_attr *dattr_new)
-{
-       int i, j, n;
-       int new_topology;
-
-       mutex_lock(&sched_domains_mutex);
-
-       /* always unregister in case we don't destroy any domains */
-       unregister_sched_domain_sysctl();
-
-       /* Let architecture update cpu core mappings. */
-       new_topology = arch_update_cpu_topology();
-
-       n = doms_new ? ndoms_new : 0;
-
-       /* Destroy deleted domains */
-       for (i = 0; i < ndoms_cur; i++) {
-               for (j = 0; j < n && !new_topology; j++) {
-                       if (cpumask_equal(doms_cur[i], doms_new[j])
-                           && dattrs_equal(dattr_cur, i, dattr_new, j))
-                               goto match1;
-               }
-               /* no match - a current sched domain not in new doms_new[] */
-               detach_destroy_domains(doms_cur[i]);
-match1:
-               ;
-       }
-
-       if (doms_new == NULL) {
-               ndoms_cur = 0;
-               doms_new = &fallback_doms;
-               cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
-               WARN_ON_ONCE(dattr_new);
-       }
-
-       /* Build new domains */
-       for (i = 0; i < ndoms_new; i++) {
-               for (j = 0; j < ndoms_cur && !new_topology; j++) {
-                       if (cpumask_equal(doms_new[i], doms_cur[j])
-                           && dattrs_equal(dattr_new, i, dattr_cur, j))
-                               goto match2;
-               }
-               /* no match - add a new doms_new */
-               build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
-match2:
-               ;
-       }
-
-       /* Remember the new sched domains */
-       if (doms_cur != &fallback_doms)
-               free_sched_domains(doms_cur, ndoms_cur);
-       kfree(dattr_cur);       /* kfree(NULL) is safe */
-       doms_cur = doms_new;
-       dattr_cur = dattr_new;
-       ndoms_cur = ndoms_new;
-
-       register_sched_domain_sysctl();
-
-       mutex_unlock(&sched_domains_mutex);
-}
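-
-/*
- * Minimal usage sketch of the interface above (hypothetical caller;
- * 'set_a' and 'set_b' stand in for two disjoint cpumasks chosen by the
- * caller and exist only for this sketch):
- */
-#if 0
-static void example_repartition(const struct cpumask *set_a,
-                               const struct cpumask *set_b)
-{
-       cpumask_var_t *doms = alloc_sched_domains(2);
-
-       if (!doms)
-               return;
-
-       cpumask_copy(doms[0], set_a);
-       cpumask_copy(doms[1], set_b);
-
-       get_online_cpus();
-       /*
-        * partition_sched_domains() takes ownership of 'doms': deleted
-        * domains are destroyed and the new ones are built.
-        */
-       partition_sched_domains(2, doms, NULL);
-       put_online_cpus();
-}
-#endif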
-
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-static void reinit_sched_domains(void)
-{
-       get_online_cpus();
-
-       /* Destroy domains first to force the rebuild */
-       partition_sched_domains(0, NULL, NULL);
-
-       rebuild_sched_domains();
-       put_online_cpus();
-}
-
-static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
-{
-       unsigned int level = 0;
-
-       if (sscanf(buf, "%u", &level) != 1)
-               return -EINVAL;
-
-       /*
-        * level is always positive, so don't check for
-        * level < POWERSAVINGS_BALANCE_NONE, which is 0.
-        * What happens on a 0 or 1 byte write?
-        * Do we need to check count as well?
-        */
-
-       if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
-               return -EINVAL;
-
-       if (smt)
-               sched_smt_power_savings = level;
-       else
-               sched_mc_power_savings = level;
-
-       reinit_sched_domains();
-
-       return count;
-}
-
-#ifdef CONFIG_SCHED_MC
-static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
-                                          struct sysdev_class_attribute *attr,
-                                          char *page)
-{
-       return sprintf(page, "%u\n", sched_mc_power_savings);
-}
-static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
-                                           struct sysdev_class_attribute *attr,
-                                           const char *buf, size_t count)
-{
-       return sched_power_savings_store(buf, count, 0);
-}
-static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
-                        sched_mc_power_savings_show,
-                        sched_mc_power_savings_store);
-#endif
-
-#ifdef CONFIG_SCHED_SMT
-static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
-                                           struct sysdev_class_attribute *attr,
-                                           char *page)
-{
-       return sprintf(page, "%u\n", sched_smt_power_savings);
-}
-static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
-                                            struct sysdev_class_attribute *attr,
-                                            const char *buf, size_t count)
-{
-       return sched_power_savings_store(buf, count, 1);
-}
-static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
-                  sched_smt_power_savings_show,
-                  sched_smt_power_savings_store);
-#endif
-
-int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
-{
-       int err = 0;
-
-#ifdef CONFIG_SCHED_SMT
-       if (smt_capable())
-               err = sysfs_create_file(&cls->kset.kobj,
-                                       &attr_sched_smt_power_savings.attr);
-#endif
-#ifdef CONFIG_SCHED_MC
-       if (!err && mc_capable())
-               err = sysfs_create_file(&cls->kset.kobj,
-                                       &attr_sched_mc_power_savings.attr);
-#endif
-       return err;
-}
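-
-/*
- * From userspace these attributes are expected to show up as
- * /sys/devices/system/cpu/sched_mc_power_savings and
- * /sys/devices/system/cpu/sched_smt_power_savings; writing a level, e.g.
- * "echo 1 > /sys/devices/system/cpu/sched_mc_power_savings", ends up in
- * sched_power_savings_store() and rebuilds the domains via
- * reinit_sched_domains().
- */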
-#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
-
-/*
- * Update cpusets according to cpu_active mask.  If cpusets are
- * disabled, cpuset_update_active_cpus() becomes a simple wrapper
- * around partition_sched_domains().
- */
-static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
-                            void *hcpu)
-{
-       switch (action & ~CPU_TASKS_FROZEN) {
-       case CPU_ONLINE:
-       case CPU_DOWN_FAILED:
-               cpuset_update_active_cpus();
-               return NOTIFY_OK;
-       default:
-               return NOTIFY_DONE;
-       }
-}
-
-static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
-                              void *hcpu)
-{
-       switch (action & ~CPU_TASKS_FROZEN) {
-       case CPU_DOWN_PREPARE:
-               cpuset_update_active_cpus();
-               return NOTIFY_OK;
-       default:
-               return NOTIFY_DONE;
-       }
-}
-
-static int update_runtime(struct notifier_block *nfb,
-                               unsigned long action, void *hcpu)
-{
-       int cpu = (int)(long)hcpu;
-
-       switch (action) {
-       case CPU_DOWN_PREPARE:
-       case CPU_DOWN_PREPARE_FROZEN:
-               disable_runtime(cpu_rq(cpu));
-               return NOTIFY_OK;
-
-       case CPU_DOWN_FAILED:
-       case CPU_DOWN_FAILED_FROZEN:
-       case CPU_ONLINE:
-       case CPU_ONLINE_FROZEN:
-               enable_runtime(cpu_rq(cpu));
-               return NOTIFY_OK;
-
-       default:
-               return NOTIFY_DONE;
-       }
-}
-
-void __init sched_init_smp(void)
-{
-       cpumask_var_t non_isolated_cpus;
-
-       alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
-       alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
-
-       get_online_cpus();
-       mutex_lock(&sched_domains_mutex);
-       init_sched_domains(cpu_active_mask);
-       cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
-       if (cpumask_empty(non_isolated_cpus))
-               cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
-       mutex_unlock(&sched_domains_mutex);
-       put_online_cpus();
-
-       hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
-       hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
-
-       /* RT runtime code needs to handle some hotplug events */
-       hotcpu_notifier(update_runtime, 0);
-
-       init_hrtick();
-
-       /* Move init over to a non-isolated CPU */
-       if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
-               BUG();
-       sched_init_granularity();
-       free_cpumask_var(non_isolated_cpus);
-
-       init_sched_rt_class();
-}
-#else
-void __init sched_init_smp(void)
-{
-       sched_init_granularity();
-}
-#endif /* CONFIG_SMP */
-
-const_debug unsigned int sysctl_timer_migration = 1;
-
-int in_sched_functions(unsigned long addr)
-{
-       return in_lock_functions(addr) ||
-               (addr >= (unsigned long)__sched_text_start
-               && addr < (unsigned long)__sched_text_end);
-}
-
-static void init_cfs_rq(struct cfs_rq *cfs_rq)
-{
-       cfs_rq->tasks_timeline = RB_ROOT;
-       INIT_LIST_HEAD(&cfs_rq->tasks);
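-       /*
-        * Presumably starts far below zero so that the u64 wraparound
-        * handling around min_vruntime is exercised soon after boot.
-        */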
-       cfs_rq->min_vruntime = (u64)(-(1LL << 20));
-#ifndef CONFIG_64BIT
-       cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
-#endif
-}
-
-static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
-{
-       struct rt_prio_array *array;
-       int i;
-
-       array = &rt_rq->active;
-       for (i = 0; i < MAX_RT_PRIO; i++) {
-               INIT_LIST_HEAD(array->queue + i);
-               __clear_bit(i, array->bitmap);
-       }
-       /* delimiter for bitsearch: */
-       __set_bit(MAX_RT_PRIO, array->bitmap);
-
-#if defined CONFIG_SMP
-       rt_rq->highest_prio.curr = MAX_RT_PRIO;
-       rt_rq->highest_prio.next = MAX_RT_PRIO;
-       rt_rq->rt_nr_migratory = 0;
-       rt_rq->overloaded = 0;
-       plist_head_init(&rt_rq->pushable_tasks);
-#endif
-
-       rt_rq->rt_time = 0;
-       rt_rq->rt_throttled = 0;
-       rt_rq->rt_runtime = 0;
-       raw_spin_lock_init(&rt_rq->rt_runtime_lock);
-}
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
-                               struct sched_entity *se, int cpu,
-                               struct sched_entity *parent)
-{
-       struct rq *rq = cpu_rq(cpu);
-
-       cfs_rq->tg = tg;
-       cfs_rq->rq = rq;
-#ifdef CONFIG_SMP
-       /* allow initial update_cfs_load() to truncate */
-       cfs_rq->load_stamp = 1;
-#endif
-       init_cfs_rq_runtime(cfs_rq);
-
-       tg->cfs_rq[cpu] = cfs_rq;
-       tg->se[cpu] = se;
-
-       /* se could be NULL for root_task_group */
-       if (!se)
-               return;
-
-       if (!parent)
-               se->cfs_rq = &rq->cfs;
-       else
-               se->cfs_rq = parent->my_q;
-
-       se->my_q = cfs_rq;
-       update_load_set(&se->load, 0);
-       se->parent = parent;
-}
-#endif
-
-#ifdef CONFIG_RT_GROUP_SCHED
-static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
-               struct sched_rt_entity *rt_se, int cpu,
-               struct sched_rt_entity *parent)
-{
-       struct rq *rq = cpu_rq(cpu);
-
-       rt_rq->highest_prio.curr = MAX_RT_PRIO;
-       rt_rq->rt_nr_boosted = 0;
-       rt_rq->rq = rq;
-       rt_rq->tg = tg;
-
-       tg->rt_rq[cpu] = rt_rq;
-       tg->rt_se[cpu] = rt_se;
-
-       if (!rt_se)
-               return;
-
-       if (!parent)
-               rt_se->rt_rq = &rq->rt;
-       else
-               rt_se->rt_rq = parent->my_q;
-
-       rt_se->my_q = rt_rq;
-       rt_se->parent = parent;
-       INIT_LIST_HEAD(&rt_se->run_list);
-}
-#endif
-
-void __init sched_init(void)
-{
-       int i, j;
-       unsigned long alloc_size = 0, ptr;
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-       alloc_size += 2 * nr_cpu_ids * sizeof(void **);
-#endif
-#ifdef CONFIG_RT_GROUP_SCHED
-       alloc_size += 2 * nr_cpu_ids * sizeof(void **);
-#endif
-#ifdef CONFIG_CPUMASK_OFFSTACK
-       alloc_size += num_possible_cpus() * cpumask_size();
-#endif
-       if (alloc_size) {
-               ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-               root_task_group.se = (struct sched_entity **)ptr;
-               ptr += nr_cpu_ids * sizeof(void **);
-
-               root_task_group.cfs_rq = (struct cfs_rq **)ptr;
-               ptr += nr_cpu_ids * sizeof(void **);
-
-#endif /* CONFIG_FAIR_GROUP_SCHED */
-#ifdef CONFIG_RT_GROUP_SCHED
-               root_task_group.rt_se = (struct sched_rt_entity **)ptr;
-               ptr += nr_cpu_ids * sizeof(void **);
-
-               root_task_group.rt_rq = (struct rt_rq **)ptr;
-               ptr += nr_cpu_ids * sizeof(void **);
-
-#endif /* CONFIG_RT_GROUP_SCHED */
-#ifdef CONFIG_CPUMASK_OFFSTACK
-               for_each_possible_cpu(i) {
-                       per_cpu(load_balance_tmpmask, i) = (void *)ptr;
-                       ptr += cpumask_size();
-               }
-#endif /* CONFIG_CPUMASK_OFFSTACK */
-       }
-
-#ifdef CONFIG_SMP
-       init_defrootdomain();
-#endif
-
-       init_rt_bandwidth(&def_rt_bandwidth,
-                       global_rt_period(), global_rt_runtime());
-
-#ifdef CONFIG_RT_GROUP_SCHED
-       init_rt_bandwidth(&root_task_group.rt_bandwidth,
-                       global_rt_period(), global_rt_runtime());
-#endif /* CONFIG_RT_GROUP_SCHED */
-
-#ifdef CONFIG_CGROUP_SCHED
-       list_add(&root_task_group.list, &task_groups);
-       INIT_LIST_HEAD(&root_task_group.children);
-       autogroup_init(&init_task);
-#endif /* CONFIG_CGROUP_SCHED */
-
-       for_each_possible_cpu(i) {
-               struct rq *rq;
-
-               rq = cpu_rq(i);
-               raw_spin_lock_init(&rq->lock);
-               rq->nr_running = 0;
-               rq->calc_load_active = 0;
-               rq->calc_load_update = jiffies + LOAD_FREQ;
-               init_cfs_rq(&rq->cfs);
-               init_rt_rq(&rq->rt, rq);
-#ifdef CONFIG_FAIR_GROUP_SCHED
-               root_task_group.shares = root_task_group_load;
-               INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
-               /*
-                * How much cpu bandwidth does root_task_group get?
-                *
-                * In case of task-groups formed through the cgroup filesystem, it
-                * gets 100% of the cpu resources in the system. This overall
-                * system cpu resource is divided among the tasks of
-                * root_task_group and its child task-groups in a fair manner,
-                * based on each entity's (task or task-group's) weight
-                * (se->load.weight).
-                *
-                * In other words, if root_task_group has 10 tasks (each of weight
-                * 1024) and two child groups A0 and A1 (of weight 1024 each),
-                * then A0's share of the cpu resource is:
-                *
-                *      A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
-                *
-                * We achieve this by letting root_task_group's tasks sit
-                * directly in rq->cfs (i.e root_task_group->se[] = NULL).
-                */
-               init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
-               init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
-#endif /* CONFIG_FAIR_GROUP_SCHED */
-
-               rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
-#ifdef CONFIG_RT_GROUP_SCHED
-               INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
-               init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
-#endif
-
-               for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
-                       rq->cpu_load[j] = 0;
-
-               rq->last_load_update_tick = jiffies;
-
-#ifdef CONFIG_SMP
-               rq->sd = NULL;
-               rq->rd = NULL;
-               rq->cpu_power = SCHED_POWER_SCALE;
-               rq->post_schedule = 0;
-               rq->active_balance = 0;
-               rq->next_balance = jiffies;
-               rq->push_cpu = 0;
-               rq->cpu = i;
-               rq->online = 0;
-               rq->idle_stamp = 0;
-               rq->avg_idle = 2*sysctl_sched_migration_cost;
-               rq_attach_root(rq, &def_root_domain);
-#ifdef CONFIG_NO_HZ
-               rq->nohz_balance_kick = 0;
-#endif
-#endif
-               init_rq_hrtick(rq);
-               atomic_set(&rq->nr_iowait, 0);
-       }
-
-       set_load_weight(&init_task);
-
-#ifdef CONFIG_PREEMPT_NOTIFIERS
-       INIT_HLIST_HEAD(&init_task.preempt_notifiers);
-#endif
-
-#ifdef CONFIG_SMP
-       open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
-#endif
-
-#ifdef CONFIG_RT_MUTEXES
-       plist_head_init(&init_task.pi_waiters);
-#endif
-
-       /*
-        * The boot idle thread does lazy MMU switching as well:
-        */
-       atomic_inc(&init_mm.mm_count);
-       enter_lazy_tlb(&init_mm, current);
-
-       /*
-        * Make us the idle thread. Technically, schedule() should not be
-        * called from this thread; however, somewhere below it might be.
-        * Because we are the idle thread, we just pick up running again
-        * when this runqueue becomes "idle".
-        */
-       init_idle(current, smp_processor_id());
-
-       calc_load_update = jiffies + LOAD_FREQ;
-
-       /*
-        * During early bootup we pretend to be a normal task:
-        */
-       current->sched_class = &fair_sched_class;
-
-#ifdef CONFIG_SMP
-       zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
-#ifdef CONFIG_NO_HZ
-       zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
-       alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
-       atomic_set(&nohz.load_balancer, nr_cpu_ids);
-       atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
-       atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
-#endif
-       /* May be allocated at isolcpus cmdline parse time */
-       if (cpu_isolated_map == NULL)
-               zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
-#endif /* SMP */
-
-       scheduler_running = 1;
-}
-
-#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
-static inline int preempt_count_equals(int preempt_offset)
-{
-       int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
-
-       return (nested == preempt_offset);
-}
-
-void __might_sleep(const char *file, int line, int preempt_offset)
-{
-       static unsigned long prev_jiffy;        /* ratelimiting */
-
-       rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
-       if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
-           system_state != SYSTEM_RUNNING || oops_in_progress)
-               return;
-       if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
-               return;
-       prev_jiffy = jiffies;
-
-       printk(KERN_ERR
-               "BUG: sleeping function called from invalid context at %s:%d\n",
-                       file, line);
-       printk(KERN_ERR
-               "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
-                       in_atomic(), irqs_disabled(),
-                       current->pid, current->comm);
-
-       debug_show_held_locks(current);
-       if (irqs_disabled())
-               print_irqtrace_events(current);
-       dump_stack();
-}
-EXPORT_SYMBOL(__might_sleep);
-#endif
-
-#ifdef CONFIG_MAGIC_SYSRQ
-static void normalize_task(struct rq *rq, struct task_struct *p)
-{
-       const struct sched_class *prev_class = p->sched_class;
-       int old_prio = p->prio;
-       int on_rq;
-
-       on_rq = p->on_rq;
-       if (on_rq)
-               deactivate_task(rq, p, 0);
-       __setscheduler(rq, p, SCHED_NORMAL, 0);
-       if (on_rq) {
-               activate_task(rq, p, 0);
-               resched_task(rq->curr);
-       }
-
-       check_class_changed(rq, p, prev_class, old_prio);
-}
-
-void normalize_rt_tasks(void)
-{
-       struct task_struct *g, *p;
-       unsigned long flags;
-       struct rq *rq;
-
-       read_lock_irqsave(&tasklist_lock, flags);
-       do_each_thread(g, p) {
-               /*
-                * Only normalize user tasks:
-                */
-               if (!p->mm)
-                       continue;
-
-               p->se.exec_start                = 0;
-#ifdef CONFIG_SCHEDSTATS
-               p->se.statistics.wait_start     = 0;
-               p->se.statistics.sleep_start    = 0;
-               p->se.statistics.block_start    = 0;
-#endif
-
-               if (!rt_task(p)) {
-                       /*
-                        * Renice negative nice level userspace
-                        * tasks back to 0:
-                        */
-                       if (TASK_NICE(p) < 0 && p->mm)
-                               set_user_nice(p, 0);
-                       continue;
-               }
-
-               raw_spin_lock(&p->pi_lock);
-               rq = __task_rq_lock(p);
-
-               normalize_task(rq, p);
-
-               __task_rq_unlock(rq);
-               raw_spin_unlock(&p->pi_lock);
-       } while_each_thread(g, p);
-
-       read_unlock_irqrestore(&tasklist_lock, flags);
-}
-
-#endif /* CONFIG_MAGIC_SYSRQ */
-
-#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
-/*
- * These functions are only useful for the IA64 MCA handling, or kdb.
- *
- * They can only be called when the whole system has been
- * stopped - every CPU needs to be quiescent, and no scheduling
- * activity can take place. Using them for anything else would
- * be a serious bug, and as a result, they aren't even visible
- * under any other configuration.
- */
-
-/**
- * curr_task - return the current task for a given cpu.
- * @cpu: the processor in question.
- *
- * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
- */
-struct task_struct *curr_task(int cpu)
-{
-       return cpu_curr(cpu);
-}
-
-#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
-
-#ifdef CONFIG_IA64
-/**
- * set_curr_task - set the current task for a given cpu.
- * @cpu: the processor in question.
- * @p: the task pointer to set.
- *
- * Description: This function must only be used when non-maskable interrupts
- * are serviced on a separate stack. It allows the architecture to switch the
- * notion of the current task on a cpu in a non-blocking manner. This function
- * must be called with all CPUs synchronized and interrupts disabled; the
- * caller must save the original value of the current task (see curr_task()
- * above) and restore that value before re-enabling interrupts and restarting
- * the system.
- *
- * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
- */
-void set_curr_task(int cpu, struct task_struct *p)
-{
-       cpu_curr(cpu) = p;
-}
-
-#endif
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static void free_fair_sched_group(struct task_group *tg)
-{
-       int i;
-
-       destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
-
-       for_each_possible_cpu(i) {
-               if (tg->cfs_rq)
-                       kfree(tg->cfs_rq[i]);
-               if (tg->se)
-                       kfree(tg->se[i]);
-       }
-
-       kfree(tg->cfs_rq);
-       kfree(tg->se);
-}
-
-static
-int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
-{
-       struct cfs_rq *cfs_rq;
-       struct sched_entity *se;
-       int i;
-
-       tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
-       if (!tg->cfs_rq)
-               goto err;
-       tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
-       if (!tg->se)
-               goto err;
-
-       tg->shares = NICE_0_LOAD;
-
-       init_cfs_bandwidth(tg_cfs_bandwidth(tg));
-
-       for_each_possible_cpu(i) {
-               cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
-                                     GFP_KERNEL, cpu_to_node(i));
-               if (!cfs_rq)
-                       goto err;
-
-               se = kzalloc_node(sizeof(struct sched_entity),
-                                 GFP_KERNEL, cpu_to_node(i));
-               if (!se)
-                       goto err_free_rq;
-
-               init_cfs_rq(cfs_rq);
-               init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
-       }
-
-       return 1;
-
-err_free_rq:
-       kfree(cfs_rq);
-err:
-       return 0;
-}
-
-static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
-{
-       struct rq *rq = cpu_rq(cpu);
-       unsigned long flags;
-
-       /*
-        * Only empty task groups can be destroyed, so we can speculatively
-        * check on_list without danger of it being re-added.
-        */
-       if (!tg->cfs_rq[cpu]->on_list)
-               return;
-
-       raw_spin_lock_irqsave(&rq->lock, flags);
-       list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
-       raw_spin_unlock_irqrestore(&rq->lock, flags);
-}
-#else /* !CONFIG_FAIR_GROUP_SCHED */
-static inline void free_fair_sched_group(struct task_group *tg)
-{
-}
-
-static inline
-int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
-{
-       return 1;
-}
-
-static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
-{
-}
-#endif /* CONFIG_FAIR_GROUP_SCHED */
-
-#ifdef CONFIG_RT_GROUP_SCHED
-static void free_rt_sched_group(struct task_group *tg)
-{
-       int i;
-
-       if (tg->rt_se)
-               destroy_rt_bandwidth(&tg->rt_bandwidth);
-
-       for_each_possible_cpu(i) {
-               if (tg->rt_rq)
-                       kfree(tg->rt_rq[i]);
-               if (tg->rt_se)
-                       kfree(tg->rt_se[i]);
-       }
-
-       kfree(tg->rt_rq);
-       kfree(tg->rt_se);
-}
-
-static
-int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
-{
-       struct rt_rq *rt_rq;
-       struct sched_rt_entity *rt_se;
-       int i;
-
-       tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
-       if (!tg->rt_rq)
-               goto err;
-       tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
-       if (!tg->rt_se)
-               goto err;
-
-       init_rt_bandwidth(&tg->rt_bandwidth,
-                       ktime_to_ns(def_rt_bandwidth.rt_period), 0);
-
-       for_each_possible_cpu(i) {
-               rt_rq = kzalloc_node(sizeof(struct rt_rq),
-                                    GFP_KERNEL, cpu_to_node(i));
-               if (!rt_rq)
-                       goto err;
-
-               rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
-                                    GFP_KERNEL, cpu_to_node(i));
-               if (!rt_se)
-                       goto err_free_rq;
-
-               init_rt_rq(rt_rq, cpu_rq(i));
-               rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
-               init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
-       }
-
-       return 1;
-
-err_free_rq:
-       kfree(rt_rq);
-err:
-       return 0;
-}
-#else /* !CONFIG_RT_GROUP_SCHED */
-static inline void free_rt_sched_group(struct task_group *tg)
-{
-}
-
-static inline
-int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
-{
-       return 1;
-}
-#endif /* CONFIG_RT_GROUP_SCHED */
-
-#ifdef CONFIG_CGROUP_SCHED
-static void free_sched_group(struct task_group *tg)
-{
-       free_fair_sched_group(tg);
-       free_rt_sched_group(tg);
-       autogroup_free(tg);
-       kfree(tg);
-}
-
-/* allocate runqueue etc for a new task group */
-struct task_group *sched_create_group(struct task_group *parent)
-{
-       struct task_group *tg;
-       unsigned long flags;
-
-       tg = kzalloc(sizeof(*tg), GFP_KERNEL);
-       if (!tg)
-               return ERR_PTR(-ENOMEM);
-
-       if (!alloc_fair_sched_group(tg, parent))
-               goto err;
-
-       if (!alloc_rt_sched_group(tg, parent))
-               goto err;
-
-       spin_lock_irqsave(&task_group_lock, flags);
-       list_add_rcu(&tg->list, &task_groups);
-
-       WARN_ON(!parent); /* root should already exist */
-
-       tg->parent = parent;
-       INIT_LIST_HEAD(&tg->children);
-       list_add_rcu(&tg->siblings, &parent->children);
-       spin_unlock_irqrestore(&task_group_lock, flags);
-
-       return tg;
-
-err:
-       free_sched_group(tg);
-       return ERR_PTR(-ENOMEM);
-}
-
-/* rcu callback to free various structures associated with a task group */
-static void free_sched_group_rcu(struct rcu_head *rhp)
-{
-       /* now it should be safe to free those cfs_rqs */
-       free_sched_group(container_of(rhp, struct task_group, rcu));
-}
-
-/* Destroy runqueue etc associated with a task group */
-void sched_destroy_group(struct task_group *tg)
-{
-       unsigned long flags;
-       int i;
-
-       /* end participation in shares distribution */
-       for_each_possible_cpu(i)
-               unregister_fair_sched_group(tg, i);
-
-       spin_lock_irqsave(&task_group_lock, flags);
-       list_del_rcu(&tg->list);
-       list_del_rcu(&tg->siblings);
-       spin_unlock_irqrestore(&task_group_lock, flags);
-
-       /* wait for possible concurrent references to cfs_rqs to complete */
-       call_rcu(&tg->rcu, free_sched_group_rcu);
-}
-
-/* change task's runqueue when it moves between groups.
- *     The caller of this function should have put the task in its new group
- *     by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
- *     reflect its new group.
- */
-void sched_move_task(struct task_struct *tsk)
-{
-       int on_rq, running;
-       unsigned long flags;
-       struct rq *rq;
-
-       rq = task_rq_lock(tsk, &flags);
-
-       running = task_current(rq, tsk);
-       on_rq = tsk->on_rq;
-
-       if (on_rq)
-               dequeue_task(rq, tsk, 0);
-       if (unlikely(running))
-               tsk->sched_class->put_prev_task(rq, tsk);
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-       if (tsk->sched_class->task_move_group)
-               tsk->sched_class->task_move_group(tsk, on_rq);
-       else
-#endif
-               set_task_rq(tsk, task_cpu(tsk));
-
-       if (unlikely(running))
-               tsk->sched_class->set_curr_task(rq);
-       if (on_rq)
-               enqueue_task(rq, tsk, 0);
-
-       task_rq_unlock(rq, tsk, &flags);
-}
-#endif /* CONFIG_CGROUP_SCHED */
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static DEFINE_MUTEX(shares_mutex);
-
-int sched_group_set_shares(struct task_group *tg, unsigned long shares)
-{
-       int i;
-       unsigned long flags;
-
-       /*
-        * We can't change the weight of the root cgroup.
-        */
-       if (!tg->se[0])
-               return -EINVAL;
-
-       shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
-
-       mutex_lock(&shares_mutex);
-       if (tg->shares == shares)
-               goto done;
-
-       tg->shares = shares;
-       for_each_possible_cpu(i) {
-               struct rq *rq = cpu_rq(i);
-               struct sched_entity *se;
-
-               se = tg->se[i];
-               /* Propagate contribution to hierarchy */
-               raw_spin_lock_irqsave(&rq->lock, flags);
-               for_each_sched_entity(se)
-                       update_cfs_shares(group_cfs_rq(se));
-               raw_spin_unlock_irqrestore(&rq->lock, flags);
-       }
-
-done:
-       mutex_unlock(&shares_mutex);
-       return 0;
-}
-
-unsigned long sched_group_shares(struct task_group *tg)
-{
-       return tg->shares;
-}
-#endif
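For reference, the clamp in sched_group_set_shares() means a write to the cgroup cpu.shares file can never install a zero or absurdly large weight. A standalone sketch of that clamp follows; the MIN_SHARES/MAX_SHARES values are assumptions taken from the scheduler headers of this era, and the kernel additionally scales them through scale_load():

#include <stdio.h>

/* Assumed bounds; in the kernel they come from the scheduler headers. */
#define MIN_SHARES (1UL << 1)
#define MAX_SHARES (1UL << 18)

static unsigned long clamp_shares(unsigned long shares)
{
        if (shares < MIN_SHARES)
                return MIN_SHARES;
        if (shares > MAX_SHARES)
                return MAX_SHARES;
        return shares;
}

int main(void)
{
        /* 0 is bumped up, the default 1024 passes through, huge values are capped. */
        printf("%lu %lu %lu\n", clamp_shares(0), clamp_shares(1024),
               clamp_shares(1UL << 30));
        return 0;
}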
-
-#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
-static unsigned long to_ratio(u64 period, u64 runtime)
-{
-       if (runtime == RUNTIME_INF)
-               return 1ULL << 20;
-
-       return div64_u64(runtime << 20, period);
-}
-#endif
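to_ratio() above turns a runtime/period pair into a Q20 fixed-point utilization, so that 1 << 20 represents a fully used CPU. A small userspace sketch of the same arithmetic (plain 64-bit division instead of div64_u64, with a hypothetical 1s period and 950ms runtime):

#include <stdint.h>
#include <stdio.h>

/* Same idea as the kernel's to_ratio(): utilization in Q20 fixed point. */
static uint64_t to_ratio(uint64_t period_ns, uint64_t runtime_ns)
{
        return (runtime_ns << 20) / period_ns;
}

int main(void)
{
        /* 950ms of runtime per 1s period -> roughly 0.95 * 2^20. */
        uint64_t r = to_ratio(1000000000ULL, 950000000ULL);

        printf("ratio = %llu of %llu\n",
               (unsigned long long)r, 1ULL << 20);
        return 0;
}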
-
-#ifdef CONFIG_RT_GROUP_SCHED
-/*
- * Ensure that the real time constraints are schedulable.
- */
-static DEFINE_MUTEX(rt_constraints_mutex);
-
-/* Must be called with tasklist_lock held */
-static inline int tg_has_rt_tasks(struct task_group *tg)
-{
-       struct task_struct *g, *p;
-
-       do_each_thread(g, p) {
-               if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
-                       return 1;
-       } while_each_thread(g, p);
-
-       return 0;
-}
-
-struct rt_schedulable_data {
-       struct task_group *tg;
-       u64 rt_period;
-       u64 rt_runtime;
-};
-
-static int tg_rt_schedulable(struct task_group *tg, void *data)
-{
-       struct rt_schedulable_data *d = data;
-       struct task_group *child;
-       unsigned long total, sum = 0;
-       u64 period, runtime;
-
-       period = ktime_to_ns(tg->rt_bandwidth.rt_period);
-       runtime = tg->rt_bandwidth.rt_runtime;
-
-       if (tg == d->tg) {
-               period = d->rt_period;
-               runtime = d->rt_runtime;
-       }
-
-       /*
-        * Cannot have more runtime than the period.
-        */
-       if (runtime > period && runtime != RUNTIME_INF)
-               return -EINVAL;
-
-       /*
-        * Ensure we don't starve existing RT tasks.
-        */
-       if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
-               return -EBUSY;
-
-       total = to_ratio(period, runtime);
-
-       /*
-        * Nobody can have more than the global setting allows.
-        */
-       if (total > to_ratio(global_rt_period(), global_rt_runtime()))
-               return -EINVAL;
-
-       /*
-        * The sum of our children's runtime should not exceed our own.
-        */
-       list_for_each_entry_rcu(child, &tg->children, siblings) {
-               period = ktime_to_ns(child->rt_bandwidth.rt_period);
-               runtime = child->rt_bandwidth.rt_runtime;
-
-               if (child == d->tg) {
-                       period = d->rt_period;
-                       runtime = d->rt_runtime;
-               }
-
-               sum += to_ratio(period, runtime);
-       }
-
-       if (sum > total)
-               return -EINVAL;
-
-       return 0;
-}
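Numerically, tg_rt_schedulable() rejects a configuration when the children's combined utilization would exceed the parent's. A hedged userspace sketch with made-up numbers (0.5 of a CPU for the parent, two children asking for 0.3 each) mirrors the sum-versus-total comparison above:

#include <stdint.h>
#include <stdio.h>

static uint64_t to_ratio(uint64_t period_us, uint64_t runtime_us)
{
        return (runtime_us << 20) / period_us;
}

int main(void)
{
        /* Hypothetical hierarchy: the parent may use 0.5 of a CPU,
         * its two children request 0.3 each. */
        uint64_t total = to_ratio(1000000, 500000);
        uint64_t sum   = to_ratio(1000000, 300000) + to_ratio(1000000, 300000);

        /* Mirrors the final "sum > total" check in tg_rt_schedulable(). */
        printf("children %llu vs parent %llu -> %s\n",
               (unsigned long long)sum, (unsigned long long)total,
               sum > total ? "rejected (-EINVAL)" : "accepted");
        return 0;
}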
-
-static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
-{
-       int ret;
-
-       struct rt_schedulable_data data = {
-               .tg = tg,
-               .rt_period = period,
-               .rt_runtime = runtime,
-       };
-
-       rcu_read_lock();
-       ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
-       rcu_read_unlock();
-
-       return ret;
-}
-
-static int tg_set_rt_bandwidth(struct task_group *tg,
-               u64 rt_period, u64 rt_runtime)
-{
-       int i, err = 0;
-
-       mutex_lock(&rt_constraints_mutex);
-       read_lock(&tasklist_lock);
-       err = __rt_schedulable(tg, rt_period, rt_runtime);
-       if (err)
-               goto unlock;
-
-       raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
-       tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
-       tg->rt_bandwidth.rt_runtime = rt_runtime;
-
-       for_each_possible_cpu(i) {
-               struct rt_rq *rt_rq = tg->rt_rq[i];
-
-               raw_spin_lock(&rt_rq->rt_runtime_lock);
-               rt_rq->rt_runtime = rt_runtime;
-               raw_spin_unlock(&rt_rq->rt_runtime_lock);
-       }
-       raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
-unlock:
-       read_unlock(&tasklist_lock);
-       mutex_unlock(&rt_constraints_mutex);
-
-       return err;
-}
-
-int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
-{
-       u64 rt_runtime, rt_period;
-
-       rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
-       rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
-       if (rt_runtime_us < 0)
-               rt_runtime = RUNTIME_INF;
-
-       return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
-}
-
-long sched_group_rt_runtime(struct task_group *tg)
-{
-       u64 rt_runtime_us;
-
-       if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
-               return -1;
-
-       rt_runtime_us = tg->rt_bandwidth.rt_runtime;
-       do_div(rt_runtime_us, NSEC_PER_USEC);
-       return rt_runtime_us;
-}
-
-int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
-{
-       u64 rt_runtime, rt_period;
-
-       rt_period = (u64)rt_period_us * NSEC_PER_USEC;
-       rt_runtime = tg->rt_bandwidth.rt_runtime;
-
-       if (rt_period == 0)
-               return -EINVAL;
-
-       return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
-}
-
-long sched_group_rt_period(struct task_group *tg)
-{
-       u64 rt_period_us;
-
-       rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
-       do_div(rt_period_us, NSEC_PER_USEC);
-       return rt_period_us;
-}
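The four accessors above convert between the microsecond units of the cgroup interface and the nanosecond units kept internally, with a negative runtime meaning "unlimited". A tiny sketch of that mapping; the exact RUNTIME_INF sentinel value is an assumption here:

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_USEC 1000ULL
#define RUNTIME_INF   (~0ULL)           /* assumed sentinel, as in the scheduler */

static uint64_t rt_runtime_from_us(long rt_runtime_us)
{
        if (rt_runtime_us < 0)
                return RUNTIME_INF;
        return (uint64_t)rt_runtime_us * NSEC_PER_USEC;
}

int main(void)
{
        /* 950000us -> 950000000ns; -1 -> unlimited. */
        printf("%llu\n", (unsigned long long)rt_runtime_from_us(950000));
        printf("%s\n", rt_runtime_from_us(-1) == RUNTIME_INF ? "RUNTIME_INF" : "finite");
        return 0;
}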
-
-static int sched_rt_global_constraints(void)
-{
-       u64 runtime, period;
-       int ret = 0;
-
-       if (sysctl_sched_rt_period <= 0)
-               return -EINVAL;
-
-       runtime = global_rt_runtime();
-       period = global_rt_period();
-
-       /*
-        * Sanity check on the sysctl variables.
-        */
-       if (runtime > period && runtime != RUNTIME_INF)
-               return -EINVAL;
-
-       mutex_lock(&rt_constraints_mutex);
-       read_lock(&tasklist_lock);
-       ret = __rt_schedulable(NULL, 0, 0);
-       read_unlock(&tasklist_lock);
-       mutex_unlock(&rt_constraints_mutex);
-
-       return ret;
-}
-
-int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
-{
-       /* Don't accept realtime tasks when there is no way for them to run */
-       if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
-               return 0;
-
-       return 1;
-}
-
-#else /* !CONFIG_RT_GROUP_SCHED */
-static int sched_rt_global_constraints(void)
-{
-       unsigned long flags;
-       int i;
-
-       if (sysctl_sched_rt_period <= 0)
-               return -EINVAL;
-
-       /*
-        * There are always some RT tasks in the root group
-        * -- migration, kstopmachine, etc.
-        */
-       if (sysctl_sched_rt_runtime == 0)
-               return -EBUSY;
-
-       raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
-       for_each_possible_cpu(i) {
-               struct rt_rq *rt_rq = &cpu_rq(i)->rt;
-
-               raw_spin_lock(&rt_rq->rt_runtime_lock);
-               rt_rq->rt_runtime = global_rt_runtime();
-               raw_spin_unlock(&rt_rq->rt_runtime_lock);
-       }
-       raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
-
-       return 0;
-}
-#endif /* CONFIG_RT_GROUP_SCHED */
-
-int sched_rt_handler(struct ctl_table *table, int write,
-               void __user *buffer, size_t *lenp,
-               loff_t *ppos)
-{
-       int ret;
-       int old_period, old_runtime;
-       static DEFINE_MUTEX(mutex);
-
-       mutex_lock(&mutex);
-       old_period = sysctl_sched_rt_period;
-       old_runtime = sysctl_sched_rt_runtime;
-
-       ret = proc_dointvec(table, write, buffer, lenp, ppos);
-
-       if (!ret && write) {
-               ret = sched_rt_global_constraints();
-               if (ret) {
-                       sysctl_sched_rt_period = old_period;
-                       sysctl_sched_rt_runtime = old_runtime;
-               } else {
-                       def_rt_bandwidth.rt_runtime = global_rt_runtime();
-                       def_rt_bandwidth.rt_period =
-                               ns_to_ktime(global_rt_period());
-               }
-       }
-       mutex_unlock(&mutex);
-
-       return ret;
-}
-
-#ifdef CONFIG_CGROUP_SCHED
-
-/* return corresponding task_group object of a cgroup */
-static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
-{
-       return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
-                           struct task_group, css);
-}
-
-static struct cgroup_subsys_state *
-cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
-{
-       struct task_group *tg, *parent;
-
-       if (!cgrp->parent) {
-               /* This is early initialization for the top cgroup */
-               return &root_task_group.css;
-       }
-
-       parent = cgroup_tg(cgrp->parent);
-       tg = sched_create_group(parent);
-       if (IS_ERR(tg))
-               return ERR_PTR(-ENOMEM);
-
-       return &tg->css;
-}
-
-static void
-cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
-{
-       struct task_group *tg = cgroup_tg(cgrp);
-
-       sched_destroy_group(tg);
-}
-
-static int
-cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
-{
-#ifdef CONFIG_RT_GROUP_SCHED
-       if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
-               return -EINVAL;
-#else
-       /* We don't support RT-tasks being in separate groups */
-       if (tsk->sched_class != &fair_sched_class)
-               return -EINVAL;
-#endif
-       return 0;
-}
-
-static void
-cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
-{
-       sched_move_task(tsk);
-}
-
-static void
-cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
-               struct cgroup *old_cgrp, struct task_struct *task)
-{
-       /*
-        * cgroup_exit() is called in the copy_process() failure path.
-        * Ignore this case since the task hasn't run yet; this avoids
-        * trying to poke a half-freed task state from generic code.
-        */
-       if (!(task->flags & PF_EXITING))
-               return;
-
-       sched_move_task(task);
-}
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
-                               u64 shareval)
-{
-       return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval));
-}
-
-static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
-{
-       struct task_group *tg = cgroup_tg(cgrp);
-
-       return (u64) scale_load_down(tg->shares);
-}
-
-#ifdef CONFIG_CFS_BANDWIDTH
-static DEFINE_MUTEX(cfs_constraints_mutex);
-
-const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
-const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
-
-static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
-
-static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
-{
-       int i, ret = 0, runtime_enabled;
-       struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
-
-       if (tg == &root_task_group)
-               return -EINVAL;
-
-       /*
-        * Ensure we have at least some amount of bandwidth every period.  This is
-        * to prevent reaching a state of large arrears when throttled via
-        * entity_tick() resulting in prolonged exit starvation.
-        */
-       if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
-               return -EINVAL;
-
-       /*
-        * Likewise, bound things on the other side by preventing insane quota
-        * periods.  This also allows us to normalize in computing quota
-        * feasibility.
-        */
-       if (period > max_cfs_quota_period)
-               return -EINVAL;
-
-       mutex_lock(&cfs_constraints_mutex);
-       ret = __cfs_schedulable(tg, period, quota);
-       if (ret)
-               goto out_unlock;
-
-       runtime_enabled = quota != RUNTIME_INF;
-       raw_spin_lock_irq(&cfs_b->lock);
-       cfs_b->period = ns_to_ktime(period);
-       cfs_b->quota = quota;
-
-       __refill_cfs_bandwidth_runtime(cfs_b);
-       /* restart the period timer (if active) to handle new period expiry */
-       if (runtime_enabled && cfs_b->timer_active) {
-               /* force a reprogram */
-               cfs_b->timer_active = 0;
-               __start_cfs_bandwidth(cfs_b);
-       }
-       raw_spin_unlock_irq(&cfs_b->lock);
-
-       for_each_possible_cpu(i) {
-               struct cfs_rq *cfs_rq = tg->cfs_rq[i];
-               struct rq *rq = rq_of(cfs_rq);
-
-               raw_spin_lock_irq(&rq->lock);
-               cfs_rq->runtime_enabled = runtime_enabled;
-               cfs_rq->runtime_remaining = 0;
-
-               if (cfs_rq_throttled(cfs_rq))
-                       unthrottle_cfs_rq(cfs_rq);
-               raw_spin_unlock_irq(&rq->lock);
-       }
-out_unlock:
-       mutex_unlock(&cfs_constraints_mutex);
-
-       return ret;
-}
-
-int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
-{
-       u64 quota, period;
-
-       period = ktime_to_ns(tg_cfs_bandwidth(tg)->period);
-       if (cfs_quota_us < 0)
-               quota = RUNTIME_INF;
-       else
-               quota = (u64)cfs_quota_us * NSEC_PER_USEC;
-
-       return tg_set_cfs_bandwidth(tg, period, quota);
-}
-
-long tg_get_cfs_quota(struct task_group *tg)
-{
-       u64 quota_us;
-
-       if (tg_cfs_bandwidth(tg)->quota == RUNTIME_INF)
-               return -1;
-
-       quota_us = tg_cfs_bandwidth(tg)->quota;
-       do_div(quota_us, NSEC_PER_USEC);
-
-       return quota_us;
-}
-
-int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
-{
-       u64 quota, period;
-
-       period = (u64)cfs_period_us * NSEC_PER_USEC;
-       quota = tg_cfs_bandwidth(tg)->quota;
-
-       if (period <= 0)
-               return -EINVAL;
-
-       return tg_set_cfs_bandwidth(tg, period, quota);
-}
-
-long tg_get_cfs_period(struct task_group *tg)
-{
-       u64 cfs_period_us;
-
-       cfs_period_us = ktime_to_ns(tg_cfs_bandwidth(tg)->period);
-       do_div(cfs_period_us, NSEC_PER_USEC);
-
-       return cfs_period_us;
-}
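Like the RT knobs, the CFS quota/period files speak microseconds while tg_set_cfs_bandwidth() validates nanoseconds against the 1ms..1s window defined above. A small sketch of the same bounds check (constants copied from the snippet above; everything else is an assumption):

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_USEC 1000ULL
#define NSEC_PER_MSEC 1000000ULL
#define NSEC_PER_SEC  1000000000ULL

static const uint64_t min_cfs_quota_period = 1 * NSEC_PER_MSEC;  /* 1ms */
static const uint64_t max_cfs_quota_period = 1 * NSEC_PER_SEC;   /* 1s  */

static int cfs_period_valid(long cfs_period_us)
{
        uint64_t period = (uint64_t)cfs_period_us * NSEC_PER_USEC;

        return period >= min_cfs_quota_period && period <= max_cfs_quota_period;
}

int main(void)
{
        printf("100000us -> %s\n", cfs_period_valid(100000) ? "ok" : "-EINVAL");
        printf("   100us -> %s\n", cfs_period_valid(100)    ? "ok" : "-EINVAL");
        return 0;
}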
-
-static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft)
-{
-       return tg_get_cfs_quota(cgroup_tg(cgrp));
-}
-
-static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype,
-                               s64 cfs_quota_us)
-{
-       return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us);
-}
-
-static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft)
-{
-       return tg_get_cfs_period(cgroup_tg(cgrp));
-}
-
-static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype,
-                               u64 cfs_period_us)
-{
-       return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
-}
-
-struct cfs_schedulable_data {
-       struct task_group *tg;
-       u64 period, quota;
-};
-
-/*
- * normalize group quota/period to be quota/max_period
- * note: units are usecs
- */
-static u64 normalize_cfs_quota(struct task_group *tg,
-                              struct cfs_schedulable_data *d)
-{
-       u64 quota, period;
-
-       if (tg == d->tg) {
-               period = d->period;
-               quota = d->quota;
-       } else {
-               period = tg_get_cfs_period(tg);
-               quota = tg_get_cfs_quota(tg);
-       }
-
-       /* note: these should typically be equivalent */
-       if (quota == RUNTIME_INF || quota == -1)
-               return RUNTIME_INF;
-
-       return to_ratio(period, quota);
-}
-
-static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
-{
-       struct cfs_schedulable_data *d = data;
-       struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
-       s64 quota = 0, parent_quota = -1;
-
-       if (!tg->parent) {
-               quota = RUNTIME_INF;
-       } else {
-               struct cfs_bandwidth *parent_b = tg_cfs_bandwidth(tg->parent);
-
-               quota = normalize_cfs_quota(tg, d);
-               parent_quota = parent_b->hierarchal_quota;
-
-               /*
-                * ensure max(child_quota) <= parent_quota, inherit when no
-                * limit is set
-                */
-               if (quota == RUNTIME_INF)
-                       quota = parent_quota;
-               else if (parent_quota != RUNTIME_INF && quota > parent_quota)
-                       return -EINVAL;
-       }
-       cfs_b->hierarchal_quota = quota;
-
-       return 0;
-}
-
-static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
-{
-       int ret;
-       struct cfs_schedulable_data data = {
-               .tg = tg,
-               .period = period,
-               .quota = quota,
-       };
-
-       if (quota != RUNTIME_INF) {
-               do_div(data.period, NSEC_PER_USEC);
-               do_div(data.quota, NSEC_PER_USEC);
-       }
-
-       rcu_read_lock();
-       ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
-       rcu_read_unlock();
-
-       return ret;
-}
-
-static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
-               struct cgroup_map_cb *cb)
-{
-       struct task_group *tg = cgroup_tg(cgrp);
-       struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
-
-       cb->fill(cb, "nr_periods", cfs_b->nr_periods);
-       cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
-       cb->fill(cb, "throttled_time", cfs_b->throttled_time);
-
-       return 0;
-}
-#endif /* CONFIG_CFS_BANDWIDTH */
-#endif /* CONFIG_FAIR_GROUP_SCHED */
-
-#ifdef CONFIG_RT_GROUP_SCHED
-static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
-                               s64 val)
-{
-       return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
-}
-
-static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
-{
-       return sched_group_rt_runtime(cgroup_tg(cgrp));
-}
-
-static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
-               u64 rt_period_us)
-{
-       return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);
-}
-
-static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
-{
-       return sched_group_rt_period(cgroup_tg(cgrp));
-}
-#endif /* CONFIG_RT_GROUP_SCHED */
-
-static struct cftype cpu_files[] = {
-#ifdef CONFIG_FAIR_GROUP_SCHED
-       {
-               .name = "shares",
-               .read_u64 = cpu_shares_read_u64,
-               .write_u64 = cpu_shares_write_u64,
-       },
-#endif
-#ifdef CONFIG_CFS_BANDWIDTH
-       {
-               .name = "cfs_quota_us",
-               .read_s64 = cpu_cfs_quota_read_s64,
-               .write_s64 = cpu_cfs_quota_write_s64,
-       },
-       {
-               .name = "cfs_period_us",
-               .read_u64 = cpu_cfs_period_read_u64,
-               .write_u64 = cpu_cfs_period_write_u64,
-       },
-       {
-               .name = "stat",
-               .read_map = cpu_stats_show,
-       },
-#endif
-#ifdef CONFIG_RT_GROUP_SCHED
-       {
-               .name = "rt_runtime_us",
-               .read_s64 = cpu_rt_runtime_read,
-               .write_s64 = cpu_rt_runtime_write,
-       },
-       {
-               .name = "rt_period_us",
-               .read_u64 = cpu_rt_period_read_uint,
-               .write_u64 = cpu_rt_period_write_uint,
-       },
-#endif
-};
-
-static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
-{
-       return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
-}
-
-struct cgroup_subsys cpu_cgroup_subsys = {
-       .name           = "cpu",
-       .create         = cpu_cgroup_create,
-       .destroy        = cpu_cgroup_destroy,
-       .can_attach_task = cpu_cgroup_can_attach_task,
-       .attach_task    = cpu_cgroup_attach_task,
-       .exit           = cpu_cgroup_exit,
-       .populate       = cpu_cgroup_populate,
-       .subsys_id      = cpu_cgroup_subsys_id,
-       .early_init     = 1,
-};
-
-#endif /* CONFIG_CGROUP_SCHED */
-
-#ifdef CONFIG_CGROUP_CPUACCT
-
-/*
- * CPU accounting code for task groups.
- *
- * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
- * (balbir@in.ibm.com).
- */
-
-/* track cpu usage of a group of tasks and its child groups */
-struct cpuacct {
-       struct cgroup_subsys_state css;
-       /* cpuusage holds a pointer to a u64-type object on every cpu */
-       u64 __percpu *cpuusage;
-       struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
-       struct cpuacct *parent;
-};
-
-struct cgroup_subsys cpuacct_subsys;
-
-/* return cpu accounting group corresponding to this container */
-static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
-{
-       return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
-                           struct cpuacct, css);
-}
-
-/* return cpu accounting group to which this task belongs */
-static inline struct cpuacct *task_ca(struct task_struct *tsk)
-{
-       return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
-                           struct cpuacct, css);
-}
-
-/* create a new cpu accounting group */
-static struct cgroup_subsys_state *cpuacct_create(
-       struct cgroup_subsys *ss, struct cgroup *cgrp)
-{
-       struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
-       int i;
-
-       if (!ca)
-               goto out;
-
-       ca->cpuusage = alloc_percpu(u64);
-       if (!ca->cpuusage)
-               goto out_free_ca;
-
-       for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
-               if (percpu_counter_init(&ca->cpustat[i], 0))
-                       goto out_free_counters;
-
-       if (cgrp->parent)
-               ca->parent = cgroup_ca(cgrp->parent);
-
-       return &ca->css;
-
-out_free_counters:
-       while (--i >= 0)
-               percpu_counter_destroy(&ca->cpustat[i]);
-       free_percpu(ca->cpuusage);
-out_free_ca:
-       kfree(ca);
-out:
-       return ERR_PTR(-ENOMEM);
-}
-
-/* destroy an existing cpu accounting group */
-static void
-cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
-{
-       struct cpuacct *ca = cgroup_ca(cgrp);
-       int i;
-
-       for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
-               percpu_counter_destroy(&ca->cpustat[i]);
-       free_percpu(ca->cpuusage);
-       kfree(ca);
-}
-
-static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
-{
-       u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
-       u64 data;
-
-#ifndef CONFIG_64BIT
-       /*
-        * Take rq->lock to make 64-bit read safe on 32-bit platforms.
-        */
-       raw_spin_lock_irq(&cpu_rq(cpu)->lock);
-       data = *cpuusage;
-       raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
-#else
-       data = *cpuusage;
-#endif
-
-       return data;
-}
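The CONFIG_64BIT split above exists because a 64-bit load is not atomic on 32-bit machines, so without rq->lock a reader could observe a torn counter. A rough userspace illustration (this deliberately simulates the two halves of the load; it is not how the kernel reads the value):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t counter = 0x00000000ffffffffULL;

        /* Simulate a 32-bit reader: the low half is loaded first... */
        uint32_t lo = (uint32_t)counter;

        /* ...a writer increments the counter in between... */
        counter += 1;                           /* now 0x100000000 */

        /* ...then the high half is loaded, yielding a value that was
         * never stored: neither 0xffffffff nor 0x100000000. */
        uint32_t hi = (uint32_t)(counter >> 32);

        printf("torn read: %#llx\n", ((unsigned long long)hi << 32) | lo);
        return 0;
}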
-
-static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
-{
-       u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
-
-#ifndef CONFIG_64BIT
-       /*
-        * Take rq->lock to make 64-bit write safe on 32-bit platforms.
-        */
-       raw_spin_lock_irq(&cpu_rq(cpu)->lock);
-       *cpuusage = val;
-       raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
-#else
-       *cpuusage = val;
-#endif
-}
-
-/* return total cpu usage (in nanoseconds) of a group */
-static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
-{
-       struct cpuacct *ca = cgroup_ca(cgrp);
-       u64 totalcpuusage = 0;
-       int i;
-
-       for_each_present_cpu(i)
-               totalcpuusage += cpuacct_cpuusage_read(ca, i);
-
-       return totalcpuusage;
-}
-
-static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
-                                                               u64 reset)
-{
-       struct cpuacct *ca = cgroup_ca(cgrp);
-       int err = 0;
-       int i;
-
-       if (reset) {
-               err = -EINVAL;
-               goto out;
-       }
-
-       for_each_present_cpu(i)
-               cpuacct_cpuusage_write(ca, i, 0);
-
-out:
-       return err;
-}
-
-static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
-                                  struct seq_file *m)
-{
-       struct cpuacct *ca = cgroup_ca(cgroup);
-       u64 percpu;
-       int i;
-
-       for_each_present_cpu(i) {
-               percpu = cpuacct_cpuusage_read(ca, i);
-               seq_printf(m, "%llu ", (unsigned long long) percpu);
-       }
-       seq_printf(m, "\n");
-       return 0;
-}
-
-static const char *cpuacct_stat_desc[] = {
-       [CPUACCT_STAT_USER] = "user",
-       [CPUACCT_STAT_SYSTEM] = "system",
-};
-
-static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
-               struct cgroup_map_cb *cb)
-{
-       struct cpuacct *ca = cgroup_ca(cgrp);
-       int i;
-
-       for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
-               s64 val = percpu_counter_read(&ca->cpustat[i]);
-               val = cputime64_to_clock_t(val);
-               cb->fill(cb, cpuacct_stat_desc[i], val);
-       }
-       return 0;
-}
-
-static struct cftype files[] = {
-       {
-               .name = "usage",
-               .read_u64 = cpuusage_read,
-               .write_u64 = cpuusage_write,
-       },
-       {
-               .name = "usage_percpu",
-               .read_seq_string = cpuacct_percpu_seq_read,
-       },
-       {
-               .name = "stat",
-               .read_map = cpuacct_stats_show,
-       },
-};
-
-static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
-{
-       return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
-}
-
-/*
- * charge this task's execution time to its accounting group.
- *
- * called with rq->lock held.
- */
-static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
-{
-       struct cpuacct *ca;
-       int cpu;
-
-       if (unlikely(!cpuacct_subsys.active))
-               return;
-
-       cpu = task_cpu(tsk);
-
-       rcu_read_lock();
-
-       ca = task_ca(tsk);
-
-       for (; ca; ca = ca->parent) {
-               u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
-               *cpuusage += cputime;
-       }
-
-       rcu_read_unlock();
-}
-
-/*
- * When CONFIG_VIRT_CPU_ACCOUNTING is enabled, one jiffy can be very large
- * in cputime_t units. As a result, cpuacct_update_stats calls
- * percpu_counter_add with values large enough to always overflow the
- * per-cpu batch limit, causing bad SMP scalability.
- *
- * To fix this, we scale percpu_counter_batch by cputime_one_jiffy so we
- * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
- * and enabled. We cap it at INT_MAX, which is the largest allowed batch value.
- */
-#ifdef CONFIG_SMP
-#define CPUACCT_BATCH  \
-       min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
-#else
-#define CPUACCT_BATCH  0
-#endif
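Plugging numbers into the comment above may help: with a default per-cpu batch of 32 and one jiffy worth roughly 10ms of cputime in fine-grained units (both values made up for illustration), the scaled batch comfortably absorbs single-jiffy additions while the INT_MAX cap keeps it a valid batch value:

#include <limits.h>
#include <stdio.h>

int main(void)
{
        /* Hypothetical values: default percpu_counter batch of 32, and a
         * jiffy expressed as 10ms in nanosecond-granularity cputime. */
        long percpu_counter_batch = 32;
        long cputime_one_jiffy = 10L * 1000 * 1000;

        long batch = percpu_counter_batch * cputime_one_jiffy;
        if (batch > INT_MAX)
                batch = INT_MAX;

        /* A single jiffy-sized add is now ~1/32 of the batch instead of
         * overflowing a threshold that was sized for tick counts. */
        printf("scaled batch = %ld\n", batch);
        return 0;
}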
-
-/*
- * Charge the system/user time to the task's accounting group.
- */
-static void cpuacct_update_stats(struct task_struct *tsk,
-               enum cpuacct_stat_index idx, cputime_t val)
-{
-       struct cpuacct *ca;
-       int batch = CPUACCT_BATCH;
-
-       if (unlikely(!cpuacct_subsys.active))
-               return;
-
-       rcu_read_lock();
-       ca = task_ca(tsk);
-
-       do {
-               __percpu_counter_add(&ca->cpustat[idx], val, batch);
-               ca = ca->parent;
-       } while (ca);
-       rcu_read_unlock();
-}
-
-struct cgroup_subsys cpuacct_subsys = {
-       .name = "cpuacct",
-       .create = cpuacct_create,
-       .destroy = cpuacct_destroy,
-       .populate = cpuacct_populate,
-       .subsys_id = cpuacct_subsys_id,
-};
-#endif /* CONFIG_CGROUP_CPUACCT */
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
new file mode 100644 (file)
index 0000000..9a7dd35
--- /dev/null
@@ -0,0 +1,20 @@
+ifdef CONFIG_FUNCTION_TRACER
+CFLAGS_REMOVE_clock.o = -pg
+endif
+
+ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
+# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
+# needed for x86 only.  Why this used to be enabled for all architectures is beyond
+# me.  I suspect most platforms don't need this, but until we know that for sure
+# I turn this off for IA-64 only.  Andreas Schwab says it's also needed on m68k
+# to get a correct value for the wait-channel (WCHAN in ps). --davidm
+CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
+endif
+
+obj-y += core.o clock.o idle_task.o fair.o rt.o stop_task.o
+obj-$(CONFIG_SMP) += cpupri.o
+obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
+obj-$(CONFIG_SCHEDSTATS) += stats.o
+obj-$(CONFIG_SCHED_DEBUG) += debug.o
+
+
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
new file mode 100644 (file)
index 0000000..e8a1f83
--- /dev/null
@@ -0,0 +1,258 @@
+#ifdef CONFIG_SCHED_AUTOGROUP
+
+#include "sched.h"
+
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/kallsyms.h>
+#include <linux/utsname.h>
+#include <linux/security.h>
+#include <linux/export.h>
+
+unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
+static struct autogroup autogroup_default;
+static atomic_t autogroup_seq_nr;
+
+void __init autogroup_init(struct task_struct *init_task)
+{
+       autogroup_default.tg = &root_task_group;
+       kref_init(&autogroup_default.kref);
+       init_rwsem(&autogroup_default.lock);
+       init_task->signal->autogroup = &autogroup_default;
+}
+
+void autogroup_free(struct task_group *tg)
+{
+       kfree(tg->autogroup);
+}
+
+static inline void autogroup_destroy(struct kref *kref)
+{
+       struct autogroup *ag = container_of(kref, struct autogroup, kref);
+
+#ifdef CONFIG_RT_GROUP_SCHED
+       /* We've redirected RT tasks to the root task group... */
+       ag->tg->rt_se = NULL;
+       ag->tg->rt_rq = NULL;
+#endif
+       sched_destroy_group(ag->tg);
+}
+
+static inline void autogroup_kref_put(struct autogroup *ag)
+{
+       kref_put(&ag->kref, autogroup_destroy);
+}
+
+static inline struct autogroup *autogroup_kref_get(struct autogroup *ag)
+{
+       kref_get(&ag->kref);
+       return ag;
+}
+
+static inline struct autogroup *autogroup_task_get(struct task_struct *p)
+{
+       struct autogroup *ag;
+       unsigned long flags;
+
+       if (!lock_task_sighand(p, &flags))
+               return autogroup_kref_get(&autogroup_default);
+
+       ag = autogroup_kref_get(p->signal->autogroup);
+       unlock_task_sighand(p, &flags);
+
+       return ag;
+}
+
+static inline struct autogroup *autogroup_create(void)
+{
+       struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL);
+       struct task_group *tg;
+
+       if (!ag)
+               goto out_fail;
+
+       tg = sched_create_group(&root_task_group);
+
+       if (IS_ERR(tg))
+               goto out_free;
+
+       kref_init(&ag->kref);
+       init_rwsem(&ag->lock);
+       ag->id = atomic_inc_return(&autogroup_seq_nr);
+       ag->tg = tg;
+#ifdef CONFIG_RT_GROUP_SCHED
+       /*
+        * Autogroup RT tasks are redirected to the root task group
+        * so we don't have to move tasks around upon policy change,
+        * or flail around trying to allocate bandwidth on the fly.
+        * A bandwidth exception in __sched_setscheduler() allows
+        * the policy change to proceed.  Thereafter, task_group()
+        * returns &root_task_group, so zero bandwidth is required.
+        */
+       free_rt_sched_group(tg);
+       tg->rt_se = root_task_group.rt_se;
+       tg->rt_rq = root_task_group.rt_rq;
+#endif
+       tg->autogroup = ag;
+
+       return ag;
+
+out_free:
+       kfree(ag);
+out_fail:
+       if (printk_ratelimit()) {
+               printk(KERN_WARNING "autogroup_create: %s failure.\n",
+                       ag ? "sched_create_group()" : "kmalloc()");
+       }
+
+       return autogroup_kref_get(&autogroup_default);
+}
+
+bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
+{
+       if (tg != &root_task_group)
+               return false;
+
+       if (p->sched_class != &fair_sched_class)
+               return false;
+
+       /*
+        * We can only assume the task group can't go away on us if
+        * autogroup_move_group() can see us on ->thread_group list.
+        */
+       if (p->flags & PF_EXITING)
+               return false;
+
+       return true;
+}
+
+static void
+autogroup_move_group(struct task_struct *p, struct autogroup *ag)
+{
+       struct autogroup *prev;
+       struct task_struct *t;
+       unsigned long flags;
+
+       BUG_ON(!lock_task_sighand(p, &flags));
+
+       prev = p->signal->autogroup;
+       if (prev == ag) {
+               unlock_task_sighand(p, &flags);
+               return;
+       }
+
+       p->signal->autogroup = autogroup_kref_get(ag);
+
+       if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled))
+               goto out;
+
+       t = p;
+       do {
+               sched_move_task(t);
+       } while_each_thread(p, t);
+
+out:
+       unlock_task_sighand(p, &flags);
+       autogroup_kref_put(prev);
+}
+
+/* Allocates GFP_KERNEL, cannot be called under any spinlock */
+void sched_autogroup_create_attach(struct task_struct *p)
+{
+       struct autogroup *ag = autogroup_create();
+
+       autogroup_move_group(p, ag);
+       /* drop extra reference added by autogroup_create() */
+       autogroup_kref_put(ag);
+}
+EXPORT_SYMBOL(sched_autogroup_create_attach);
+
+/* Cannot be called under siglock.  Currently has no users */
+void sched_autogroup_detach(struct task_struct *p)
+{
+       autogroup_move_group(p, &autogroup_default);
+}
+EXPORT_SYMBOL(sched_autogroup_detach);
+
+void sched_autogroup_fork(struct signal_struct *sig)
+{
+       sig->autogroup = autogroup_task_get(current);
+}
+
+void sched_autogroup_exit(struct signal_struct *sig)
+{
+       autogroup_kref_put(sig->autogroup);
+}
+
+static int __init setup_autogroup(char *str)
+{
+       sysctl_sched_autogroup_enabled = 0;
+
+       return 1;
+}
+
+__setup("noautogroup", setup_autogroup);
+
+#ifdef CONFIG_PROC_FS
+
+int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice)
+{
+       static unsigned long next = INITIAL_JIFFIES;
+       struct autogroup *ag;
+       int err;
+
+       if (*nice < -20 || *nice > 19)
+               return -EINVAL;
+
+       err = security_task_setnice(current, *nice);
+       if (err)
+               return err;
+
+       if (*nice < 0 && !can_nice(current, *nice))
+               return -EPERM;
+
+       /* this is a heavy operation taking global locks.. */
+       if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next))
+               return -EAGAIN;
+
+       next = HZ / 10 + jiffies;
+       ag = autogroup_task_get(p);
+
+       down_write(&ag->lock);
+       err = sched_group_set_shares(ag->tg, prio_to_weight[*nice + 20]);
+       if (!err)
+               ag->nice = *nice;
+       up_write(&ag->lock);
+
+       autogroup_kref_put(ag);
+
+       return err;
+}
+
+void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
+{
+       struct autogroup *ag = autogroup_task_get(p);
+
+       if (!task_group_is_autogroup(ag->tg))
+               goto out;
+
+       down_read(&ag->lock);
+       seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice);
+       up_read(&ag->lock);
+
+out:
+       autogroup_kref_put(ag);
+}
+#endif /* CONFIG_PROC_FS */
+
+#ifdef CONFIG_SCHED_DEBUG
+int autogroup_path(struct task_group *tg, char *buf, int buflen)
+{
+       if (!task_group_is_autogroup(tg))
+               return 0;
+
+       return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
+}
+#endif /* CONFIG_SCHED_DEBUG */
+
+#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched/auto_group.h b/kernel/sched/auto_group.h
new file mode 100644 (file)
index 0000000..8bd0471
--- /dev/null
@@ -0,0 +1,64 @@
+#ifdef CONFIG_SCHED_AUTOGROUP
+
+#include <linux/kref.h>
+#include <linux/rwsem.h>
+
+struct autogroup {
+       /*
+        * The refcount doesn't track how many threads are currently
+        * attached to this autogroup. It only counts the number of
+        * tasks that could use this autogroup.
+        */
+       struct kref             kref;
+       struct task_group       *tg;
+       struct rw_semaphore     lock;
+       unsigned long           id;
+       int                     nice;
+};
+
+extern void autogroup_init(struct task_struct *init_task);
+extern void autogroup_free(struct task_group *tg);
+
+static inline bool task_group_is_autogroup(struct task_group *tg)
+{
+       return !!tg->autogroup;
+}
+
+extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg);
+
+static inline struct task_group *
+autogroup_task_group(struct task_struct *p, struct task_group *tg)
+{
+       int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
+
+       if (enabled && task_wants_autogroup(p, tg))
+               return p->signal->autogroup->tg;
+
+       return tg;
+}
+
+extern int autogroup_path(struct task_group *tg, char *buf, int buflen);
+
+#else /* !CONFIG_SCHED_AUTOGROUP */
+
+static inline void autogroup_init(struct task_struct *init_task) {  }
+static inline void autogroup_free(struct task_group *tg) { }
+static inline bool task_group_is_autogroup(struct task_group *tg)
+{
+       return 0;
+}
+
+static inline struct task_group *
+autogroup_task_group(struct task_struct *p, struct task_group *tg)
+{
+       return tg;
+}
+
+#ifdef CONFIG_SCHED_DEBUG
+static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
+{
+       return 0;
+}
+#endif
+
+#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
new file mode 100644 (file)
index 0000000..c685e31
--- /dev/null
@@ -0,0 +1,350 @@
+/*
+ * sched_clock for unstable cpu clocks
+ *
+ *  Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *
+ *  Updates and enhancements:
+ *    Copyright (C) 2008 Red Hat, Inc. Steven Rostedt <srostedt@redhat.com>
+ *
+ * Based on code by:
+ *   Ingo Molnar <mingo@redhat.com>
+ *   Guillaume Chazarain <guichaz@gmail.com>
+ *
+ *
+ * What:
+ *
+ * cpu_clock(i) provides a fast (execution time) high resolution
+ * clock with bounded drift between CPUs. The value of cpu_clock(i)
+ * is monotonic for constant i. The timestamp returned is in nanoseconds.
+ *
+ * ######################### BIG FAT WARNING ##########################
+ * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
+ * # go backwards !!                                                  #
+ * ####################################################################
+ *
+ * There is no strict promise about the base, although it tends to start
+ * at 0 on boot (but people really shouldn't rely on that).
+ *
+ * cpu_clock(i)       -- can be used from any context, including NMI.
+ * sched_clock_cpu(i) -- must be used with local IRQs disabled (implied by NMI)
+ * local_clock()      -- is cpu_clock() on the current cpu.
+ *
+ * How:
+ *
+ * The implementation either uses sched_clock() when
+ * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the
+ * sched_clock() is assumed to provide these properties (mostly it means
+ * the architecture provides a globally synchronized highres time source).
+ *
+ * Otherwise it tries to create a semi stable clock from a mixture of other
+ * clocks, including:
+ *
+ *  - GTOD (clock monotonic)
+ *  - sched_clock()
+ *  - explicit idle events
+ *
+ * We use GTOD as base and use sched_clock() deltas to improve resolution. The
+ * deltas are filtered to provide monotonicity and to keep the result within
+ * an expected window.
+ *
+ * Furthermore, explicit sleep and wakeup hooks allow us to account for time
+ * that is otherwise invisible (TSC gets stopped).
+ *
+ *
+ * Notes:
+ *
+ * The !IRQ-safety of sched_clock() and sched_clock_cpu() comes from things
+ * like cpufreq interrupts that can change the base clock (TSC) multiplier
+ * and cause funny jumps in time -- although the filtering provided by
+ * sched_clock_cpu() should mitigate serious artifacts, we cannot rely on it
+ * in general, since for !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK we rely entirely
+ * on sched_clock().
+ */
+#include <linux/spinlock.h>
+#include <linux/hardirq.h>
+#include <linux/export.h>
+#include <linux/percpu.h>
+#include <linux/ktime.h>
+#include <linux/sched.h>
+
+/*
+ * Scheduler clock - returns current time in nanosec units.
+ * This is the default implementation.
+ * Architectures and sub-architectures can override this.
+ */
+unsigned long long __attribute__((weak)) sched_clock(void)
+{
+       return (unsigned long long)(jiffies - INITIAL_JIFFIES)
+                                       * (NSEC_PER_SEC / HZ);
+}
+EXPORT_SYMBOL_GPL(sched_clock);
+
+__read_mostly int sched_clock_running;
+
+#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+__read_mostly int sched_clock_stable;
+
+struct sched_clock_data {
+       u64                     tick_raw;
+       u64                     tick_gtod;
+       u64                     clock;
+};
+
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
+
+static inline struct sched_clock_data *this_scd(void)
+{
+       return &__get_cpu_var(sched_clock_data);
+}
+
+static inline struct sched_clock_data *cpu_sdc(int cpu)
+{
+       return &per_cpu(sched_clock_data, cpu);
+}
+
+void sched_clock_init(void)
+{
+       u64 ktime_now = ktime_to_ns(ktime_get());
+       int cpu;
+
+       for_each_possible_cpu(cpu) {
+               struct sched_clock_data *scd = cpu_sdc(cpu);
+
+               scd->tick_raw = 0;
+               scd->tick_gtod = ktime_now;
+               scd->clock = ktime_now;
+       }
+
+       sched_clock_running = 1;
+}
+
+/*
+ * min() and max(), except they take wrapping into account
+ */
+
+static inline u64 wrap_min(u64 x, u64 y)
+{
+       return (s64)(x - y) < 0 ? x : y;
+}
+
+static inline u64 wrap_max(u64 x, u64 y)
+{
+       return (s64)(x - y) > 0 ? x : y;
+}
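The signed-difference trick used by these helpers keeps ordering sane even if the u64 clocks ever wrap. A self-contained userspace check of that property (uint64_t arithmetic only, not kernel code):

#include <stdint.h>
#include <stdio.h>

/* Same comparison as the kernel helper above. */
static uint64_t wrap_max(uint64_t x, uint64_t y)
{
        return (int64_t)(x - y) > 0 ? x : y;
}

int main(void)
{
        uint64_t near_top = UINT64_MAX - 5;     /* just before the wrap */
        uint64_t wrapped  = 10;                 /* just after the wrap  */

        /* A plain "wrapped > near_top" is false, but the signed delta
         * still identifies the wrapped value as the newer one. */
        printf("wrap_max picks: %s\n",
               wrap_max(wrapped, near_top) == wrapped ? "wrapped (newer)"
                                                      : "near_top");
        return 0;
}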
+
+/*
+ * update the percpu scd from the raw @now value
+ *
+ *  - filter out backward motion
+ *  - use the GTOD tick value to create a window to filter crazy TSC values
+ */
+static u64 sched_clock_local(struct sched_clock_data *scd)
+{
+       u64 now, clock, old_clock, min_clock, max_clock;
+       s64 delta;
+
+again:
+       now = sched_clock();
+       delta = now - scd->tick_raw;
+       if (unlikely(delta < 0))
+               delta = 0;
+
+       old_clock = scd->clock;
+
+       /*
+        * scd->clock = clamp(scd->tick_gtod + delta,
+        *                    max(scd->tick_gtod, scd->clock),
+        *                    scd->tick_gtod + TICK_NSEC);
+        */
+
+       clock = scd->tick_gtod + delta;
+       min_clock = wrap_max(scd->tick_gtod, old_clock);
+       max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC);
+
+       clock = wrap_max(clock, min_clock);
+       clock = wrap_min(clock, max_clock);
+
+       if (cmpxchg64(&scd->clock, old_clock, clock) != old_clock)
+               goto again;
+
+       return clock;
+}
+
+static u64 sched_clock_remote(struct sched_clock_data *scd)
+{
+       struct sched_clock_data *my_scd = this_scd();
+       u64 this_clock, remote_clock;
+       u64 *ptr, old_val, val;
+
+       sched_clock_local(my_scd);
+again:
+       this_clock = my_scd->clock;
+       remote_clock = scd->clock;
+
+       /*
+        * Use the opportunity that we have both locks
+        * taken to couple the two clocks: we take the
+        * larger time as the latest time for both
+        * runqueues. (this creates monotonic movement)
+        */
+       if (likely((s64)(remote_clock - this_clock) < 0)) {
+               ptr = &scd->clock;
+               old_val = remote_clock;
+               val = this_clock;
+       } else {
+               /*
+                * Should be rare, but possible:
+                */
+               ptr = &my_scd->clock;
+               old_val = this_clock;
+               val = remote_clock;
+       }
+
+       if (cmpxchg64(ptr, old_val, val) != old_val)
+               goto again;
+
+       return val;
+}
+
+/*
+ * Similar to cpu_clock(), but requires local IRQs to be disabled.
+ *
+ * See cpu_clock().
+ */
+u64 sched_clock_cpu(int cpu)
+{
+       struct sched_clock_data *scd;
+       u64 clock;
+
+       WARN_ON_ONCE(!irqs_disabled());
+
+       if (sched_clock_stable)
+               return sched_clock();
+
+       if (unlikely(!sched_clock_running))
+               return 0ull;
+
+       scd = cpu_sdc(cpu);
+
+       if (cpu != smp_processor_id())
+               clock = sched_clock_remote(scd);
+       else
+               clock = sched_clock_local(scd);
+
+       return clock;
+}
+
+void sched_clock_tick(void)
+{
+       struct sched_clock_data *scd;
+       u64 now, now_gtod;
+
+       if (sched_clock_stable)
+               return;
+
+       if (unlikely(!sched_clock_running))
+               return;
+
+       WARN_ON_ONCE(!irqs_disabled());
+
+       scd = this_scd();
+       now_gtod = ktime_to_ns(ktime_get());
+       now = sched_clock();
+
+       scd->tick_raw = now;
+       scd->tick_gtod = now_gtod;
+       sched_clock_local(scd);
+}
+
+/*
+ * We are going deep-idle (irqs are disabled):
+ */
+void sched_clock_idle_sleep_event(void)
+{
+       sched_clock_cpu(smp_processor_id());
+}
+EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
+
+/*
+ * We just idled delta nanoseconds (called with irqs disabled):
+ */
+void sched_clock_idle_wakeup_event(u64 delta_ns)
+{
+       if (timekeeping_suspended)
+               return;
+
+       sched_clock_tick();
+       touch_softlockup_watchdog();
+}
+EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
+
+/*
+ * As outlined at the top, provides a fast, high resolution, nanosecond
+ * time source that is monotonic per cpu argument and has bounded drift
+ * between cpus.
+ *
+ * ######################### BIG FAT WARNING ##########################
+ * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
+ * # go backwards !!                                                  #
+ * ####################################################################
+ */
+u64 cpu_clock(int cpu)
+{
+       u64 clock;
+       unsigned long flags;
+
+       local_irq_save(flags);
+       clock = sched_clock_cpu(cpu);
+       local_irq_restore(flags);
+
+       return clock;
+}
+
+/*
+ * Similar to cpu_clock() for the current cpu. Time will only be observed
+ * to be monotonic if care is taken to only compare timestamps taken on the
+ * same CPU.
+ *
+ * See cpu_clock().
+ */
+u64 local_clock(void)
+{
+       u64 clock;
+       unsigned long flags;
+
+       local_irq_save(flags);
+       clock = sched_clock_cpu(smp_processor_id());
+       local_irq_restore(flags);
+
+       return clock;
+}
+
+#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
+
+void sched_clock_init(void)
+{
+       sched_clock_running = 1;
+}
+
+u64 sched_clock_cpu(int cpu)
+{
+       if (unlikely(!sched_clock_running))
+               return 0;
+
+       return sched_clock();
+}
+
+u64 cpu_clock(int cpu)
+{
+       return sched_clock_cpu(cpu);
+}
+
+u64 local_clock(void)
+{
+       return sched_clock_cpu(0);
+}
+
+#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
+
+EXPORT_SYMBOL_GPL(cpu_clock);
+EXPORT_SYMBOL_GPL(local_clock);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
new file mode 100644 (file)
index 0000000..4dbfd04
--- /dev/null
@@ -0,0 +1,8152 @@
+/*
+ *  kernel/sched/core.c
+ *
+ *  Kernel scheduler and related syscalls
+ *
+ *  Copyright (C) 1991-2002  Linus Torvalds
+ *
+ *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
+ *             make semaphores SMP safe
+ *  1998-11-19 Implemented schedule_timeout() and related stuff
+ *             by Andrea Arcangeli
+ *  2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
+ *             hybrid priority-list and round-robin design with
+ *             an array-switch method of distributing timeslices
+ *             and per-CPU runqueues.  Cleanups and useful suggestions
+ *             by Davide Libenzi, preemptible kernel bits by Robert Love.
+ *  2003-09-03 Interactivity tuning by Con Kolivas.
+ *  2004-04-02 Scheduler domains code by Nick Piggin
+ *  2007-04-15  Work begun on replacing all interactivity tuning with a
+ *              fair scheduling design by Con Kolivas.
+ *  2007-05-05  Load balancing (smp-nice) and other improvements
+ *              by Peter Williams
+ *  2007-05-06  Interactivity improvements to CFS by Mike Galbraith
+ *  2007-07-01  Group scheduling enhancements by Srivatsa Vaddagiri
+ *  2007-11-29  RT balancing improvements by Steven Rostedt, Gregory Haskins,
+ *              Thomas Gleixner, Mike Kravetz
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/nmi.h>
+#include <linux/init.h>
+#include <linux/uaccess.h>
+#include <linux/highmem.h>
+#include <asm/mmu_context.h>
+#include <linux/interrupt.h>
+#include <linux/capability.h>
+#include <linux/completion.h>
+#include <linux/kernel_stat.h>
+#include <linux/debug_locks.h>
+#include <linux/perf_event.h>
+#include <linux/security.h>
+#include <linux/notifier.h>
+#include <linux/profile.h>
+#include <linux/freezer.h>
+#include <linux/vmalloc.h>
+#include <linux/blkdev.h>
+#include <linux/delay.h>
+#include <linux/pid_namespace.h>
+#include <linux/smp.h>
+#include <linux/threads.h>
+#include <linux/timer.h>
+#include <linux/rcupdate.h>
+#include <linux/cpu.h>
+#include <linux/cpuset.h>
+#include <linux/percpu.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/sysctl.h>
+#include <linux/syscalls.h>
+#include <linux/times.h>
+#include <linux/tsacct_kern.h>
+#include <linux/kprobes.h>
+#include <linux/delayacct.h>
+#include <linux/unistd.h>
+#include <linux/pagemap.h>
+#include <linux/hrtimer.h>
+#include <linux/tick.h>
+#include <linux/debugfs.h>
+#include <linux/ctype.h>
+#include <linux/ftrace.h>
+#include <linux/slab.h>
+#include <linux/init_task.h>
+
+#include <asm/tlb.h>
+#include <asm/irq_regs.h>
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#endif
+
+#include "sched.h"
+#include "../workqueue_sched.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/sched.h>
+
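+/*
+ * Make sure @period_timer is queued: keep forwarding it past the current
+ * time in whole @period increments and (re)arming it until hrtimer_active()
+ * observes it pending.
+ */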
+void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
+{
+       unsigned long delta;
+       ktime_t soft, hard, now;
+
+       for (;;) {
+               if (hrtimer_active(period_timer))
+                       break;
+
+               now = hrtimer_cb_get_time(period_timer);
+               hrtimer_forward(period_timer, now, period);
+
+               soft = hrtimer_get_softexpires(period_timer);
+               hard = hrtimer_get_expires(period_timer);
+               delta = ktime_to_ns(ktime_sub(hard, soft));
+               __hrtimer_start_range_ns(period_timer, soft, delta,
+                                        HRTIMER_MODE_ABS_PINNED, 0);
+       }
+}
+
+DEFINE_MUTEX(sched_domains_mutex);
+DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+
+static void update_rq_clock_task(struct rq *rq, s64 delta);
+
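+/*
+ * Advance rq->clock by the time that has passed since the last update
+ * (unless a clock-update skip was requested) and let update_rq_clock_task()
+ * maintain rq->clock_task accordingly.
+ */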
+void update_rq_clock(struct rq *rq)
+{
+       s64 delta;
+
+       if (rq->skip_clock_update > 0)
+               return;
+
+       delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
+       rq->clock += delta;
+       update_rq_clock_task(rq, delta);
+}
+
+/*
+ * Debugging: various feature bits
+ */
+
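+/*
+ * Each SCHED_FEAT(name, enabled) entry in features.h expands to
+ * (1UL << __SCHED_FEAT_##name) * enabled, OR-ed together below; the
+ * trailing 0 terminates the expression.
+ */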
+#define SCHED_FEAT(name, enabled)      \
+       (1UL << __SCHED_FEAT_##name) * enabled |
+
+const_debug unsigned int sysctl_sched_features =
+#include "features.h"
+       0;
+
+#undef SCHED_FEAT
+
+#ifdef CONFIG_SCHED_DEBUG
+#define SCHED_FEAT(name, enabled)      \
+       #name ,
+
+static __read_mostly char *sched_feat_names[] = {
+#include "features.h"
+       NULL
+};
+
+#undef SCHED_FEAT
+
+static int sched_feat_show(struct seq_file *m, void *v)
+{
+       int i;
+
+       for (i = 0; i < __SCHED_FEAT_NR; i++) {
+               if (!(sysctl_sched_features & (1UL << i)))
+                       seq_puts(m, "NO_");
+               seq_printf(m, "%s ", sched_feat_names[i]);
+       }
+       seq_puts(m, "\n");
+
+       return 0;
+}
+
+#ifdef HAVE_JUMP_LABEL
+
+#define jump_label_key__true  jump_label_key_enabled
+#define jump_label_key__false jump_label_key_disabled
+
+#define SCHED_FEAT(name, enabled)      \
+       jump_label_key__##enabled ,
+
+struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = {
+#include "features.h"
+};
+
+#undef SCHED_FEAT
+
+static void sched_feat_disable(int i)
+{
+       if (jump_label_enabled(&sched_feat_keys[i]))
+               jump_label_dec(&sched_feat_keys[i]);
+}
+
+static void sched_feat_enable(int i)
+{
+       if (!jump_label_enabled(&sched_feat_keys[i]))
+               jump_label_inc(&sched_feat_keys[i]);
+}
+#else
+static void sched_feat_disable(int i) { };
+static void sched_feat_enable(int i) { };
+#endif /* HAVE_JUMP_LABEL */
+
+static ssize_t
+sched_feat_write(struct file *filp, const char __user *ubuf,
+               size_t cnt, loff_t *ppos)
+{
+       char buf[64];
+       char *cmp;
+       int neg = 0;
+       int i;
+
+       if (cnt > 63)
+               cnt = 63;
+
+       if (copy_from_user(&buf, ubuf, cnt))
+               return -EFAULT;
+
+       buf[cnt] = 0;
+       cmp = strstrip(buf);
+
+       if (strncmp(cmp, "NO_", 3) == 0) {
+               neg = 1;
+               cmp += 3;
+       }
+
+       for (i = 0; i < __SCHED_FEAT_NR; i++) {
+               if (strcmp(cmp, sched_feat_names[i]) == 0) {
+                       if (neg) {
+                               sysctl_sched_features &= ~(1UL << i);
+                               sched_feat_disable(i);
+                       } else {
+                               sysctl_sched_features |= (1UL << i);
+                               sched_feat_enable(i);
+                       }
+                       break;
+               }
+       }
+
+       if (i == __SCHED_FEAT_NR)
+               return -EINVAL;
+
+       *ppos += cnt;
+
+       return cnt;
+}
+
+static int sched_feat_open(struct inode *inode, struct file *filp)
+{
+       return single_open(filp, sched_feat_show, NULL);
+}
+
+static const struct file_operations sched_feat_fops = {
+       .open           = sched_feat_open,
+       .write          = sched_feat_write,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = single_release,
+};
+
+static __init int sched_init_debug(void)
+{
+       debugfs_create_file("sched_features", 0644, NULL, NULL,
+                       &sched_feat_fops);
+
+       return 0;
+}
+late_initcall(sched_init_debug);
+#endif /* CONFIG_SCHED_DEBUG */
+
+/*
+ * Number of tasks to iterate in a single balance run.
+ * Limited because this is done with IRQs disabled.
+ */
+const_debug unsigned int sysctl_sched_nr_migrate = 32;
+
+/*
+ * period over which we average the RT time consumption, measured
+ * in ms.
+ *
+ * default: 1s
+ */
+const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
+
+/*
+ * period over which we measure -rt task cpu usage in us.
+ * default: 1s
+ */
+unsigned int sysctl_sched_rt_period = 1000000;
+
+__read_mostly int scheduler_running;
+
+/*
+ * part of the period that we allow rt tasks to run in us.
+ * default: 0.95s
+ */
+int sysctl_sched_rt_runtime = 950000;
+
+
+
+/*
+ * __task_rq_lock - lock the rq @p resides on.
+ */
+static inline struct rq *__task_rq_lock(struct task_struct *p)
+       __acquires(rq->lock)
+{
+       struct rq *rq;
+
+       lockdep_assert_held(&p->pi_lock);
+
+       for (;;) {
+               rq = task_rq(p);
+               raw_spin_lock(&rq->lock);
+               if (likely(rq == task_rq(p)))
+                       return rq;
+               raw_spin_unlock(&rq->lock);
+       }
+}
+
+/*
+ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
+ */
+static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
+       __acquires(p->pi_lock)
+       __acquires(rq->lock)
+{
+       struct rq *rq;
+
+       for (;;) {
+               raw_spin_lock_irqsave(&p->pi_lock, *flags);
+               rq = task_rq(p);
+               raw_spin_lock(&rq->lock);
+               if (likely(rq == task_rq(p)))
+                       return rq;
+               raw_spin_unlock(&rq->lock);
+               raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
+       }
+}
+
+static void __task_rq_unlock(struct rq *rq)
+       __releases(rq->lock)
+{
+       raw_spin_unlock(&rq->lock);
+}
+
+static inline void
+task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
+       __releases(rq->lock)
+       __releases(p->pi_lock)
+{
+       raw_spin_unlock(&rq->lock);
+       raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
+}
+
+/*
+ * this_rq_lock - lock this runqueue and disable interrupts.
+ */
+static struct rq *this_rq_lock(void)
+       __acquires(rq->lock)
+{
+       struct rq *rq;
+
+       local_irq_disable();
+       rq = this_rq();
+       raw_spin_lock(&rq->lock);
+
+       return rq;
+}
+
+#ifdef CONFIG_SCHED_HRTICK
+/*
+ * Use HR-timers to deliver accurate preemption points.
+ *
+ * It's all a bit involved since we cannot program an hrtimer while holding
+ * the rq->lock. So what we do is store a state in rq->hrtick_* and ask for a
+ * reschedule event.
+ *
+ * When we get rescheduled we reprogram the hrtick_timer outside of the
+ * rq->lock.
+ */
+
+static void hrtick_clear(struct rq *rq)
+{
+       if (hrtimer_active(&rq->hrtick_timer))
+               hrtimer_cancel(&rq->hrtick_timer);
+}
+
+/*
+ * High-resolution timer tick.
+ * Runs from hardirq context with interrupts disabled.
+ */
+static enum hrtimer_restart hrtick(struct hrtimer *timer)
+{
+       struct rq *rq = container_of(timer, struct rq, hrtick_timer);
+
+       WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
+
+       raw_spin_lock(&rq->lock);
+       update_rq_clock(rq);
+       rq->curr->sched_class->task_tick(rq, rq->curr, 1);
+       raw_spin_unlock(&rq->lock);
+
+       return HRTIMER_NORESTART;
+}
+
+#ifdef CONFIG_SMP
+/*
+ * called from hardirq (IPI) context
+ */
+static void __hrtick_start(void *arg)
+{
+       struct rq *rq = arg;
+
+       raw_spin_lock(&rq->lock);
+       hrtimer_restart(&rq->hrtick_timer);
+       rq->hrtick_csd_pending = 0;
+       raw_spin_unlock(&rq->lock);
+}
+
+/*
+ * Called to set the hrtick timer state.
+ *
+ * called with rq->lock held and irqs disabled
+ */
+void hrtick_start(struct rq *rq, u64 delay)
+{
+       struct hrtimer *timer = &rq->hrtick_timer;
+       ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
+
+       hrtimer_set_expires(timer, time);
+
+       if (rq == this_rq()) {
+               hrtimer_restart(timer);
+       } else if (!rq->hrtick_csd_pending) {
+               __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
+               rq->hrtick_csd_pending = 1;
+       }
+}
+
+static int
+hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+       int cpu = (int)(long)hcpu;
+
+       switch (action) {
+       case CPU_UP_CANCELED:
+       case CPU_UP_CANCELED_FROZEN:
+       case CPU_DOWN_PREPARE:
+       case CPU_DOWN_PREPARE_FROZEN:
+       case CPU_DEAD:
+       case CPU_DEAD_FROZEN:
+               hrtick_clear(cpu_rq(cpu));
+               return NOTIFY_OK;
+       }
+
+       return NOTIFY_DONE;
+}
+
+static __init void init_hrtick(void)
+{
+       hotcpu_notifier(hotplug_hrtick, 0);
+}
+#else
+/*
+ * Called to set the hrtick timer state.
+ *
+ * called with rq->lock held and irqs disabled
+ */
+void hrtick_start(struct rq *rq, u64 delay)
+{
+       __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
+                       HRTIMER_MODE_REL_PINNED, 0);
+}
+
+static inline void init_hrtick(void)
+{
+}
+#endif /* CONFIG_SMP */
+
+static void init_rq_hrtick(struct rq *rq)
+{
+#ifdef CONFIG_SMP
+       rq->hrtick_csd_pending = 0;
+
+       rq->hrtick_csd.flags = 0;
+       rq->hrtick_csd.func = __hrtick_start;
+       rq->hrtick_csd.info = rq;
+#endif
+
+       hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+       rq->hrtick_timer.function = hrtick;
+}
+#else  /* CONFIG_SCHED_HRTICK */
+static inline void hrtick_clear(struct rq *rq)
+{
+}
+
+static inline void init_rq_hrtick(struct rq *rq)
+{
+}
+
+static inline void init_hrtick(void)
+{
+}
+#endif /* CONFIG_SCHED_HRTICK */
+
+/*
+ * resched_task - mark a task 'to be rescheduled now'.
+ *
+ * On UP this means the setting of the need_resched flag, on SMP it
+ * might also involve a cross-CPU call to trigger the scheduler on
+ * the target CPU.
+ */
+#ifdef CONFIG_SMP
+
+#ifndef tsk_is_polling
+#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
+#endif
+
+void resched_task(struct task_struct *p)
+{
+       int cpu;
+
+       assert_raw_spin_locked(&task_rq(p)->lock);
+
+       if (test_tsk_need_resched(p))
+               return;
+
+       set_tsk_need_resched(p);
+
+       cpu = task_cpu(p);
+       if (cpu == smp_processor_id())
+               return;
+
+       /* NEED_RESCHED must be visible before we test polling */
+       smp_mb();
+       if (!tsk_is_polling(p))
+               smp_send_reschedule(cpu);
+}
+
+void resched_cpu(int cpu)
+{
+       struct rq *rq = cpu_rq(cpu);
+       unsigned long flags;
+
+       if (!raw_spin_trylock_irqsave(&rq->lock, flags))
+               return;
+       resched_task(cpu_curr(cpu));
+       raw_spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+#ifdef CONFIG_NO_HZ
+/*
+ * In the semi idle case, use the nearest busy cpu for migrating timers
+ * from an idle cpu.  This is good for power-savings.
+ *
+ * We don't do similar optimization for completely idle system, as
+ * selecting an idle cpu will add more delays to the timers than intended
+ * (as that cpu's timer base may not be up to date wrt jiffies etc).
+ */
+int get_nohz_timer_target(void)
+{
+       int cpu = smp_processor_id();
+       int i;
+       struct sched_domain *sd;
+
+       rcu_read_lock();
+       for_each_domain(cpu, sd) {
+               for_each_cpu(i, sched_domain_span(sd)) {
+                       if (!idle_cpu(i)) {
+                               cpu = i;
+                               goto unlock;
+                       }
+               }
+       }
+unlock:
+       rcu_read_unlock();
+       return cpu;
+}
+/*
+ * When add_timer_on() enqueues a timer into the timer wheel of an
+ * idle CPU then this timer might expire before the next timer event
+ * which is scheduled to wake up that CPU. In case of a completely
+ * idle system the next event might even be infinite time into the
+ * future. wake_up_idle_cpu() ensures that the CPU is woken up and
+ * leaves the inner idle loop so the newly added timer is taken into
+ * account when the CPU goes back to idle and evaluates the timer
+ * wheel for the next timer event.
+ */
+void wake_up_idle_cpu(int cpu)
+{
+       struct rq *rq = cpu_rq(cpu);
+
+       if (cpu == smp_processor_id())
+               return;
+
+       /*
+        * This is safe, as this function is called with the timer
+        * wheel base lock of (cpu) held. When the CPU is on the way
+        * to idle and has not yet set rq->curr to idle then it will
+        * be serialized on the timer wheel base lock and take the new
+        * timer into account automatically.
+        */
+       if (rq->curr != rq->idle)
+               return;
+
+       /*
+        * We can set TIF_RESCHED on the idle task of the other CPU
+        * lockless. The worst case is that the other CPU runs the
+        * idle task through an additional NOOP schedule()
+        */
+       set_tsk_need_resched(rq->idle);
+
+       /* NEED_RESCHED must be visible before we test polling */
+       smp_mb();
+       if (!tsk_is_polling(rq->idle))
+               smp_send_reschedule(cpu);
+}
+
+static inline bool got_nohz_idle_kick(void)
+{
+       int cpu = smp_processor_id();
+       return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
+}
+
+#else /* CONFIG_NO_HZ */
+
+static inline bool got_nohz_idle_kick(void)
+{
+       return false;
+}
+
+#endif /* CONFIG_NO_HZ */
+
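+/*
+ * Age rq->rt_avg: halve it once for every sched_avg_period() that has
+ * elapsed since rq->age_stamp.
+ */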
+void sched_avg_update(struct rq *rq)
+{
+       s64 period = sched_avg_period();
+
+       while ((s64)(rq->clock - rq->age_stamp) > period) {
+               /*
+                * Inline assembly required to prevent the compiler
+                * optimising this loop into a divmod call.
+                * See __iter_div_u64_rem() for another example of this.
+                */
+               asm("" : "+rm" (rq->age_stamp));
+               rq->age_stamp += period;
+               rq->rt_avg /= 2;
+       }
+}
+
+#else /* !CONFIG_SMP */
+void resched_task(struct task_struct *p)
+{
+       assert_raw_spin_locked(&task_rq(p)->lock);
+       set_tsk_need_resched(p);
+}
+#endif /* CONFIG_SMP */
+
+#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
+                       (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
+/*
+ * Iterate the task_group tree rooted at *from, calling @down when first
+ * entering a node and @up when leaving it for the final time.
+ *
+ * Caller must hold rcu_lock or sufficient equivalent.
+ */
+int walk_tg_tree_from(struct task_group *from,
+                            tg_visitor down, tg_visitor up, void *data)
+{
+       struct task_group *parent, *child;
+       int ret;
+
+       parent = from;
+
+down:
+       ret = (*down)(parent, data);
+       if (ret)
+               goto out;
+       list_for_each_entry_rcu(child, &parent->children, siblings) {
+               parent = child;
+               goto down;
+
+up:
+               continue;
+       }
+       ret = (*up)(parent, data);
+       if (ret || parent == from)
+               goto out;
+
+       child = parent;
+       parent = parent->parent;
+       if (parent)
+               goto up;
+out:
+       return ret;
+}
+
+int tg_nop(struct task_group *tg, void *data)
+{
+       return 0;
+}
+#endif
+
+void update_cpu_load(struct rq *this_rq);
+
+static void set_load_weight(struct task_struct *p)
+{
+       int prio = p->static_prio - MAX_RT_PRIO;
+       struct load_weight *load = &p->se.load;
+
+       /*
+        * SCHED_IDLE tasks get minimal weight:
+        */
+       if (p->policy == SCHED_IDLE) {
+               load->weight = scale_load(WEIGHT_IDLEPRIO);
+               load->inv_weight = WMULT_IDLEPRIO;
+               return;
+       }
+
+       load->weight = scale_load(prio_to_weight[prio]);
+       load->inv_weight = prio_to_wmult[prio];
+}
+
+static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
+{
+       update_rq_clock(rq);
+       sched_info_queued(p);
+       p->sched_class->enqueue_task(rq, p, flags);
+}
+
+static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
+{
+       update_rq_clock(rq);
+       sched_info_dequeued(p);
+       p->sched_class->dequeue_task(rq, p, flags);
+}
+
+/*
+ * activate_task - move a task to the runqueue.
+ */
+void activate_task(struct rq *rq, struct task_struct *p, int flags)
+{
+       if (task_contributes_to_load(p))
+               rq->nr_uninterruptible--;
+
+       enqueue_task(rq, p, flags);
+}
+
+/*
+ * deactivate_task - remove a task from the runqueue.
+ */
+void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
+{
+       if (task_contributes_to_load(p))
+               rq->nr_uninterruptible++;
+
+       dequeue_task(rq, p, flags);
+}
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+
+/*
+ * There are no locks covering percpu hardirq/softirq time.
+ * They are only modified in account_system_vtime, on the corresponding CPU
+ * with interrupts disabled. So, writes are safe.
+ * They are read and saved off onto struct rq in update_rq_clock().
+ * This may result in another CPU reading this CPU's irq time and can
+ * race with irq/account_system_vtime on this CPU. We would either get the
+ * old or the new value, with a side effect of accounting a slice of irq
+ * time to the wrong task when an irq is in progress while we read
+ * rq->clock. That is a worthy compromise in place of having locks on each
+ * irq in account_system_time.
+ */
+static DEFINE_PER_CPU(u64, cpu_hardirq_time);
+static DEFINE_PER_CPU(u64, cpu_softirq_time);
+
+static DEFINE_PER_CPU(u64, irq_start_time);
+static int sched_clock_irqtime;
+
+void enable_sched_clock_irqtime(void)
+{
+       sched_clock_irqtime = 1;
+}
+
+void disable_sched_clock_irqtime(void)
+{
+       sched_clock_irqtime = 0;
+}
+
+#ifndef CONFIG_64BIT
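+/*
+ * On 32-bit a 64-bit irq time value cannot be read atomically; guard
+ * readers with a seqcount so they retry when they race with a writer.
+ */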
+static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
+
+static inline void irq_time_write_begin(void)
+{
+       __this_cpu_inc(irq_time_seq.sequence);
+       smp_wmb();
+}
+
+static inline void irq_time_write_end(void)
+{
+       smp_wmb();
+       __this_cpu_inc(irq_time_seq.sequence);
+}
+
+static inline u64 irq_time_read(int cpu)
+{
+       u64 irq_time;
+       unsigned seq;
+
+       do {
+               seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
+               irq_time = per_cpu(cpu_softirq_time, cpu) +
+                          per_cpu(cpu_hardirq_time, cpu);
+       } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
+
+       return irq_time;
+}
+#else /* CONFIG_64BIT */
+static inline void irq_time_write_begin(void)
+{
+}
+
+static inline void irq_time_write_end(void)
+{
+}
+
+static inline u64 irq_time_read(int cpu)
+{
+       return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
+}
+#endif /* CONFIG_64BIT */
+
+/*
+ * Called before incrementing preempt_count on {soft,}irq_enter
+ * and before decrementing preempt_count on {soft,}irq_exit.
+ */
+void account_system_vtime(struct task_struct *curr)
+{
+       unsigned long flags;
+       s64 delta;
+       int cpu;
+
+       if (!sched_clock_irqtime)
+               return;
+
+       local_irq_save(flags);
+
+       cpu = smp_processor_id();
+       delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
+       __this_cpu_add(irq_start_time, delta);
+
+       irq_time_write_begin();
+       /*
+        * We do not account for softirq time from ksoftirqd here.
+        * We want to continue accounting softirq time to the ksoftirqd
+        * thread in that case, so as not to confuse the scheduler with a
+        * special task that does not consume any time but still wants to run.
+        */
+       if (hardirq_count())
+               __this_cpu_add(cpu_hardirq_time, delta);
+       else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
+               __this_cpu_add(cpu_softirq_time, delta);
+
+       irq_time_write_end();
+       local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(account_system_vtime);
+
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+#ifdef CONFIG_PARAVIRT
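+/*
+ * Convert a steal-time delta in nanoseconds to ticks; use the cheap
+ * iterative division for the common sub-second case and fall back to a
+ * full div_u64() for larger values.
+ */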
+static inline u64 steal_ticks(u64 steal)
+{
+       if (unlikely(steal > NSEC_PER_SEC))
+               return div_u64(steal, TICK_NSEC);
+
+       return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
+}
+#endif
+
+static void update_rq_clock_task(struct rq *rq, s64 delta)
+{
+/*
+ * In theory, the compiler should just see 0 here, and optimize out the call
+ * to sched_rt_avg_update. But I don't trust it...
+ */
+#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+       s64 steal = 0, irq_delta = 0;
+#endif
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+       irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
+
+       /*
+        * Since irq_time is only updated on {soft,}irq_exit, we might run into
+        * this case when a previous update_rq_clock() happened inside a
+        * {soft,}irq region.
+        *
+        * When this happens, we stop ->clock_task and only update the
+        * prev_irq_time stamp to account for the part that fit, so that a next
+        * update will consume the rest. This ensures ->clock_task is
+        * monotonic.
+        *
+        * It does however cause some slight misattribution of {soft,}irq
+        * time, a more accurate solution would be to update the irq_time using
+        * the current rq->clock timestamp, except that would require using
+        * atomic ops.
+        */
+       if (irq_delta > delta)
+               irq_delta = delta;
+
+       rq->prev_irq_time += irq_delta;
+       delta -= irq_delta;
+#endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+       if (static_branch((&paravirt_steal_rq_enabled))) {
+               u64 st;
+
+               steal = paravirt_steal_clock(cpu_of(rq));
+               steal -= rq->prev_steal_time_rq;
+
+               if (unlikely(steal > delta))
+                       steal = delta;
+
+               st = steal_ticks(steal);
+               steal = st * TICK_NSEC;
+
+               rq->prev_steal_time_rq += steal;
+
+               delta -= steal;
+       }
+#endif
+
+       rq->clock_task += delta;
+
+#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+       if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
+               sched_rt_avg_update(rq, irq_delta + steal);
+#endif
+}
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
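+/*
+ * Returns 1 when the accumulated per-cpu hardirq time exceeds what has
+ * already been accounted in cpustat[CPUTIME_IRQ];
+ * irqtime_account_si_update() below does the same for softirq time.
+ */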
+static int irqtime_account_hi_update(void)
+{
+       u64 *cpustat = kcpustat_this_cpu->cpustat;
+       unsigned long flags;
+       u64 latest_ns;
+       int ret = 0;
+
+       local_irq_save(flags);
+       latest_ns = this_cpu_read(cpu_hardirq_time);
+       if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
+               ret = 1;
+       local_irq_restore(flags);
+       return ret;
+}
+
+static int irqtime_account_si_update(void)
+{
+       u64 *cpustat = kcpustat_this_cpu->cpustat;
+       unsigned long flags;
+       u64 latest_ns;
+       int ret = 0;
+
+       local_irq_save(flags);
+       latest_ns = this_cpu_read(cpu_softirq_time);
+       if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
+               ret = 1;
+       local_irq_restore(flags);
+       return ret;
+}
+
+#else /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+#define sched_clock_irqtime    (0)
+
+#endif
+
+void sched_set_stop_task(int cpu, struct task_struct *stop)
+{
+       struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
+       struct task_struct *old_stop = cpu_rq(cpu)->stop;
+
+       if (stop) {
+               /*
+                * Make it appear like a SCHED_FIFO task; it's something
+                * userspace knows about and won't get confused about.
+                *
+                * Also, it will make PI more or less work without too
+                * much confusion -- but then, stop work should not
+                * rely on PI working anyway.
+                */
+               sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
+
+               stop->sched_class = &stop_sched_class;
+       }
+
+       cpu_rq(cpu)->stop = stop;
+
+       if (old_stop) {
+               /*
+                * Reset it back to a normal scheduling class so that
+                * it can die in pieces.
+                */
+               old_stop->sched_class = &rt_sched_class;
+       }
+}
+
+/*
+ * __normal_prio - return the priority that is based on the static prio
+ */
+static inline int __normal_prio(struct task_struct *p)
+{
+       return p->static_prio;
+}
+
+/*
+ * Calculate the expected normal priority: i.e. priority
+ * without taking RT-inheritance into account. Might be
+ * boosted by interactivity modifiers. Changes upon fork,
+ * setprio syscalls, and whenever the interactivity
+ * estimator recalculates.
+ */
+static inline int normal_prio(struct task_struct *p)
+{
+       int prio;
+
+       if (task_has_rt_policy(p))
+               prio = MAX_RT_PRIO-1 - p->rt_priority;
+       else
+               prio = __normal_prio(p);
+       return prio;
+}
+
+/*
+ * Calculate the current priority, i.e. the priority
+ * taken into account by the scheduler. This value might
+ * be boosted by RT tasks, or might be boosted by
+ * interactivity modifiers. Will be RT if the task got
+ * RT-boosted. If not then it returns p->normal_prio.
+ */
+static int effective_prio(struct task_struct *p)
+{
+       p->normal_prio = normal_prio(p);
+       /*
+        * If we are RT tasks or we were boosted to RT priority,
+        * keep the priority unchanged. Otherwise, update priority
+        * to the normal priority:
+        */
+       if (!rt_prio(p->prio))
+               return p->normal_prio;
+       return p->prio;
+}
+
+/**
+ * task_curr - is this task currently executing on a CPU?
+ * @p: the task in question.
+ */
+inline int task_curr(const struct task_struct *p)
+{
+       return cpu_curr(task_cpu(p)) == p;
+}
+
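+/*
+ * Invoke the class-switch callbacks when @p changed scheduling class, or
+ * the prio_changed callback when only its priority changed.
+ */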
+static inline void check_class_changed(struct rq *rq, struct task_struct *p,
+                                      const struct sched_class *prev_class,
+                                      int oldprio)
+{
+       if (prev_class != p->sched_class) {
+               if (prev_class->switched_from)
+                       prev_class->switched_from(rq, p);
+               p->sched_class->switched_to(rq, p);
+       } else if (oldprio != p->prio)
+               p->sched_class->prio_changed(rq, p, oldprio);
+}
+
+void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
+{
+       const struct sched_class *class;
+
+       if (p->sched_class == rq->curr->sched_class) {
+               rq->curr->sched_class->check_preempt_curr(rq, p, flags);
+       } else {
+               for_each_class(class) {
+                       if (class == rq->curr->sched_class)
+                               break;
+                       if (class == p->sched_class) {
+                               resched_task(rq->curr);
+                               break;
+                       }
+               }
+       }
+
+       /*
+        * A queue event has occurred, and we're going to schedule.  In
+        * this case, we can save a useless back to back clock update.
+        */
+       if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
+               rq->skip_clock_update = 1;
+}
+
+#ifdef CONFIG_SMP
+void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
+{
+#ifdef CONFIG_SCHED_DEBUG
+       /*
+        * We should never call set_task_cpu() on a blocked task,
+        * ttwu() will sort out the placement.
+        */
+       WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
+                       !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
+
+#ifdef CONFIG_LOCKDEP
+       /*
+        * The caller should hold either p->pi_lock or rq->lock when changing
+        * a task's CPU: ->pi_lock for waking tasks, rq->lock for runnable tasks.
+        *
+        * sched_move_task() holds both and thus holding either pins the cgroup,
+        * see set_task_rq().
+        *
+        * Furthermore, all task_rq users should acquire both locks, see
+        * task_rq_lock().
+        */
+       WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
+                                     lockdep_is_held(&task_rq(p)->lock)));
+#endif
+#endif
+
+       trace_sched_migrate_task(p, new_cpu);
+
+       if (task_cpu(p) != new_cpu) {
+               p->se.nr_migrations++;
+               perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
+       }
+
+       __set_task_cpu(p, new_cpu);
+}
+
+struct migration_arg {
+       struct task_struct *task;
+       int dest_cpu;
+};
+
+static int migration_cpu_stop(void *data);
+
+/*
+ * wait_task_inactive - wait for a thread to unschedule.
+ *
+ * If @match_state is nonzero, it's the @p->state value just checked and
+ * not expected to change.  If it changes, i.e. @p might have woken up,
+ * then return zero.  When we succeed in waiting for @p to be off its CPU,
+ * we return a positive number (its total switch count).  If a second call
+ * a short while later returns the same number, the caller can be sure that
+ * @p has remained unscheduled the whole time.
+ *
+ * The caller must ensure that the task *will* unschedule sometime soon,
+ * else this function might spin for a *long* time. This function can't
+ * be called with interrupts off, or it may introduce deadlock with
+ * smp_call_function() if an IPI is sent by the same process we are
+ * waiting to become inactive.
+ */
+unsigned long wait_task_inactive(struct task_struct *p, long match_state)
+{
+       unsigned long flags;
+       int running, on_rq;
+       unsigned long ncsw;
+       struct rq *rq;
+
+       for (;;) {
+               /*
+                * We do the initial early heuristics without holding
+                * any task-queue locks at all. We'll only try to get
+                * the runqueue lock when things look like they will
+                * work out!
+                */
+               rq = task_rq(p);
+
+               /*
+                * If the task is actively running on another CPU
+                * still, just relax and busy-wait without holding
+                * any locks.
+                *
+                * NOTE! Since we don't hold any locks, it's not
+                * even sure that "rq" stays as the right runqueue!
+                * But we don't care, since "task_running()" will
+                * return false if the runqueue has changed and p
+                * is actually now running somewhere else!
+                */
+               while (task_running(rq, p)) {
+                       if (match_state && unlikely(p->state != match_state))
+                               return 0;
+                       cpu_relax();
+               }
+
+               /*
+                * Ok, time to look more closely! We need the rq
+                * lock now, to be *sure*. If we're wrong, we'll
+                * just go back and repeat.
+                */
+               rq = task_rq_lock(p, &flags);
+               trace_sched_wait_task(p);
+               running = task_running(rq, p);
+               on_rq = p->on_rq;
+               ncsw = 0;
+               if (!match_state || p->state == match_state)
+                       ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
+               task_rq_unlock(rq, p, &flags);
+
+               /*
+                * If it changed from the expected state, bail out now.
+                */
+               if (unlikely(!ncsw))
+                       break;
+
+               /*
+                * Was it really running after all now that we
+                * checked with the proper locks actually held?
+                *
+                * Oops. Go back and try again..
+                */
+               if (unlikely(running)) {
+                       cpu_relax();
+                       continue;
+               }
+
+               /*
+                * It's not enough that it's not actively running,
+                * it must be off the runqueue _entirely_, and not
+                * preempted!
+                *
+                * So if it was still runnable (but just not actively
+                * running right now), it's preempted, and we should
+                * yield - it could be a while.
+                */
+               if (unlikely(on_rq)) {
+                       ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
+
+                       set_current_state(TASK_UNINTERRUPTIBLE);
+                       schedule_hrtimeout(&to, HRTIMER_MODE_REL);
+                       continue;
+               }
+
+               /*
+                * Ahh, all good. It wasn't running, and it wasn't
+                * runnable, which means that it will never become
+                * running in the future either. We're all done!
+                */
+               break;
+       }
+
+       return ncsw;
+}
+
+/***
+ * kick_process - kick a running thread to enter/exit the kernel
+ * @p: the to-be-kicked thread
+ *
+ * Cause a process which is running on another CPU to enter
+ * kernel-mode, without any delay. (to get signals handled.)
+ *
+ * NOTE: this function doesn't have to take the runqueue lock,
+ * because all it wants to ensure is that the remote task enters
+ * the kernel. If the IPI races and the task has been migrated
+ * to another CPU then no harm is done and the purpose has been
+ * achieved as well.
+ */
+void kick_process(struct task_struct *p)
+{
+       int cpu;
+
+       preempt_disable();
+       cpu = task_cpu(p);
+       if ((cpu != smp_processor_id()) && task_curr(p))
+               smp_send_reschedule(cpu);
+       preempt_enable();
+}
+EXPORT_SYMBOL_GPL(kick_process);
+#endif /* CONFIG_SMP */
+
+#ifdef CONFIG_SMP
+/*
+ * ->cpus_allowed is protected by both rq->lock and p->pi_lock
+ */
+static int select_fallback_rq(int cpu, struct task_struct *p)
+{
+       int dest_cpu;
+       const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
+
+       /* Look for allowed, online CPU in same node. */
+       for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
+               if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
+                       return dest_cpu;
+
+       /* Any allowed, online CPU? */
+       dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask);
+       if (dest_cpu < nr_cpu_ids)
+               return dest_cpu;
+
+       /* No more Mr. Nice Guy. */
+       dest_cpu = cpuset_cpus_allowed_fallback(p);
+       /*
+        * Don't tell them about moving exiting tasks or
+        * kernel threads (both mm NULL), since they never
+        * leave the kernel.
+        */
+       if (p->mm && printk_ratelimit()) {
+               printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
+                               task_pid_nr(p), p->comm, cpu);
+       }
+
+       return dest_cpu;
+}
+
+/*
+ * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
+ */
+static inline
+int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
+{
+       int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
+
+       /*
+        * In order not to call set_task_cpu() on a blocking task we need
+        * to rely on ttwu() to place the task on a valid ->cpus_allowed
+        * cpu.
+        *
+        * Since this is common to all placement strategies, this lives here.
+        *
+        * [ this allows ->select_task() to simply return task_cpu(p) and
+        *   not worry about this generic constraint ]
+        */
+       if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
+                    !cpu_online(cpu)))
+               cpu = select_fallback_rq(task_cpu(p), p);
+
+       return cpu;
+}
+
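+/*
+ * Exponentially weighted moving average: fold 1/8th of the difference
+ * between @sample and the running average into *avg.
+ */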
+static void update_avg(u64 *avg, u64 sample)
+{
+       s64 diff = sample - *avg;
+       *avg += diff >> 3;
+}
+#endif
+
+static void
+ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
+{
+#ifdef CONFIG_SCHEDSTATS
+       struct rq *rq = this_rq();
+
+#ifdef CONFIG_SMP
+       int this_cpu = smp_processor_id();
+
+       if (cpu == this_cpu) {
+               schedstat_inc(rq, ttwu_local);
+               schedstat_inc(p, se.statistics.nr_wakeups_local);
+       } else {
+               struct sched_domain *sd;
+
+               schedstat_inc(p, se.statistics.nr_wakeups_remote);
+               rcu_read_lock();
+               for_each_domain(this_cpu, sd) {
+                       if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
+                               schedstat_inc(sd, ttwu_wake_remote);
+                               break;
+                       }
+               }
+               rcu_read_unlock();
+       }
+
+       if (wake_flags & WF_MIGRATED)
+               schedstat_inc(p, se.statistics.nr_wakeups_migrate);
+
+#endif /* CONFIG_SMP */
+
+       schedstat_inc(rq, ttwu_count);
+       schedstat_inc(p, se.statistics.nr_wakeups);
+
+       if (wake_flags & WF_SYNC)
+               schedstat_inc(p, se.statistics.nr_wakeups_sync);
+
+#endif /* CONFIG_SCHEDSTATS */
+}
+
+static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
+{
+       activate_task(rq, p, en_flags);
+       p->on_rq = 1;
+
+       /* if a worker is waking up, notify workqueue */
+       if (p->flags & PF_WQ_WORKER)
+               wq_worker_waking_up(p, cpu_of(rq));
+}
+
+/*
+ * Mark the task runnable and perform wakeup-preemption.
+ */
+static void
+ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
+{
+       trace_sched_wakeup(p, true);
+       check_preempt_curr(rq, p, wake_flags);
+
+       p->state = TASK_RUNNING;
+#ifdef CONFIG_SMP
+       if (p->sched_class->task_woken)
+               p->sched_class->task_woken(rq, p);
+
+       if (rq->idle_stamp) {
+               u64 delta = rq->clock - rq->idle_stamp;
+               u64 max = 2*sysctl_sched_migration_cost;
+
+               if (delta > max)
+                       rq->avg_idle = max;
+               else
+                       update_avg(&rq->avg_idle, delta);
+               rq->idle_stamp = 0;
+       }
+#endif
+}
+
+static void
+ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
+{
+#ifdef CONFIG_SMP
+       if (p->sched_contributes_to_load)
+               rq->nr_uninterruptible--;
+#endif
+
+       ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
+       ttwu_do_wakeup(rq, p, wake_flags);
+}
+
+/*
+ * Called in case the task @p isn't fully descheduled from its runqueue;
+ * in this case we must do a remote wakeup. It's a 'light' wakeup though,
+ * since all we need to do is flip p->state to TASK_RUNNING, as
+ * the task is still ->on_rq.
+ */
+static int ttwu_remote(struct task_struct *p, int wake_flags)
+{
+       struct rq *rq;
+       int ret = 0;
+
+       rq = __task_rq_lock(p);
+       if (p->on_rq) {
+               ttwu_do_wakeup(rq, p, wake_flags);
+               ret = 1;
+       }
+       __task_rq_unlock(rq);
+
+       return ret;
+}
+
+#ifdef CONFIG_SMP
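+/*
+ * Drain this runqueue's wake_list of tasks queued for wakeup via
+ * ttwu_queue_remote() and activate them locally.
+ */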
+static void sched_ttwu_pending(void)
+{
+       struct rq *rq = this_rq();
+       struct llist_node *llist = llist_del_all(&rq->wake_list);
+       struct task_struct *p;
+
+       raw_spin_lock(&rq->lock);
+
+       while (llist) {
+               p = llist_entry(llist, struct task_struct, wake_entry);
+               llist = llist_next(llist);
+               ttwu_do_activate(rq, p, 0);
+       }
+
+       raw_spin_unlock(&rq->lock);
+}
+
+void scheduler_ipi(void)
+{
+       if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
+               return;
+
+       /*
+        * Not all reschedule IPI handlers call irq_enter/irq_exit, since
+        * traditionally all their work was done from the interrupt return
+        * path. Now that we actually do some work, we need to make sure
+        * we do call them.
+        *
+        * Some archs already do call them, luckily irq_enter/exit nest
+        * properly.
+        *
+        * Arguably we should visit all archs and update all handlers,
+        * however a fair share of IPIs are still resched only so this would
+        * somewhat pessimize the simple resched case.
+        */
+       irq_enter();
+       sched_ttwu_pending();
+
+       /*
+        * Check if someone kicked us for doing the nohz idle load balance.
+        */
+       if (unlikely(got_nohz_idle_kick() && !need_resched())) {
+               this_rq()->idle_balance = 1;
+               raise_softirq_irqoff(SCHED_SOFTIRQ);
+       }
+       irq_exit();
+}
+
+static void ttwu_queue_remote(struct task_struct *p, int cpu)
+{
+       if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
+               smp_send_reschedule(cpu);
+}
+
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
+{
+       struct rq *rq;
+       int ret = 0;
+
+       rq = __task_rq_lock(p);
+       if (p->on_cpu) {
+               ttwu_activate(rq, p, ENQUEUE_WAKEUP);
+               ttwu_do_wakeup(rq, p, wake_flags);
+               ret = 1;
+       }
+       __task_rq_unlock(rq);
+
+       return ret;
+
+}
+#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
+
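+/* Do the two cpus hang off the same last-level-cache domain? */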
+static inline int ttwu_share_cache(int this_cpu, int that_cpu)
+{
+       return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
+}
+#endif /* CONFIG_SMP */
+
+static void ttwu_queue(struct task_struct *p, int cpu)
+{
+       struct rq *rq = cpu_rq(cpu);
+
+#if defined(CONFIG_SMP)
+       if (sched_feat(TTWU_QUEUE) && !ttwu_share_cache(smp_processor_id(), cpu)) {
+               sched_clock_cpu(cpu); /* sync clocks x-cpu */
+               ttwu_queue_remote(p, cpu);
+               return;
+       }
+#endif
+
+       raw_spin_lock(&rq->lock);
+       ttwu_do_activate(rq, p, 0);
+       raw_spin_unlock(&rq->lock);
+}
+
+/**
+ * try_to_wake_up - wake up a thread
+ * @p: the thread to be awakened
+ * @state: the mask of task states that can be woken
+ * @wake_flags: wake modifier flags (WF_*)
+ *
+ * Put it on the run-queue if it's not already there. The "current"
+ * thread is always on the run-queue (except when the actual
+ * re-schedule is in progress), and as such you're allowed to do
+ * the simpler "current->state = TASK_RUNNING" to mark yourself
+ * runnable without the overhead of this.
+ *
+ * Returns %true if @p was woken up, %false if it was already running
+ * or @state didn't match @p's state.
+ */
+static int
+try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
+{
+       unsigned long flags;
+       int cpu, success = 0;
+
+       smp_wmb();
+       raw_spin_lock_irqsave(&p->pi_lock, flags);
+       if (!(p->state & state))
+               goto out;
+
+       success = 1; /* we're going to change ->state */
+       cpu = task_cpu(p);
+
+       if (p->on_rq && ttwu_remote(p, wake_flags))
+               goto stat;
+
+#ifdef CONFIG_SMP
+       /*
+        * If the owning (remote) cpu is still in the middle of schedule() with
+        * this task as prev, wait until it's done referencing the task.
+        */
+       while (p->on_cpu) {
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+               /*
+                * In case the architecture enables interrupts in
+                * context_switch(), we cannot busy wait, since that
+                * would lead to deadlocks when an interrupt hits and
+                * tries to wake up @prev. So bail and do a complete
+                * remote wakeup.
+                */
+               if (ttwu_activate_remote(p, wake_flags))
+                       goto stat;
+#else
+               cpu_relax();
+#endif
+       }
+       /*
+        * Pairs with the smp_wmb() in finish_lock_switch().
+        */
+       smp_rmb();
+
+       p->sched_contributes_to_load = !!task_contributes_to_load(p);
+       p->state = TASK_WAKING;
+
+       if (p->sched_class->task_waking)
+               p->sched_class->task_waking(p);
+
+       cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
+       if (task_cpu(p) != cpu) {
+               wake_flags |= WF_MIGRATED;
+               set_task_cpu(p, cpu);
+       }
+#endif /* CONFIG_SMP */
+
+       ttwu_queue(p, cpu);
+stat:
+       ttwu_stat(p, cpu, wake_flags);
+out:
+       raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+       return success;
+}
+
+/**
+ * try_to_wake_up_local - try to wake up a local task with rq lock held
+ * @p: the thread to be awakened
+ *
+ * Put @p on the run-queue if it's not already there. The caller must
+ * ensure that this_rq() is locked, @p is bound to this_rq() and not
+ * the current task.
+ */
+static void try_to_wake_up_local(struct task_struct *p)
+{
+       struct rq *rq = task_rq(p);
+
+       BUG_ON(rq != this_rq());
+       BUG_ON(p == current);
+       lockdep_assert_held(&rq->lock);
+
+       if (!raw_spin_trylock(&p->pi_lock)) {
+               raw_spin_unlock(&rq->lock);
+               raw_spin_lock(&p->pi_lock);
+               raw_spin_lock(&rq->lock);
+       }
+
+       if (!(p->state & TASK_NORMAL))
+               goto out;
+
+       if (!p->on_rq)
+               ttwu_activate(rq, p, ENQUEUE_WAKEUP);
+
+       ttwu_do_wakeup(rq, p, 0);
+       ttwu_stat(p, smp_processor_id(), 0);
+out:
+       raw_spin_unlock(&p->pi_lock);
+}
+
+/**
+ * wake_up_process - Wake up a specific process
+ * @p: The process to be woken up.
+ *
+ * Attempt to wake up the nominated process and move it to the set of runnable
+ * processes.  Returns 1 if the process was woken up, 0 if it was already
+ * running.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+int wake_up_process(struct task_struct *p)
+{
+       return try_to_wake_up(p, TASK_ALL, 0);
+}
+EXPORT_SYMBOL(wake_up_process);
+
+int wake_up_state(struct task_struct *p, unsigned int state)
+{
+       return try_to_wake_up(p, state, 0);
+}
+
+/*
+ * Perform scheduler related setup for a newly forked process p.
+ * p is forked by current.
+ *
+ * __sched_fork() is basic setup used by init_idle() too:
+ */
+static void __sched_fork(struct task_struct *p)
+{
+       p->on_rq                        = 0;
+
+       p->se.on_rq                     = 0;
+       p->se.exec_start                = 0;
+       p->se.sum_exec_runtime          = 0;
+       p->se.prev_sum_exec_runtime     = 0;
+       p->se.nr_migrations             = 0;
+       p->se.vruntime                  = 0;
+       INIT_LIST_HEAD(&p->se.group_node);
+
+#ifdef CONFIG_SCHEDSTATS
+       memset(&p->se.statistics, 0, sizeof(p->se.statistics));
+#endif
+
+       INIT_LIST_HEAD(&p->rt.run_list);
+
+#ifdef CONFIG_PREEMPT_NOTIFIERS
+       INIT_HLIST_HEAD(&p->preempt_notifiers);
+#endif
+}
+
+/*
+ * fork()/clone()-time setup:
+ */
+void sched_fork(struct task_struct *p)
+{
+       unsigned long flags;
+       int cpu = get_cpu();
+
+       __sched_fork(p);
+       /*
+        * We mark the process as running here. This guarantees that
+        * nobody will actually run it, and a signal or other external
+        * event cannot wake it up and insert it on the runqueue either.
+        */
+       p->state = TASK_RUNNING;
+
+       /*
+        * Make sure we do not leak PI boosting priority to the child.
+        */
+       p->prio = current->normal_prio;
+
+       /*
+        * Revert to default priority/policy on fork if requested.
+        */
+       if (unlikely(p->sched_reset_on_fork)) {
+               if (task_has_rt_policy(p)) {
+                       p->policy = SCHED_NORMAL;
+                       p->static_prio = NICE_TO_PRIO(0);
+                       p->rt_priority = 0;
+               } else if (PRIO_TO_NICE(p->static_prio) < 0)
+                       p->static_prio = NICE_TO_PRIO(0);
+
+               p->prio = p->normal_prio = __normal_prio(p);
+               set_load_weight(p);
+
+               /*
+                * We don't need the reset flag anymore after the fork. It has
+                * fulfilled its duty:
+                */
+               p->sched_reset_on_fork = 0;
+       }
+
+       if (!rt_prio(p->prio))
+               p->sched_class = &fair_sched_class;
+
+       if (p->sched_class->task_fork)
+               p->sched_class->task_fork(p);
+
+       /*
+        * The child is not yet in the pid-hash so no cgroup attach races,
+        * and the cgroup is pinned to this child because cgroup_fork()
+        * is run before sched_fork().
+        *
+        * Silence PROVE_RCU.
+        */
+       raw_spin_lock_irqsave(&p->pi_lock, flags);
+       set_task_cpu(p, cpu);
+       raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
+       if (likely(sched_info_on()))
+               memset(&p->sched_info, 0, sizeof(p->sched_info));
+#endif
+#if defined(CONFIG_SMP)
+       p->on_cpu = 0;
+#endif
+#ifdef CONFIG_PREEMPT_COUNT
+       /* Want to start with kernel preemption disabled. */
+       task_thread_info(p)->preempt_count = 1;
+#endif
+#ifdef CONFIG_SMP
+       plist_node_init(&p->pushable_tasks, MAX_PRIO);
+#endif
+
+       put_cpu();
+}
+
+/*
+ * wake_up_new_task - wake up a newly created task for the first time.
+ *
+ * This function will do some initial scheduler statistics housekeeping
+ * that must be done for every newly created context, then puts the task
+ * on the runqueue and wakes it.
+ */
+void wake_up_new_task(struct task_struct *p)
+{
+       unsigned long flags;
+       struct rq *rq;
+
+       raw_spin_lock_irqsave(&p->pi_lock, flags);
+#ifdef CONFIG_SMP
+       /*
+        * Fork balancing, do it here and not earlier because:
+        *  - cpus_allowed can change in the fork path
+        *  - any previously selected cpu might disappear through hotplug
+        */
+       set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
+#endif
+
+       rq = __task_rq_lock(p);
+       activate_task(rq, p, 0);
+       p->on_rq = 1;
+       trace_sched_wakeup_new(p, true);
+       check_preempt_curr(rq, p, WF_FORK);
+#ifdef CONFIG_SMP
+       if (p->sched_class->task_woken)
+               p->sched_class->task_woken(rq, p);
+#endif
+       task_rq_unlock(rq, p, &flags);
+}
+
+#ifdef CONFIG_PREEMPT_NOTIFIERS
+
+/**
+ * preempt_notifier_register - tell me when current is being preempted & rescheduled
+ * @notifier: notifier struct to register
+ */
+void preempt_notifier_register(struct preempt_notifier *notifier)
+{
+       hlist_add_head(&notifier->link, &current->preempt_notifiers);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_register);
+
+/**
+ * preempt_notifier_unregister - no longer interested in preemption notifications
+ * @notifier: notifier struct to unregister
+ *
+ * This is safe to call from within a preemption notifier.
+ */
+void preempt_notifier_unregister(struct preempt_notifier *notifier)
+{
+       hlist_del(&notifier->link);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
+
+static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+{
+       struct preempt_notifier *notifier;
+       struct hlist_node *node;
+
+       hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
+               notifier->ops->sched_in(notifier, raw_smp_processor_id());
+}
+
+static void
+fire_sched_out_preempt_notifiers(struct task_struct *curr,
+                                struct task_struct *next)
+{
+       struct preempt_notifier *notifier;
+       struct hlist_node *node;
+
+       hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
+               notifier->ops->sched_out(notifier, next);
+}
+
+#else /* !CONFIG_PREEMPT_NOTIFIERS */
+
+static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+{
+}
+
+static void
+fire_sched_out_preempt_notifiers(struct task_struct *curr,
+                                struct task_struct *next)
+{
+}
+
+#endif /* CONFIG_PREEMPT_NOTIFIERS */
+
+/**
+ * prepare_task_switch - prepare to switch tasks
+ * @rq: the runqueue preparing to switch
+ * @prev: the current task that is being switched out
+ * @next: the task we are going to switch to.
+ *
+ * This is called with the rq lock held and interrupts off. It must
+ * be paired with a subsequent finish_task_switch after the context
+ * switch.
+ *
+ * prepare_task_switch sets up locking and calls architecture specific
+ * hooks.
+ */
+static inline void
+prepare_task_switch(struct rq *rq, struct task_struct *prev,
+                   struct task_struct *next)
+{
+       sched_info_switch(prev, next);
+       perf_event_task_sched_out(prev, next);
+       fire_sched_out_preempt_notifiers(prev, next);
+       prepare_lock_switch(rq, next);
+       prepare_arch_switch(next);
+       trace_sched_switch(prev, next);
+}
+
+/**
+ * finish_task_switch - clean up after a task-switch
+ * @rq: runqueue associated with task-switch
+ * @prev: the thread we just switched away from.
+ *
+ * finish_task_switch must be called after the context switch, paired
+ * with a prepare_task_switch call before the context switch.
+ * finish_task_switch will reconcile locking set up by prepare_task_switch,
+ * and do any other architecture-specific cleanup actions.
+ *
+ * Note that we may have delayed dropping an mm in context_switch(). If
+ * so, we finish that here outside of the runqueue lock. (Doing it
+ * with the lock held can cause deadlocks; see schedule() for
+ * details.)
+ */
+static void finish_task_switch(struct rq *rq, struct task_struct *prev)
+       __releases(rq->lock)
+{
+       struct mm_struct *mm = rq->prev_mm;
+       long prev_state;
+
+       rq->prev_mm = NULL;
+
+       /*
+        * A task struct has one reference for the use as "current".
+        * If a task dies, then it sets TASK_DEAD in tsk->state and calls
+        * schedule one last time. The schedule call will never return, and
+        * the scheduled task must drop that reference.
+        * The test for TASK_DEAD must occur while the runqueue locks are
+        * still held, otherwise prev could be scheduled on another cpu, die
+        * there before we look at prev->state, and then the reference would
+        * be dropped twice.
+        *              Manfred Spraul <manfred@colorfullife.com>
+        */
+       prev_state = prev->state;
+       finish_arch_switch(prev);
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+       local_irq_disable();
+#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
+       perf_event_task_sched_in(prev, current);
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+       local_irq_enable();
+#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
+       finish_lock_switch(rq, prev);
+       trace_sched_stat_sleeptime(current, rq->clock);
+
+       fire_sched_in_preempt_notifiers(current);
+       if (mm)
+               mmdrop(mm);
+       if (unlikely(prev_state == TASK_DEAD)) {
+               /*
+                * Remove function-return probe instances associated with this
+                * task and put them back on the free list.
+                */
+               kprobe_flush_task(prev);
+               put_task_struct(prev);
+       }
+}
+
+#ifdef CONFIG_SMP
+
+/* assumes rq->lock is held */
+static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
+{
+       if (prev->sched_class->pre_schedule)
+               prev->sched_class->pre_schedule(rq, prev);
+}
+
+/* rq->lock is NOT held, but preemption is disabled */
+static inline void post_schedule(struct rq *rq)
+{
+       if (rq->post_schedule) {
+               unsigned long flags;
+
+               raw_spin_lock_irqsave(&rq->lock, flags);
+               if (rq->curr->sched_class->post_schedule)
+                       rq->curr->sched_class->post_schedule(rq);
+               raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+               rq->post_schedule = 0;
+       }
+}
+
+#else
+
+static inline void pre_schedule(struct rq *rq, struct task_struct *p)
+{
+}
+
+static inline void post_schedule(struct rq *rq)
+{
+}
+
+#endif
+
+/**
+ * schedule_tail - first thing a freshly forked thread must call.
+ * @prev: the thread we just switched away from.
+ */
+asmlinkage void schedule_tail(struct task_struct *prev)
+       __releases(rq->lock)
+{
+       struct rq *rq = this_rq();
+
+       finish_task_switch(rq, prev);
+
+       /*
+        * FIXME: do we need to worry about rq being invalidated by the
+        * task_switch?
+        */
+       post_schedule(rq);
+
+#ifdef __ARCH_WANT_UNLOCKED_CTXSW
+       /* In this case, finish_task_switch does not reenable preemption */
+       preempt_enable();
+#endif
+       if (current->set_child_tid)
+               put_user(task_pid_vnr(current), current->set_child_tid);
+}
+
+/*
+ * context_switch - switch to the new MM and the new
+ * thread's register state.
+ */
+static inline void
+context_switch(struct rq *rq, struct task_struct *prev,
+              struct task_struct *next)
+{
+       struct mm_struct *mm, *oldmm;
+
+       prepare_task_switch(rq, prev, next);
+
+       mm = next->mm;
+       oldmm = prev->active_mm;
+       /*
+        * For paravirt, this is coupled with an exit in switch_to to
+        * combine the page table reload and the switch backend into
+        * one hypercall.
+        */
+       arch_start_context_switch(prev);
+
+       if (!mm) {
+               next->active_mm = oldmm;
+               atomic_inc(&oldmm->mm_count);
+               enter_lazy_tlb(oldmm, next);
+       } else
+               switch_mm(oldmm, mm, next);
+
+       if (!prev->mm) {
+               prev->active_mm = NULL;
+               rq->prev_mm = oldmm;
+       }
+       /*
+        * The runqueue lock will be released by the next task (which is
+        * an invalid locking op, but in the case of the scheduler it's an
+        * obvious special-case), so we do an early lockdep release here:
+        */
+#ifndef __ARCH_WANT_UNLOCKED_CTXSW
+       spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
+#endif
+
+       /* Here we just switch the register state and the stack. */
+       switch_to(prev, next, prev);
+
+       barrier();
+       /*
+        * this_rq must be evaluated again because prev may have moved
+        * CPUs since it called schedule(), thus the 'rq' on its stack
+        * frame will be invalid.
+        */
+       finish_task_switch(this_rq(), prev);
+}
+
+/*
+ * nr_running, nr_uninterruptible and nr_context_switches:
+ *
+ * externally visible scheduler statistics: current number of runnable
+ * threads, current number of uninterruptible-sleeping threads, total
+ * number of context switches performed since bootup.
+ */
+unsigned long nr_running(void)
+{
+       unsigned long i, sum = 0;
+
+       for_each_online_cpu(i)
+               sum += cpu_rq(i)->nr_running;
+
+       return sum;
+}
+
+unsigned long nr_uninterruptible(void)
+{
+       unsigned long i, sum = 0;
+
+       for_each_possible_cpu(i)
+               sum += cpu_rq(i)->nr_uninterruptible;
+
+       /*
+        * Since we read the counters lockless, it might be slightly
+        * inaccurate. Do not allow it to go below zero though:
+        */
+       if (unlikely((long)sum < 0))
+               sum = 0;
+
+       return sum;
+}
+
+unsigned long long nr_context_switches(void)
+{
+       int i;
+       unsigned long long sum = 0;
+
+       for_each_possible_cpu(i)
+               sum += cpu_rq(i)->nr_switches;
+
+       return sum;
+}
+
+unsigned long nr_iowait(void)
+{
+       unsigned long i, sum = 0;
+
+       for_each_possible_cpu(i)
+               sum += atomic_read(&cpu_rq(i)->nr_iowait);
+
+       return sum;
+}
+
+unsigned long nr_iowait_cpu(int cpu)
+{
+       struct rq *this = cpu_rq(cpu);
+       return atomic_read(&this->nr_iowait);
+}
+
+unsigned long this_cpu_load(void)
+{
+       struct rq *this = this_rq();
+       return this->cpu_load[0];
+}
+
+
+/* Variables and functions for calc_load */
+static atomic_long_t calc_load_tasks;
+static unsigned long calc_load_update;
+unsigned long avenrun[3];
+EXPORT_SYMBOL(avenrun);
+
+static long calc_load_fold_active(struct rq *this_rq)
+{
+       long nr_active, delta = 0;
+
+       nr_active = this_rq->nr_running;
+       nr_active += (long) this_rq->nr_uninterruptible;
+
+       if (nr_active != this_rq->calc_load_active) {
+               delta = nr_active - this_rq->calc_load_active;
+               this_rq->calc_load_active = nr_active;
+       }
+
+       return delta;
+}
+
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
+{
+       load *= exp;
+       load += active * (FIXED_1 - exp);
+       load += 1UL << (FSHIFT - 1);
+       return load >> FSHIFT;
+}
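+
+/*
+ * Worked example (fixed point, FSHIFT = 11, so FIXED_1 = 2048): with the
+ * 1-minute decay factor EXP_1 = 1884, a previous load of 1.00 (2048) and
+ * two runnable tasks (active = 2 * 2048 = 4096):
+ *
+ *   (2048 * 1884 + 4096 * (2048 - 1884) + 1024) >> 11 = 2212   (~1.08)
+ *
+ * i.e. the average moves a fraction (1 - 1884/2048, roughly 8%) of the way
+ * from 1.00 toward 2.00 per LOAD_FREQ interval.
+ */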
+
+#ifdef CONFIG_NO_HZ
+/*
+ * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
+ *
+ * When making the ILB scale, we should try to pull this in as well.
+ */
+static atomic_long_t calc_load_tasks_idle;
+
+void calc_load_account_idle(struct rq *this_rq)
+{
+       long delta;
+
+       delta = calc_load_fold_active(this_rq);
+       if (delta)
+               atomic_long_add(delta, &calc_load_tasks_idle);
+}
+
+static long calc_load_fold_idle(void)
+{
+       long delta = 0;
+
+       /*
+        * It's got a race; we don't care...
+        */
+       if (atomic_long_read(&calc_load_tasks_idle))
+               delta = atomic_long_xchg(&calc_load_tasks_idle, 0);
+
+       return delta;
+}
+
+/**
+ * fixed_power_int - compute: x^n, in O(log n) time
+ *
+ * @x:         base of the power
+ * @frac_bits: fractional bits of @x
+ * @n:         power to raise @x to.
+ *
+ * By exploiting the relation between the definition of the natural power
+ * function: x^n := x*x*...*x (x multiplied by itself for n times), and
+ * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
+ * (where: n_i \elem {0, 1}, the binary vector representing n),
+ * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
+ * of course trivially computable in O(log_2 n), the length of our binary
+ * vector.
+ */
+static unsigned long
+fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
+{
+       unsigned long result = 1UL << frac_bits;
+
+       if (n) for (;;) {
+               if (n & 1) {
+                       result *= x;
+                       result += 1UL << (frac_bits - 1);
+                       result >>= frac_bits;
+               }
+               n >>= 1;
+               if (!n)
+                       break;
+               x *= x;
+               x += 1UL << (frac_bits - 1);
+               x >>= frac_bits;
+       }
+
+       return result;
+}
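+
+/*
+ * Example, in the same 11-bit fixed point: fixed_power_int(1024, 11, 3)
+ * computes 0.5^3. Bit 0 of n folds x into the result (1024), x is then
+ * squared to 512, and bit 1 folds that in as well, giving 256, i.e. 0.125,
+ * with one rounding step per multiply.
+ */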
+
+/*
+ * a1 = a0 * e + a * (1 - e)
+ *
+ * a2 = a1 * e + a * (1 - e)
+ *    = (a0 * e + a * (1 - e)) * e + a * (1 - e)
+ *    = a0 * e^2 + a * (1 - e) * (1 + e)
+ *
+ * a3 = a2 * e + a * (1 - e)
+ *    = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
+ *    = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
+ *
+ *  ...
+ *
+ * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^(n-1))    [1]
+ *    = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
+ *    = a0 * e^n + a * (1 - e^n)
+ *
+ * [1] application of the geometric series:
+ *
+ *              n         1 - x^(n+1)
+ *     S_n := \Sum x^i = -------------
+ *             i=0          1 - x
+ */
+static unsigned long
+calc_load_n(unsigned long load, unsigned long exp,
+           unsigned long active, unsigned int n)
+{
+
+       return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
+}
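+
+/*
+ * Sanity check of the closed form: starting from a0 = 0 with one task
+ * runnable throughout (a = 1) and e = EXP_1/FIXED_1 ~= 0.92, three missed
+ * LOAD_FREQ periods give a3 = 1 - 0.92^3 ~= 0.22, matching (up to fixed
+ * point rounding) what three successive calc_load() calls would produce.
+ */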
+
+/*
+ * NO_HZ can leave us missing all per-cpu ticks calling
+ * calc_load_account_active(), but since an idle CPU folds its delta into
+ * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
+ * in the pending idle delta if our idle period crossed a load cycle boundary.
+ *
+ * Once we've updated the global active value, we need to apply the exponential
+ * weights adjusted to the number of cycles missed.
+ */
+static void calc_global_nohz(unsigned long ticks)
+{
+       long delta, active, n;
+
+       if (time_before(jiffies, calc_load_update))
+               return;
+
+       /*
+        * If we crossed a calc_load_update boundary, make sure to fold
+        * any pending idle changes, the respective CPUs might have
+        * missed the tick driven calc_load_account_active() update
+        * due to NO_HZ.
+        */
+       delta = calc_load_fold_idle();
+       if (delta)
+               atomic_long_add(delta, &calc_load_tasks);
+
+       /*
+        * If we were idle for multiple load cycles, apply them.
+        */
+       if (ticks >= LOAD_FREQ) {
+               n = ticks / LOAD_FREQ;
+
+               active = atomic_long_read(&calc_load_tasks);
+               active = active > 0 ? active * FIXED_1 : 0;
+
+               avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
+               avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
+               avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
+
+               calc_load_update += n * LOAD_FREQ;
+       }
+
+       /*
+        * Its possible the remainder of the above division also crosses
+        * a LOAD_FREQ period, the regular check in calc_global_load()
+        * which comes after this will take care of that.
+        *
+        * Consider us being 11 ticks before a cycle completion, and us
+        * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will
+        * age us 4 cycles, and the test in calc_global_load() will
+        * pick up the final one.
+        */
+}
+#else
+void calc_load_account_idle(struct rq *this_rq)
+{
+}
+
+static inline long calc_load_fold_idle(void)
+{
+       return 0;
+}
+
+static void calc_global_nohz(unsigned long ticks)
+{
+}
+#endif
+
+/**
+ * get_avenrun - get the load average array
+ * @loads:     pointer to dest load array
+ * @offset:    offset to add
+ * @shift:     shift count to shift the result left
+ *
+ * These values are estimates at best, so no need for locking.
+ */
+void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
+{
+       loads[0] = (avenrun[0] + offset) << shift;
+       loads[1] = (avenrun[1] + offset) << shift;
+       loads[2] = (avenrun[2] + offset) << shift;
+}
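+
+/*
+ * The values are returned in the kernel's fixed-point format; callers
+ * convert them themselves. /proc/loadavg, for example, passes an offset
+ * of FIXED_1/200 so that its subsequent truncation to two decimal places
+ * rounds to nearest rather than down.
+ */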
+
+/*
+ * calc_global_load - update the avenrun load estimates 10 ticks after
+ * the CPUs have updated calc_load_tasks.
+ */
+void calc_global_load(unsigned long ticks)
+{
+       long active;
+
+       calc_global_nohz(ticks);
+
+       if (time_before(jiffies, calc_load_update + 10))
+               return;
+
+       active = atomic_long_read(&calc_load_tasks);
+       active = active > 0 ? active * FIXED_1 : 0;
+
+       avenrun[0] = calc_load(avenrun[0], EXP_1, active);
+       avenrun[1] = calc_load(avenrun[1], EXP_5, active);
+       avenrun[2] = calc_load(avenrun[2], EXP_15, active);
+
+       calc_load_update += LOAD_FREQ;
+}
+
+/*
+ * Called from update_cpu_load() to periodically update this CPU's
+ * active count.
+ */
+static void calc_load_account_active(struct rq *this_rq)
+{
+       long delta;
+
+       if (time_before(jiffies, this_rq->calc_load_update))
+               return;
+
+       delta  = calc_load_fold_active(this_rq);
+       delta += calc_load_fold_idle();
+       if (delta)
+               atomic_long_add(delta, &calc_load_tasks);
+
+       this_rq->calc_load_update += LOAD_FREQ;
+}
+
+/*
+ * The exact cpuload at various idx values, calculated at every tick would be
+ * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
+ *
+ * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
+ * on nth tick when cpu may be busy, then we have:
+ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
+ *
+ * decay_load_missed() below does efficient calculation of
+ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
+ *
+ * The calculation is approximated on a 128 point scale.
+ * degrade_zero_ticks is the number of ticks after which load at any
+ * particular idx is approximated to be zero.
+ * degrade_factor is a precomputed table, a row for each load idx.
+ * Each column corresponds to degradation factor for a power of two ticks,
+ * based on 128 point scale.
+ * Example:
+ * row 2, col 3 (=12) says that the degradation at load idx 2 after
+ * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
+ *
+ * With this power of 2 load factors, we can degrade the load n times
+ * by looking at 1 bits in n and doing as many mult/shift instead of
+ * n mult/shifts needed by the exact degradation.
+ */
+#define DEGRADE_SHIFT          7
+static const unsigned char
+               degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
+static const unsigned char
+               degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
+                                       {0, 0, 0, 0, 0, 0, 0, 0},
+                                       {64, 32, 8, 0, 0, 0, 0, 0},
+                                       {96, 72, 40, 12, 1, 0, 0},
+                                       {112, 98, 75, 43, 15, 1, 0},
+                                       {120, 112, 98, 76, 45, 16, 2} };
+
+/*
+ * Update cpu_load for any missed ticks due to tickless idle. The missed
+ * ticks can only accumulate while the CPU is idle, so we just decay the
+ * old load without adding any new load.
+ */
+static unsigned long
+decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
+{
+       int j = 0;
+
+       if (!missed_updates)
+               return load;
+
+       if (missed_updates >= degrade_zero_ticks[idx])
+               return 0;
+
+       if (idx == 1)
+               return load >> missed_updates;
+
+       while (missed_updates) {
+               if (missed_updates % 2)
+                       load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
+
+               missed_updates >>= 1;
+               j++;
+       }
+       return load;
+}
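+
+/*
+ * Example: idx = 2, missed_updates = 8. 8 is below degrade_zero_ticks[2]
+ * (32), and its only set bit is bit 3, so the loop does a single
+ * load * degrade_factor[2][3] >> DEGRADE_SHIFT, i.e. load * 12 / 128
+ * ~= 0.094 * load, approximating the exact (3/4)^8 ~= 0.100.
+ */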
+
+/*
+ * Update rq->cpu_load[] statistics. This function is usually called every
+ * scheduler tick (TICK_NSEC). With tickless idle this will not be called
+ * every tick. We fix it up based on jiffies.
+ */
+void update_cpu_load(struct rq *this_rq)
+{
+       unsigned long this_load = this_rq->load.weight;
+       unsigned long curr_jiffies = jiffies;
+       unsigned long pending_updates;
+       int i, scale;
+
+       this_rq->nr_load_updates++;
+
+       /* Avoid repeated calls on same jiffy, when moving in and out of idle */
+       if (curr_jiffies == this_rq->last_load_update_tick)
+               return;
+
+       pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+       this_rq->last_load_update_tick = curr_jiffies;
+
+       /* Update our load: */
+       this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
+       for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
+               unsigned long old_load, new_load;
+
+               /* scale is effectively 1 << i now, and >> i divides by scale */
+
+               old_load = this_rq->cpu_load[i];
+               old_load = decay_load_missed(old_load, pending_updates - 1, i);
+               new_load = this_load;
+               /*
+                * Round up the averaging division if load is increasing. This
+                * prevents us from getting stuck on 9 if the load is 10, for
+                * example.
+                */
+               if (new_load > old_load)
+                       new_load += scale - 1;
+
+               this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
+       }
+
+       sched_avg_update(this_rq);
+}
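+
+/*
+ * Rounding example for the loop above: with i = 1 (scale = 2), an old
+ * load of 9 and a new load of 10, plain averaging would yield
+ * (9 + 10) >> 1 = 9 forever; adding scale - 1 first gives
+ * (9 + 11) >> 1 = 10, so the average can actually reach the new value.
+ */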
+
+static void update_cpu_load_active(struct rq *this_rq)
+{
+       update_cpu_load(this_rq);
+
+       calc_load_account_active(this_rq);
+}
+
+#ifdef CONFIG_SMP
+
+/*
+ * sched_exec - execve() is a valuable balancing opportunity, because at
+ * this point the task has the smallest effective memory and cache footprint.
+ */
+void sched_exec(void)
+{
+       struct task_struct *p = current;
+       unsigned long flags;
+       int dest_cpu;
+
+       raw_spin_lock_irqsave(&p->pi_lock, flags);
+       dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
+       if (dest_cpu == smp_processor_id())
+               goto unlock;
+
+       if (likely(cpu_active(dest_cpu))) {
+               struct migration_arg arg = { p, dest_cpu };
+
+               raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+               stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
+               return;
+       }
+unlock:
+       raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+}
+
+#endif
+
+DEFINE_PER_CPU(struct kernel_stat, kstat);
+DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
+
+EXPORT_PER_CPU_SYMBOL(kstat);
+EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
+
+/*
+ * Return any ns on the sched_clock that have not yet been accounted in
+ * @p in case that task is currently running.
+ *
+ * Called with task_rq_lock() held on @rq.
+ */
+static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
+{
+       u64 ns = 0;
+
+       if (task_current(rq, p)) {
+               update_rq_clock(rq);
+               ns = rq->clock_task - p->se.exec_start;
+               if ((s64)ns < 0)
+                       ns = 0;
+       }
+
+       return ns;
+}
+
+unsigned long long task_delta_exec(struct task_struct *p)
+{
+       unsigned long flags;
+       struct rq *rq;
+       u64 ns = 0;
+
+       rq = task_rq_lock(p, &flags);
+       ns = do_task_delta_exec(p, rq);
+       task_rq_unlock(rq, p, &flags);
+
+       return ns;
+}
+
+/*
+ * Return accounted runtime for the task.
+ * In case the task is currently running, return the runtime plus the
+ * pending runtime that has not been accounted yet.
+ */
+unsigned long long task_sched_runtime(struct task_struct *p)
+{
+       unsigned long flags;
+       struct rq *rq;
+       u64 ns = 0;
+
+       rq = task_rq_lock(p, &flags);
+       ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
+       task_rq_unlock(rq, p, &flags);
+
+       return ns;
+}
+
+#ifdef CONFIG_CGROUP_CPUACCT
+struct cgroup_subsys cpuacct_subsys;
+struct cpuacct root_cpuacct;
+#endif
+
+static inline void task_group_account_field(struct task_struct *p, int index,
+                                           u64 tmp)
+{
+#ifdef CONFIG_CGROUP_CPUACCT
+       struct kernel_cpustat *kcpustat;
+       struct cpuacct *ca;
+#endif
+       /*
+        * Since all updates are sure to touch the root cgroup, we
+        * update it first. If the root cgroup is the only cgroup,
+        * nothing else should be necessary.
+        */
+       __get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
+
+#ifdef CONFIG_CGROUP_CPUACCT
+       if (unlikely(!cpuacct_subsys.active))
+               return;
+
+       rcu_read_lock();
+       ca = task_ca(p);
+       while (ca && (ca != &root_cpuacct)) {
+               kcpustat = this_cpu_ptr(ca->cpustat);
+               kcpustat->cpustat[index] += tmp;
+               ca = parent_ca(ca);
+       }
+       rcu_read_unlock();
+#endif
+}
+
+
+/*
+ * Account user cpu time to a process.
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in user space since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
+ */
+void account_user_time(struct task_struct *p, cputime_t cputime,
+                      cputime_t cputime_scaled)
+{
+       int index;
+
+       /* Add user time to process. */
+       p->utime += cputime;
+       p->utimescaled += cputime_scaled;
+       account_group_user_time(p, cputime);
+
+       index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
+
+       /* Add user time to cpustat. */
+       task_group_account_field(p, index, (__force u64) cputime);
+
+       /* Account for user time used */
+       acct_update_integrals(p);
+}
+
+/*
+ * Account guest cpu time to a process.
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in virtual machine since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
+ */
+static void account_guest_time(struct task_struct *p, cputime_t cputime,
+                              cputime_t cputime_scaled)
+{
+       u64 *cpustat = kcpustat_this_cpu->cpustat;
+
+       /* Add guest time to process. */
+       p->utime += cputime;
+       p->utimescaled += cputime_scaled;
+       account_group_user_time(p, cputime);
+       p->gtime += cputime;
+
+       /* Add guest time to cpustat. */
+       if (TASK_NICE(p) > 0) {
+               cpustat[CPUTIME_NICE] += (__force u64) cputime;
+               cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
+       } else {
+               cpustat[CPUTIME_USER] += (__force u64) cputime;
+               cpustat[CPUTIME_GUEST] += (__force u64) cputime;
+       }
+}
+
+/*
+ * Account system cpu time to a process and desired cpustat field
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in kernel space since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
+ * @target_cputime64: pointer to cpustat field that has to be updated
+ */
+static inline
+void __account_system_time(struct task_struct *p, cputime_t cputime,
+                       cputime_t cputime_scaled, int index)
+{
+       /* Add system time to process. */
+       p->stime += cputime;
+       p->stimescaled += cputime_scaled;
+       account_group_system_time(p, cputime);
+
+       /* Add system time to cpustat. */
+       task_group_account_field(p, index, (__force u64) cputime);
+
+       /* Account for system time used */
+       acct_update_integrals(p);
+}
+
+/*
+ * Account system cpu time to a process.
+ * @p: the process that the cpu time gets accounted to
+ * @hardirq_offset: the offset to subtract from hardirq_count()
+ * @cputime: the cpu time spent in kernel space since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
+ */
+void account_system_time(struct task_struct *p, int hardirq_offset,
+                        cputime_t cputime, cputime_t cputime_scaled)
+{
+       int index;
+
+       if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
+               account_guest_time(p, cputime, cputime_scaled);
+               return;
+       }
+
+       if (hardirq_count() - hardirq_offset)
+               index = CPUTIME_IRQ;
+       else if (in_serving_softirq())
+               index = CPUTIME_SOFTIRQ;
+       else
+               index = CPUTIME_SYSTEM;
+
+       __account_system_time(p, cputime, cputime_scaled, index);
+}
+
+/*
+ * Account for involuntary wait time.
+ * @cputime: the cpu time spent in involuntary wait
+ */
+void account_steal_time(cputime_t cputime)
+{
+       u64 *cpustat = kcpustat_this_cpu->cpustat;
+
+       cpustat[CPUTIME_STEAL] += (__force u64) cputime;
+}
+
+/*
+ * Account for idle time.
+ * @cputime: the cpu time spent in idle wait
+ */
+void account_idle_time(cputime_t cputime)
+{
+       u64 *cpustat = kcpustat_this_cpu->cpustat;
+       struct rq *rq = this_rq();
+
+       if (atomic_read(&rq->nr_iowait) > 0)
+               cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
+       else
+               cpustat[CPUTIME_IDLE] += (__force u64) cputime;
+}
+
+static __always_inline bool steal_account_process_tick(void)
+{
+#ifdef CONFIG_PARAVIRT
+       if (static_branch(&paravirt_steal_enabled)) {
+               u64 steal, st = 0;
+
+               steal = paravirt_steal_clock(smp_processor_id());
+               steal -= this_rq()->prev_steal_time;
+
+               st = steal_ticks(steal);
+               this_rq()->prev_steal_time += st * TICK_NSEC;
+
+               account_steal_time(st);
+               return st;
+       }
+#endif
+       return false;
+}
+
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+/*
+ * Account a tick to a process and cpustat
+ * @p: the process that the cpu time gets accounted to
+ * @user_tick: is the tick from userspace
+ * @rq: the pointer to rq
+ *
+ * Tick demultiplexing follows the order
+ * - pending hardirq update
+ * - pending softirq update
+ * - user_time
+ * - idle_time
+ * - system time
+ *   - check for guest_time
+ *   - else account as system_time
+ *
+ * The check for hardirq is done for both system and user time because no
+ * timer goes off while we are in hardirq context, so we may never get an
+ * opportunity to update it solely in system time.
+ * p->stime and friends are updated only on system time, not on irq/softirq
+ * time, as those no longer count towards task exec_runtime.
+ */
+static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+                                               struct rq *rq)
+{
+       cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+       u64 *cpustat = kcpustat_this_cpu->cpustat;
+
+       if (steal_account_process_tick())
+               return;
+
+       if (irqtime_account_hi_update()) {
+               cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
+       } else if (irqtime_account_si_update()) {
+               cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
+       } else if (this_cpu_ksoftirqd() == p) {
+               /*
+                * ksoftirqd time does not get accounted in cpu_softirq_time,
+                * so we have to handle it separately here.
+                * Also, p->stime needs to be updated for ksoftirqd.
+                */
+               __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
+                                       CPUTIME_SOFTIRQ);
+       } else if (user_tick) {
+               account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
+       } else if (p == rq->idle) {
+               account_idle_time(cputime_one_jiffy);
+       } else if (p->flags & PF_VCPU) { /* System time or guest time */
+               account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
+       } else {
+               __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
+                                       CPUTIME_SYSTEM);
+       }
+}
+
+static void irqtime_account_idle_ticks(int ticks)
+{
+       int i;
+       struct rq *rq = this_rq();
+
+       for (i = 0; i < ticks; i++)
+               irqtime_account_process_tick(current, 0, rq);
+}
+#else /* CONFIG_IRQ_TIME_ACCOUNTING */
+static void irqtime_account_idle_ticks(int ticks) {}
+static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+                                               struct rq *rq) {}
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+/*
+ * Account a single tick of cpu time.
+ * @p: the process that the cpu time gets accounted to
+ * @user_tick: indicates if the tick is a user or a system tick
+ */
+void account_process_tick(struct task_struct *p, int user_tick)
+{
+       cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+       struct rq *rq = this_rq();
+
+       if (sched_clock_irqtime) {
+               irqtime_account_process_tick(p, user_tick, rq);
+               return;
+       }
+
+       if (steal_account_process_tick())
+               return;
+
+       if (user_tick)
+               account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
+       else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
+               account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
+                                   one_jiffy_scaled);
+       else
+               account_idle_time(cputime_one_jiffy);
+}
+
+/*
+ * Account multiple ticks of steal time.
+ * @ticks: number of stolen ticks
+ */
+void account_steal_ticks(unsigned long ticks)
+{
+       account_steal_time(jiffies_to_cputime(ticks));
+}
+
+/*
+ * Account multiple ticks of idle time.
+ * @ticks: number of idle ticks
+ */
+void account_idle_ticks(unsigned long ticks)
+{
+
+       if (sched_clock_irqtime) {
+               irqtime_account_idle_ticks(ticks);
+               return;
+       }
+
+       account_idle_time(jiffies_to_cputime(ticks));
+}
+
+#endif
+
+/*
+ * Use precise platform statistics if available:
+ */
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+       *ut = p->utime;
+       *st = p->stime;
+}
+
+void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+       struct task_cputime cputime;
+
+       thread_group_cputime(p, &cputime);
+
+       *ut = cputime.utime;
+       *st = cputime.stime;
+}
+#else
+
+#ifndef nsecs_to_cputime
+# define nsecs_to_cputime(__nsecs)     nsecs_to_jiffies(__nsecs)
+#endif
+
+void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+       cputime_t rtime, utime = p->utime, total = utime + p->stime;
+
+       /*
+        * Use CFS's precise accounting:
+        */
+       rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
+
+       if (total) {
+               u64 temp = (__force u64) rtime;
+
+               temp *= (__force u64) utime;
+               do_div(temp, (__force u32) total);
+               utime = (__force cputime_t) temp;
+       } else
+               utime = rtime;
+
+       /*
+        * Compare with previous values, to keep monotonicity:
+        */
+       p->prev_utime = max(p->prev_utime, utime);
+       p->prev_stime = max(p->prev_stime, rtime - p->prev_utime);
+
+       *ut = p->prev_utime;
+       *st = p->prev_stime;
+}
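+
+/*
+ * Scaling example: if the tick-based samples say utime = 3 and stime = 1
+ * (total = 4) but CFS has accounted rtime = 8 ticks of execution, utime
+ * is scaled to 8 * 3 / 4 = 6 and the remaining 2 ticks show up as stime,
+ * so the precise runtime is preserved while keeping the sampled
+ * user/system ratio.
+ */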
+
+/*
+ * Must be called with siglock held.
+ */
+void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+       struct signal_struct *sig = p->signal;
+       struct task_cputime cputime;
+       cputime_t rtime, utime, total;
+
+       thread_group_cputime(p, &cputime);
+
+       total = cputime.utime + cputime.stime;
+       rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
+
+       if (total) {
+               u64 temp = (__force u64) rtime;
+
+               temp *= (__force u64) cputime.utime;
+               do_div(temp, (__force u32) total);
+               utime = (__force cputime_t) temp;
+       } else
+               utime = rtime;
+
+       sig->prev_utime = max(sig->prev_utime, utime);
+       sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime);
+
+       *ut = sig->prev_utime;
+       *st = sig->prev_stime;
+}
+#endif
+
+/*
+ * This function gets called by the timer code, with HZ frequency.
+ * We call it with interrupts disabled.
+ */
+void scheduler_tick(void)
+{
+       int cpu = smp_processor_id();
+       struct rq *rq = cpu_rq(cpu);
+       struct task_struct *curr = rq->curr;
+
+       sched_clock_tick();
+
+       raw_spin_lock(&rq->lock);
+       update_rq_clock(rq);
+       update_cpu_load_active(rq);
+       curr->sched_class->task_tick(rq, curr, 0);
+       raw_spin_unlock(&rq->lock);
+
+       perf_event_task_tick();
+
+#ifdef CONFIG_SMP
+       rq->idle_balance = idle_cpu(cpu);
+       trigger_load_balance(rq, cpu);
+#endif
+}
+
+notrace unsigned long get_parent_ip(unsigned long addr)
+{
+       if (in_lock_functions(addr)) {
+               addr = CALLER_ADDR2;
+               if (in_lock_functions(addr))
+                       addr = CALLER_ADDR3;
+       }
+       return addr;
+}
+
+#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
+                               defined(CONFIG_PREEMPT_TRACER))
+
+void __kprobes add_preempt_count(int val)
+{
+#ifdef CONFIG_DEBUG_PREEMPT
+       /*
+        * Underflow?
+        */
+       if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
+               return;
+#endif
+       preempt_count() += val;
+#ifdef CONFIG_DEBUG_PREEMPT
+       /*
+        * Spinlock count overflowing soon?
+        */
+       DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
+                               PREEMPT_MASK - 10);
+#endif
+       if (preempt_count() == val)
+               trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
+}
+EXPORT_SYMBOL(add_preempt_count);
+
+void __kprobes sub_preempt_count(int val)
+{
+#ifdef CONFIG_DEBUG_PREEMPT
+       /*
+        * Underflow?
+        */
+       if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
+               return;
+       /*
+        * Is the spinlock portion underflowing?
+        */
+       if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
+                       !(preempt_count() & PREEMPT_MASK)))
+               return;
+#endif
+
+       if (preempt_count() == val)
+               trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
+       preempt_count() -= val;
+}
+EXPORT_SYMBOL(sub_preempt_count);
+
+#endif
+
+/*
+ * Print scheduling while atomic bug:
+ */
+static noinline void __schedule_bug(struct task_struct *prev)
+{
+       struct pt_regs *regs = get_irq_regs();
+
+       if (oops_in_progress)
+               return;
+
+       printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
+               prev->comm, prev->pid, preempt_count());
+
+       debug_show_held_locks(prev);
+       print_modules();
+       if (irqs_disabled())
+               print_irqtrace_events(prev);
+
+       if (regs)
+               show_regs(regs);
+       else
+               dump_stack();
+}
+
+/*
+ * Various schedule()-time debugging checks and statistics:
+ */
+static inline void schedule_debug(struct task_struct *prev)
+{
+       /*
+        * Test if we are atomic. Since do_exit() needs to call into
+        * schedule() atomically, we ignore that path for now.
+        * Otherwise, whine if we are scheduling when we should not be.
+        */
+       if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
+               __schedule_bug(prev);
+       rcu_sleep_check();
+
+       profile_hit(SCHED_PROFILING, __builtin_return_address(0));
+
+       schedstat_inc(this_rq(), sched_count);
+}
+
+static void put_prev_task(struct rq *rq, struct task_struct *prev)
+{
+       if (prev->on_rq || rq->skip_clock_update < 0)
+               update_rq_clock(rq);
+       prev->sched_class->put_prev_task(rq, prev);
+}
+
+/*
+ * Pick up the highest-prio task:
+ */
+static inline struct task_struct *
+pick_next_task(struct rq *rq)
+{
+       const struct sched_class *class;
+       struct task_struct *p;
+
+       /*
+        * Optimization: we know that if all tasks are in
+        * the fair class we can call that function directly:
+        */
+       if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
+               p = fair_sched_class.pick_next_task(rq);
+               if (likely(p))
+                       return p;
+       }
+
+       for_each_class(class) {
+               p = class->pick_next_task(rq);
+               if (p)
+                       return p;
+       }
+
+       BUG(); /* the idle class will always have a runnable task */
+}
+
+/*
+ * __schedule() is the main scheduler function.
+ */
+static void __sched __schedule(void)
+{
+       struct task_struct *prev, *next;
+       unsigned long *switch_count;
+       struct rq *rq;
+       int cpu;
+
+need_resched:
+       preempt_disable();
+       cpu = smp_processor_id();
+       rq = cpu_rq(cpu);
+       rcu_note_context_switch(cpu);
+       prev = rq->curr;
+
+       schedule_debug(prev);
+
+       if (sched_feat(HRTICK))
+               hrtick_clear(rq);
+
+       raw_spin_lock_irq(&rq->lock);
+
+       switch_count = &prev->nivcsw;
+       if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
+               if (unlikely(signal_pending_state(prev->state, prev))) {
+                       prev->state = TASK_RUNNING;
+               } else {
+                       deactivate_task(rq, prev, DEQUEUE_SLEEP);
+                       prev->on_rq = 0;
+
+                       /*
+                        * If a worker went to sleep, notify and ask workqueue
+                        * whether it wants to wake up a task to maintain
+                        * concurrency.
+                        */
+                       if (prev->flags & PF_WQ_WORKER) {
+                               struct task_struct *to_wakeup;
+
+                               to_wakeup = wq_worker_sleeping(prev, cpu);
+                               if (to_wakeup)
+                                       try_to_wake_up_local(to_wakeup);
+                       }
+               }
+               switch_count = &prev->nvcsw;
+       }
+
+       pre_schedule(rq, prev);
+
+       if (unlikely(!rq->nr_running))
+               idle_balance(cpu, rq);
+
+       put_prev_task(rq, prev);
+       next = pick_next_task(rq);
+       clear_tsk_need_resched(prev);
+       rq->skip_clock_update = 0;
+
+       if (likely(prev != next)) {
+               rq->nr_switches++;
+               rq->curr = next;
+               ++*switch_count;
+
+               context_switch(rq, prev, next); /* unlocks the rq */
+               /*
+                * The context switch has flipped the stack from under us
+                * and restored the local variables which were saved when
+                * this task called schedule() in the past. prev == current
+                * is still correct, but it can be moved to another cpu/rq.
+                */
+               cpu = smp_processor_id();
+               rq = cpu_rq(cpu);
+       } else
+               raw_spin_unlock_irq(&rq->lock);
+
+       post_schedule(rq);
+
+       preempt_enable_no_resched();
+       if (need_resched())
+               goto need_resched;
+}
+
+static inline void sched_submit_work(struct task_struct *tsk)
+{
+       if (!tsk->state)
+               return;
+       /*
+        * If we are going to sleep and we have plugged IO queued,
+        * make sure to submit it to avoid deadlocks.
+        */
+       if (blk_needs_flush_plug(tsk))
+               blk_schedule_flush_plug(tsk);
+}
+
+asmlinkage void __sched schedule(void)
+{
+       struct task_struct *tsk = current;
+
+       sched_submit_work(tsk);
+       __schedule();
+}
+EXPORT_SYMBOL(schedule);
+
+#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
+
+static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
+{
+       if (lock->owner != owner)
+               return false;
+
+       /*
+        * Ensure we emit the owner->on_cpu dereference _after_ checking
+        * that lock->owner still matches owner. If that fails, owner might
+        * point to free()d memory; if it still matches, the rcu_read_lock()
+        * ensures the memory stays valid.
+        */
+       barrier();
+
+       return owner->on_cpu;
+}
+
+/*
+ * Look out! "owner" is an entirely speculative pointer
+ * access and not reliable.
+ */
+int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
+{
+       if (!sched_feat(OWNER_SPIN))
+               return 0;
+
+       rcu_read_lock();
+       while (owner_running(lock, owner)) {
+               if (need_resched())
+                       break;
+
+               arch_mutex_cpu_relax();
+       }
+       rcu_read_unlock();
+
+       /*
+        * We break out of the loop above on need_resched() or when the
+        * owner changes, which is a sign of heavy contention. Return
+        * success only when lock->owner is NULL.
+        */
+       return lock->owner == NULL;
+}
+#endif
+
+#ifdef CONFIG_PREEMPT
+/*
+ * This is the entry point to schedule() from in-kernel preemption
+ * off of preempt_enable. Kernel preemptions off the return-from-interrupt
+ * path are handled by preempt_schedule_irq() below and call the scheduler
+ * directly.
+ */
+asmlinkage void __sched notrace preempt_schedule(void)
+{
+       struct thread_info *ti = current_thread_info();
+
+       /*
+        * If there is a non-zero preempt_count or interrupts are disabled,
+        * we do not want to preempt the current task. Just return.
+        */
+       if (likely(ti->preempt_count || irqs_disabled()))
+               return;
+
+       do {
+               add_preempt_count_notrace(PREEMPT_ACTIVE);
+               __schedule();
+               sub_preempt_count_notrace(PREEMPT_ACTIVE);
+
+               /*
+                * Check again in case we missed a preemption opportunity
+                * between schedule and now.
+                */
+               barrier();
+       } while (need_resched());
+}
+EXPORT_SYMBOL(preempt_schedule);
+
+/*
+ * This is the entry point to schedule() from kernel preemption
+ * off of irq context.
+ * Note that this is called and returns with irqs disabled. This
+ * protects us against recursive calls from irq context.
+ */
+asmlinkage void __sched preempt_schedule_irq(void)
+{
+       struct thread_info *ti = current_thread_info();
+
+       /* Catch callers which need to be fixed */
+       BUG_ON(ti->preempt_count || !irqs_disabled());
+
+       do {
+               add_preempt_count(PREEMPT_ACTIVE);
+               local_irq_enable();
+               __schedule();
+               local_irq_disable();
+               sub_preempt_count(PREEMPT_ACTIVE);
+
+               /*
+                * Check again in case we missed a preemption opportunity
+                * between schedule and now.
+                */
+               barrier();
+       } while (need_resched());
+}
+
+#endif /* CONFIG_PREEMPT */
+
+int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
+                         void *key)
+{
+       return try_to_wake_up(curr->private, mode, wake_flags);
+}
+EXPORT_SYMBOL(default_wake_function);
+
+/*
+ * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
+ * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
+ * number) then we wake all the non-exclusive tasks and one exclusive task.
+ *
+ * There are circumstances in which we can try to wake a task which has already
+ * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
+ * zero in this (rare) case, and we handle it by continuing to scan the queue.
+ */
+static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
+                       int nr_exclusive, int wake_flags, void *key)
+{
+       wait_queue_t *curr, *next;
+
+       list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
+               unsigned flags = curr->flags;
+
+               if (curr->func(curr, mode, wake_flags, key) &&
+                               (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
+                       break;
+       }
+}
+
+/**
+ * __wake_up - wake up threads blocked on a waitqueue.
+ * @q: the waitqueue
+ * @mode: which threads
+ * @nr_exclusive: how many wake-one or wake-many threads to wake up
+ * @key: is directly passed to the wakeup function
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+void __wake_up(wait_queue_head_t *q, unsigned int mode,
+                       int nr_exclusive, void *key)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&q->lock, flags);
+       __wake_up_common(q, mode, nr_exclusive, 0, key);
+       spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(__wake_up);
+
+/*
+ * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
+ */
+void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
+{
+       __wake_up_common(q, mode, 1, 0, NULL);
+}
+EXPORT_SYMBOL_GPL(__wake_up_locked);
+
+void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
+{
+       __wake_up_common(q, mode, 1, 0, key);
+}
+EXPORT_SYMBOL_GPL(__wake_up_locked_key);
+
+/**
+ * __wake_up_sync_key - wake up threads blocked on a waitqueue.
+ * @q: the waitqueue
+ * @mode: which threads
+ * @nr_exclusive: how many wake-one or wake-many threads to wake up
+ * @key: opaque value to be passed to wakeup targets
+ *
+ * The sync wakeup differs in that the waker knows that it will schedule
+ * away soon, so while the target thread will be woken up, it will not
+ * be migrated to another CPU - i.e. the two threads are 'synchronized'
+ * with each other. This can prevent needless bouncing between CPUs.
+ *
+ * On UP it can prevent extra preemption.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
+                       int nr_exclusive, void *key)
+{
+       unsigned long flags;
+       int wake_flags = WF_SYNC;
+
+       if (unlikely(!q))
+               return;
+
+       if (unlikely(!nr_exclusive))
+               wake_flags = 0;
+
+       spin_lock_irqsave(&q->lock, flags);
+       __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
+       spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL_GPL(__wake_up_sync_key);
+
+/*
+ * __wake_up_sync - see __wake_up_sync_key()
+ */
+void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
+{
+       __wake_up_sync_key(q, mode, nr_exclusive, NULL);
+}
+EXPORT_SYMBOL_GPL(__wake_up_sync);     /* For internal use only */
+
+/**
+ * complete: - signals a single thread waiting on this completion
+ * @x:  holds the state of this particular completion
+ *
+ * This will wake up a single thread waiting on this completion. Threads will be
+ * awakened in the same order in which they were queued.
+ *
+ * See also complete_all(), wait_for_completion() and related routines.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+void complete(struct completion *x)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&x->wait.lock, flags);
+       x->done++;
+       __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
+       spin_unlock_irqrestore(&x->wait.lock, flags);
+}
+EXPORT_SYMBOL(complete);
+
+/**
+ * complete_all: - signals all threads waiting on this completion
+ * @x:  holds the state of this particular completion
+ *
+ * This will wake up all threads waiting on this particular completion event.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+void complete_all(struct completion *x)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&x->wait.lock, flags);
+       x->done += UINT_MAX/2;
+       __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
+       spin_unlock_irqrestore(&x->wait.lock, flags);
+}
+EXPORT_SYMBOL(complete_all);
+
+static inline long __sched
+do_wait_for_common(struct completion *x, long timeout, int state)
+{
+       if (!x->done) {
+               DECLARE_WAITQUEUE(wait, current);
+
+               __add_wait_queue_tail_exclusive(&x->wait, &wait);
+               do {
+                       if (signal_pending_state(state, current)) {
+                               timeout = -ERESTARTSYS;
+                               break;
+                       }
+                       __set_current_state(state);
+                       spin_unlock_irq(&x->wait.lock);
+                       timeout = schedule_timeout(timeout);
+                       spin_lock_irq(&x->wait.lock);
+               } while (!x->done && timeout);
+               __remove_wait_queue(&x->wait, &wait);
+               if (!x->done)
+                       return timeout;
+       }
+       x->done--;
+       return timeout ?: 1;
+}
+
+static long __sched
+wait_for_common(struct completion *x, long timeout, int state)
+{
+       might_sleep();
+
+       spin_lock_irq(&x->wait.lock);
+       timeout = do_wait_for_common(x, timeout, state);
+       spin_unlock_irq(&x->wait.lock);
+       return timeout;
+}
+
+/**
+ * wait_for_completion: - waits for completion of a task
+ * @x:  holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It is NOT
+ * interruptible and there is no timeout.
+ *
+ * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
+ * and interrupt capability. Also see complete().
+ */
+void __sched wait_for_completion(struct completion *x)
+{
+       wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion);
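+
+/*
+ * Typical usage sketch (the names below are purely illustrative):
+ *
+ *	static DECLARE_COMPLETION(setup_done);
+ *
+ *	Signalling side:  finish the setup work, then complete(&setup_done);
+ *	Waiting side:     wait_for_completion(&setup_done);
+ *
+ * wait_for_completion_timeout() and the _interruptible/_killable variants
+ * below follow the same pattern.
+ */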
+
+/**
+ * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
+ * @x:  holds the state of this particular completion
+ * @timeout:  timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. The timeout is in jiffies. It is not
+ * interruptible.
+ *
+ * The return value is 0 if timed out, and positive (at least 1, or number of
+ * jiffies left till timeout) if completed.
+ */
+unsigned long __sched
+wait_for_completion_timeout(struct completion *x, unsigned long timeout)
+{
+       return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_timeout);
+
+/**
+ * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
+ * @x:  holds the state of this particular completion
+ *
+ * This waits for completion of a specific task to be signaled. It is
+ * interruptible.
+ *
+ * The return value is -ERESTARTSYS if interrupted, 0 if completed.
+ */
+int __sched wait_for_completion_interruptible(struct completion *x)
+{
+       long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
+       if (t == -ERESTARTSYS)
+               return t;
+       return 0;
+}
+EXPORT_SYMBOL(wait_for_completion_interruptible);
+
+/**
+ * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
+ * @x:  holds the state of this particular completion
+ * @timeout:  timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. It is interruptible. The timeout is in jiffies.
+ *
+ * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
+ * positive (at least 1, or number of jiffies left till timeout) if completed.
+ */
+long __sched
+wait_for_completion_interruptible_timeout(struct completion *x,
+                                         unsigned long timeout)
+{
+       return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
+
+/**
+ * wait_for_completion_killable: - waits for completion of a task (killable)
+ * @x:  holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It can be
+ * interrupted by a kill signal.
+ *
+ * The return value is -ERESTARTSYS if interrupted, 0 if completed.
+ */
+int __sched wait_for_completion_killable(struct completion *x)
+{
+       long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
+       if (t == -ERESTARTSYS)
+               return t;
+       return 0;
+}
+EXPORT_SYMBOL(wait_for_completion_killable);
+
+/**
+ * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
+ * @x:  holds the state of this particular completion
+ * @timeout:  timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be
+ * signaled or for a specified timeout to expire. It can be
+ * interrupted by a kill signal. The timeout is in jiffies.
+ *
+ * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
+ * positive (at least 1, or number of jiffies left till timeout) if completed.
+ */
+long __sched
+wait_for_completion_killable_timeout(struct completion *x,
+                                    unsigned long timeout)
+{
+       return wait_for_common(x, timeout, TASK_KILLABLE);
+}
+EXPORT_SYMBOL(wait_for_completion_killable_timeout);
+
+/**
+ *     try_wait_for_completion - try to decrement a completion without blocking
+ *     @x:     completion structure
+ *
+ *     Returns: 0 if a decrement cannot be done without blocking
+ *              1 if a decrement succeeded.
+ *
+ *     If a completion is being used as a counting completion,
+ *     attempt to decrement the counter without blocking. This
+ *     enables us to avoid waiting if the resource the completion
+ *     is protecting is not available.
+ */
+bool try_wait_for_completion(struct completion *x)
+{
+       unsigned long flags;
+       int ret = 1;
+
+       spin_lock_irqsave(&x->wait.lock, flags);
+       if (!x->done)
+               ret = 0;
+       else
+               x->done--;
+       spin_unlock_irqrestore(&x->wait.lock, flags);
+       return ret;
+}
+EXPORT_SYMBOL(try_wait_for_completion);
+
+/**
+ *     completion_done - Test to see if a completion has any waiters
+ *     @x:     completion structure
+ *
+ *     Returns: 0 if there are waiters (wait_for_completion() in progress)
+ *              1 if there are no waiters.
+ *
+ */
+bool completion_done(struct completion *x)
+{
+       unsigned long flags;
+       int ret = 1;
+
+       spin_lock_irqsave(&x->wait.lock, flags);
+       if (!x->done)
+               ret = 0;
+       spin_unlock_irqrestore(&x->wait.lock, flags);
+       return ret;
+}
+EXPORT_SYMBOL(completion_done);
+
+static long __sched
+sleep_on_common(wait_queue_head_t *q, int state, long timeout)
+{
+       unsigned long flags;
+       wait_queue_t wait;
+
+       init_waitqueue_entry(&wait, current);
+
+       __set_current_state(state);
+
+       spin_lock_irqsave(&q->lock, flags);
+       __add_wait_queue(q, &wait);
+       spin_unlock(&q->lock);
+       timeout = schedule_timeout(timeout);
+       spin_lock_irq(&q->lock);
+       __remove_wait_queue(q, &wait);
+       spin_unlock_irqrestore(&q->lock, flags);
+
+       return timeout;
+}
+
+void __sched interruptible_sleep_on(wait_queue_head_t *q)
+{
+       sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
+}
+EXPORT_SYMBOL(interruptible_sleep_on);
+
+long __sched
+interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
+{
+       return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
+}
+EXPORT_SYMBOL(interruptible_sleep_on_timeout);
+
+void __sched sleep_on(wait_queue_head_t *q)
+{
+       sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
+}
+EXPORT_SYMBOL(sleep_on);
+
+long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
+{
+       return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
+}
+EXPORT_SYMBOL(sleep_on_timeout);
+
+#ifdef CONFIG_RT_MUTEXES
+
+/*
+ * rt_mutex_setprio - set the current priority of a task
+ * @p: task
+ * @prio: prio value (kernel-internal form)
+ *
+ * This function changes the 'effective' priority of a task. It does
+ * not touch ->normal_prio like __setscheduler().
+ *
+ * Used by the rt_mutex code to implement priority inheritance logic.
+ */
+void rt_mutex_setprio(struct task_struct *p, int prio)
+{
+       int oldprio, on_rq, running;
+       struct rq *rq;
+       const struct sched_class *prev_class;
+
+       BUG_ON(prio < 0 || prio > MAX_PRIO);
+
+       rq = __task_rq_lock(p);
+
+       trace_sched_pi_setprio(p, prio);
+       oldprio = p->prio;
+       prev_class = p->sched_class;
+       on_rq = p->on_rq;
+       running = task_current(rq, p);
+       if (on_rq)
+               dequeue_task(rq, p, 0);
+       if (running)
+               p->sched_class->put_prev_task(rq, p);
+
+       if (rt_prio(prio))
+               p->sched_class = &rt_sched_class;
+       else
+               p->sched_class = &fair_sched_class;
+
+       p->prio = prio;
+
+       if (running)
+               p->sched_class->set_curr_task(rq);
+       if (on_rq)
+               enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
+
+       check_class_changed(rq, p, prev_class, oldprio);
+       __task_rq_unlock(rq);
+}
+
+#endif
+
+void set_user_nice(struct task_struct *p, long nice)
+{
+       int old_prio, delta, on_rq;
+       unsigned long flags;
+       struct rq *rq;
+
+       if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
+               return;
+       /*
+        * We have to be careful: if called from sys_setpriority(),
+        * the task might be in the middle of scheduling on another CPU.
+        */
+       rq = task_rq_lock(p, &flags);
+       /*
+        * The RT priorities are set via sched_setscheduler(), but we still
+        * allow the 'normal' nice value to be set - but as expected
+        * it won't have any effect on scheduling while the task's policy
+        * is SCHED_FIFO/SCHED_RR:
+        */
+       if (task_has_rt_policy(p)) {
+               p->static_prio = NICE_TO_PRIO(nice);
+               goto out_unlock;
+       }
+       on_rq = p->on_rq;
+       if (on_rq)
+               dequeue_task(rq, p, 0);
+
+       p->static_prio = NICE_TO_PRIO(nice);
+       set_load_weight(p);
+       old_prio = p->prio;
+       p->prio = effective_prio(p);
+       delta = p->prio - old_prio;
+
+       if (on_rq) {
+               enqueue_task(rq, p, 0);
+               /*
+                * If the task increased its priority or is running and
+                * lowered its priority, then reschedule its CPU:
+                */
+               if (delta < 0 || (delta > 0 && task_running(rq, p)))
+                       resched_task(rq->curr);
+       }
+out_unlock:
+       task_rq_unlock(rq, p, &flags);
+}
+EXPORT_SYMBOL(set_user_nice);
+
+/*
+ * can_nice - check if a task can reduce its nice value
+ * @p: task
+ * @nice: nice value
+ */
+int can_nice(const struct task_struct *p, const int nice)
+{
+       /* convert nice value [19,-20] to rlimit style value [1,40] */
+       int nice_rlim = 20 - nice;
+
+       return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
+               capable(CAP_SYS_NICE));
+}
+
+#ifdef __ARCH_WANT_SYS_NICE
+
+/*
+ * sys_nice - change the priority of the current process.
+ * @increment: priority increment
+ *
+ * sys_setpriority is a more generic, but much slower function that
+ * does similar things.
+ */
+SYSCALL_DEFINE1(nice, int, increment)
+{
+       long nice, retval;
+
+       /*
+        * Setpriority might change our priority at the same moment.
+        * We don't have to worry. Conceptually one call occurs first
+        * and we have a single winner.
+        */
+       if (increment < -40)
+               increment = -40;
+       if (increment > 40)
+               increment = 40;
+
+       nice = TASK_NICE(current) + increment;
+       if (nice < -20)
+               nice = -20;
+       if (nice > 19)
+               nice = 19;
+
+       if (increment < 0 && !can_nice(current, nice))
+               return -EPERM;
+
+       retval = security_task_setnice(current, nice);
+       if (retval)
+               return retval;
+
+       set_user_nice(current, nice);
+       return 0;
+}
+
+#endif
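
Seen from user space through the libc wrapper, the clamping above limits the increment to [-40, 40] and the resulting nice value to [-20, 19], and lowering nice requires RLIMIT_NICE headroom or CAP_SYS_NICE. A small illustrative program (error handling kept minimal):

#include <errno.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        errno = 0;
        int val = nice(5);              /* becoming nicer is always allowed */
        if (val == -1 && errno)
                perror("nice(+5)");
        else
                printf("nice is now %d\n", val);

        errno = 0;
        val = nice(-10);                /* raising priority may need privilege */
        if (val == -1 && errno)
                perror("nice(-10)");    /* typically EPERM without CAP_SYS_NICE */
        else
                printf("nice is now %d\n", val);
        return 0;
}
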
+
+/**
+ * task_prio - return the priority value of a given task.
+ * @p: the task in question.
+ *
+ * This is the priority value as seen by users in /proc.
+ * RT tasks map to [-100 ... -1] (p->prio offset by -MAX_RT_PRIO);
+ * normal tasks map to [0 ... 39], corresponding to nice -20 ... +19.
+ */
+int task_prio(const struct task_struct *p)
+{
+       return p->prio - MAX_RT_PRIO;
+}
+
+/**
+ * task_nice - return the nice value of a given task.
+ * @p: the task in question.
+ */
+int task_nice(const struct task_struct *p)
+{
+       return TASK_NICE(p);
+}
+EXPORT_SYMBOL(task_nice);
+
+/**
+ * idle_cpu - is a given cpu idle currently?
+ * @cpu: the processor in question.
+ */
+int idle_cpu(int cpu)
+{
+       struct rq *rq = cpu_rq(cpu);
+
+       if (rq->curr != rq->idle)
+               return 0;
+
+       if (rq->nr_running)
+               return 0;
+
+#ifdef CONFIG_SMP
+       if (!llist_empty(&rq->wake_list))
+               return 0;
+#endif
+
+       return 1;
+}
+
+/**
+ * idle_task - return the idle task for a given cpu.
+ * @cpu: the processor in question.
+ */
+struct task_struct *idle_task(int cpu)
+{
+       return cpu_rq(cpu)->idle;
+}
+
+/**
+ * find_process_by_pid - find a process with a matching PID value.
+ * @pid: the pid in question.
+ */
+static struct task_struct *find_process_by_pid(pid_t pid)
+{
+       return pid ? find_task_by_vpid(pid) : current;
+}
+
+/* Actually do priority change: must hold rq lock. */
+static void
+__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
+{
+       p->policy = policy;
+       p->rt_priority = prio;
+       p->normal_prio = normal_prio(p);
+       /* we are holding p->pi_lock already */
+       p->prio = rt_mutex_getprio(p);
+       if (rt_prio(p->prio))
+               p->sched_class = &rt_sched_class;
+       else
+               p->sched_class = &fair_sched_class;
+       set_load_weight(p);
+}
+
+/*
+ * check whether the target process has a UID that matches the current process's
+ */
+static bool check_same_owner(struct task_struct *p)
+{
+       const struct cred *cred = current_cred(), *pcred;
+       bool match;
+
+       rcu_read_lock();
+       pcred = __task_cred(p);
+       if (cred->user->user_ns == pcred->user->user_ns)
+               match = (cred->euid == pcred->euid ||
+                        cred->euid == pcred->uid);
+       else
+               match = false;
+       rcu_read_unlock();
+       return match;
+}
+
+static int __sched_setscheduler(struct task_struct *p, int policy,
+                               const struct sched_param *param, bool user)
+{
+       int retval, oldprio, oldpolicy = -1, on_rq, running;
+       unsigned long flags;
+       const struct sched_class *prev_class;
+       struct rq *rq;
+       int reset_on_fork;
+
+       /* may grab non-irq protected spin_locks */
+       BUG_ON(in_interrupt());
+recheck:
+       /* double check policy once rq lock held */
+       if (policy < 0) {
+               reset_on_fork = p->sched_reset_on_fork;
+               policy = oldpolicy = p->policy;
+       } else {
+               reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
+               policy &= ~SCHED_RESET_ON_FORK;
+
+               if (policy != SCHED_FIFO && policy != SCHED_RR &&
+                               policy != SCHED_NORMAL && policy != SCHED_BATCH &&
+                               policy != SCHED_IDLE)
+                       return -EINVAL;
+       }
+
+       /*
+        * Valid priorities for SCHED_FIFO and SCHED_RR are
+        * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
+        * SCHED_BATCH and SCHED_IDLE is 0.
+        */
+       if (param->sched_priority < 0 ||
+           (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
+           (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
+               return -EINVAL;
+       if (rt_policy(policy) != (param->sched_priority != 0))
+               return -EINVAL;
+
+       /*
+        * Allow unprivileged RT tasks to decrease priority:
+        */
+       if (user && !capable(CAP_SYS_NICE)) {
+               if (rt_policy(policy)) {
+                       unsigned long rlim_rtprio =
+                                       task_rlimit(p, RLIMIT_RTPRIO);
+
+                       /* can't set/change the rt policy */
+                       if (policy != p->policy && !rlim_rtprio)
+                               return -EPERM;
+
+                       /* can't increase priority */
+                       if (param->sched_priority > p->rt_priority &&
+                           param->sched_priority > rlim_rtprio)
+                               return -EPERM;
+               }
+
+               /*
+                * Treat SCHED_IDLE as nice 20. Only allow a switch to
+                * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
+                */
+               if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
+                       if (!can_nice(p, TASK_NICE(p)))
+                               return -EPERM;
+               }
+
+               /* can't change other user's priorities */
+               if (!check_same_owner(p))
+                       return -EPERM;
+
+               /* Normal users shall not reset the sched_reset_on_fork flag */
+               if (p->sched_reset_on_fork && !reset_on_fork)
+                       return -EPERM;
+       }
+
+       if (user) {
+               retval = security_task_setscheduler(p);
+               if (retval)
+                       return retval;
+       }
+
+       /*
+        * make sure no PI-waiters arrive (or leave) while we are
+        * changing the priority of the task:
+        *
+        * To be able to change p->policy safely, the appropriate
+        * runqueue lock must be held.
+        */
+       rq = task_rq_lock(p, &flags);
+
+       /*
+        * Changing the policy of the stop thread is a very bad idea
+        */
+       if (p == rq->stop) {
+               task_rq_unlock(rq, p, &flags);
+               return -EINVAL;
+       }
+
+       /*
+        * If not changing anything there's no need to proceed further:
+        */
+       if (unlikely(policy == p->policy && (!rt_policy(policy) ||
+                       param->sched_priority == p->rt_priority))) {
+
+               __task_rq_unlock(rq);
+               raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+               return 0;
+       }
+
+#ifdef CONFIG_RT_GROUP_SCHED
+       if (user) {
+               /*
+                * Do not allow realtime tasks into groups that have no runtime
+                * assigned.
+                */
+               if (rt_bandwidth_enabled() && rt_policy(policy) &&
+                               task_group(p)->rt_bandwidth.rt_runtime == 0 &&
+                               !task_group_is_autogroup(task_group(p))) {
+                       task_rq_unlock(rq, p, &flags);
+                       return -EPERM;
+               }
+       }
+#endif
+
+       /* recheck policy now with rq lock held */
+       if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
+               policy = oldpolicy = -1;
+               task_rq_unlock(rq, p, &flags);
+               goto recheck;
+       }
+       on_rq = p->on_rq;
+       running = task_current(rq, p);
+       if (on_rq)
+               deactivate_task(rq, p, 0);
+       if (running)
+               p->sched_class->put_prev_task(rq, p);
+
+       p->sched_reset_on_fork = reset_on_fork;
+
+       oldprio = p->prio;
+       prev_class = p->sched_class;
+       __setscheduler(rq, p, policy, param->sched_priority);
+
+       if (running)
+               p->sched_class->set_curr_task(rq);
+       if (on_rq)
+               activate_task(rq, p, 0);
+
+       check_class_changed(rq, p, prev_class, oldprio);
+       task_rq_unlock(rq, p, &flags);
+
+       rt_mutex_adjust_pi(p);
+
+       return 0;
+}
+
+/**
+ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
+ * @p: the task in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * NOTE that the task may already be dead.
+ */
+int sched_setscheduler(struct task_struct *p, int policy,
+                      const struct sched_param *param)
+{
+       return __sched_setscheduler(p, policy, param, true);
+}
+EXPORT_SYMBOL_GPL(sched_setscheduler);
+
+/**
+ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
+ * @p: the task in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * Just like sched_setscheduler, only don't bother checking if the
+ * current context has permission.  For example, this is needed in
+ * stop_machine(): we create temporary high priority worker threads,
+ * but our caller might not have that capability.
+ */
+int sched_setscheduler_nocheck(struct task_struct *p, int policy,
+                              const struct sched_param *param)
+{
+       return __sched_setscheduler(p, policy, param, false);
+}
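
As a sketch of the in-kernel use the comment above alludes to, a (hypothetical) helper kthread can be promoted to SCHED_FIFO without the permission checks; the kthread name and worker function are made up for illustration:

#include <linux/kthread.h>
#include <linux/sched.h>

static int my_worker_fn(void *data)     /* hypothetical worker loop */
{
        while (!kthread_should_stop())
                schedule_timeout_interruptible(HZ);
        return 0;
}

static struct task_struct *start_rt_worker(void)
{
        struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
        struct task_struct *tsk = kthread_create(my_worker_fn, NULL, "my_rt_worker");

        if (!IS_ERR(tsk)) {
                /* kernel-internal caller: skip the capability/rlimit checks */
                sched_setscheduler_nocheck(tsk, SCHED_FIFO, &param);
                wake_up_process(tsk);
        }
        return tsk;
}
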
+
+static int
+do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
+{
+       struct sched_param lparam;
+       struct task_struct *p;
+       int retval;
+
+       if (!param || pid < 0)
+               return -EINVAL;
+       if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
+               return -EFAULT;
+
+       rcu_read_lock();
+       retval = -ESRCH;
+       p = find_process_by_pid(pid);
+       if (p != NULL)
+               retval = sched_setscheduler(p, policy, &lparam);
+       rcu_read_unlock();
+
+       return retval;
+}
+
+/**
+ * sys_sched_setscheduler - set/change the scheduler policy and RT priority
+ * @pid: the pid in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ */
+SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
+               struct sched_param __user *, param)
+{
+       /* negative values for policy are not valid */
+       if (policy < 0)
+               return -EINVAL;
+
+       return do_sched_setscheduler(pid, policy, param);
+}
+
+/**
+ * sys_sched_setparam - set/change the RT priority of a thread
+ * @pid: the pid in question.
+ * @param: structure containing the new RT priority.
+ */
+SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
+{
+       return do_sched_setscheduler(pid, -1, param);
+}
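
From user space these two entry points are reached through sched_setscheduler(2) and sched_setparam(2); a minimal sketch making the calling process SCHED_RR at priority 10 (this needs CAP_SYS_NICE or RLIMIT_RTPRIO headroom, as enforced above):

#include <sched.h>
#include <stdio.h>

int main(void)
{
        struct sched_param sp = { .sched_priority = 10 };

        /* pid 0 means "the calling process" (see find_process_by_pid()). */
        if (sched_setscheduler(0, SCHED_RR, &sp) == -1) {
                perror("sched_setscheduler");   /* e.g. EPERM without privilege */
                return 1;
        }
        printf("policy is now %d\n", sched_getscheduler(0));
        return 0;
}
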
+
+/**
+ * sys_sched_getscheduler - get the policy (scheduling class) of a thread
+ * @pid: the pid in question.
+ */
+SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
+{
+       struct task_struct *p;
+       int retval;
+
+       if (pid < 0)
+               return -EINVAL;
+
+       retval = -ESRCH;
+       rcu_read_lock();
+       p = find_process_by_pid(pid);
+       if (p) {
+               retval = security_task_getscheduler(p);
+               if (!retval)
+                       retval = p->policy
+                               | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
+       }
+       rcu_read_unlock();
+       return retval;
+}
+
+/**
+ * sys_sched_getparam - get the RT priority of a thread
+ * @pid: the pid in question.
+ * @param: structure containing the RT priority.
+ */
+SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
+{
+       struct sched_param lp;
+       struct task_struct *p;
+       int retval;
+
+       if (!param || pid < 0)
+               return -EINVAL;
+
+       rcu_read_lock();
+       p = find_process_by_pid(pid);
+       retval = -ESRCH;
+       if (!p)
+               goto out_unlock;
+
+       retval = security_task_getscheduler(p);
+       if (retval)
+               goto out_unlock;
+
+       lp.sched_priority = p->rt_priority;
+       rcu_read_unlock();
+
+       /*
+        * This one might sleep; we cannot do it with a spinlock held ...
+        */
+       retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
+
+       return retval;
+
+out_unlock:
+       rcu_read_unlock();
+       return retval;
+}
+
+long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
+{
+       cpumask_var_t cpus_allowed, new_mask;
+       struct task_struct *p;
+       int retval;
+
+       get_online_cpus();
+       rcu_read_lock();
+
+       p = find_process_by_pid(pid);
+       if (!p) {
+               rcu_read_unlock();
+               put_online_cpus();
+               return -ESRCH;
+       }
+
+       /* Prevent p going away */
+       get_task_struct(p);
+       rcu_read_unlock();
+
+       if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
+               retval = -ENOMEM;
+               goto out_put_task;
+       }
+       if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
+               retval = -ENOMEM;
+               goto out_free_cpus_allowed;
+       }
+       retval = -EPERM;
+       if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE))
+               goto out_unlock;
+
+       retval = security_task_setscheduler(p);
+       if (retval)
+               goto out_unlock;
+
+       cpuset_cpus_allowed(p, cpus_allowed);
+       cpumask_and(new_mask, in_mask, cpus_allowed);
+again:
+       retval = set_cpus_allowed_ptr(p, new_mask);
+
+       if (!retval) {
+               cpuset_cpus_allowed(p, cpus_allowed);
+               if (!cpumask_subset(new_mask, cpus_allowed)) {
+                       /*
+                        * We must have raced with a concurrent cpuset
+                        * update. Just reset the cpus_allowed to the
+                        * cpuset's cpus_allowed
+                        */
+                       cpumask_copy(new_mask, cpus_allowed);
+                       goto again;
+               }
+       }
+out_unlock:
+       free_cpumask_var(new_mask);
+out_free_cpus_allowed:
+       free_cpumask_var(cpus_allowed);
+out_put_task:
+       put_task_struct(p);
+       put_online_cpus();
+       return retval;
+}
+
+static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
+                            struct cpumask *new_mask)
+{
+       if (len < cpumask_size())
+               cpumask_clear(new_mask);
+       else if (len > cpumask_size())
+               len = cpumask_size();
+
+       return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
+}
+
+/**
+ * sys_sched_setaffinity - set the cpu affinity of a process
+ * @pid: pid of the process
+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr
+ * @user_mask_ptr: user-space pointer to the new cpu mask
+ */
+SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
+               unsigned long __user *, user_mask_ptr)
+{
+       cpumask_var_t new_mask;
+       int retval;
+
+       if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
+               return -ENOMEM;
+
+       retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
+       if (retval == 0)
+               retval = sched_setaffinity(pid, new_mask);
+       free_cpumask_var(new_mask);
+       return retval;
+}
+
+long sched_getaffinity(pid_t pid, struct cpumask *mask)
+{
+       struct task_struct *p;
+       unsigned long flags;
+       int retval;
+
+       get_online_cpus();
+       rcu_read_lock();
+
+       retval = -ESRCH;
+       p = find_process_by_pid(pid);
+       if (!p)
+               goto out_unlock;
+
+       retval = security_task_getscheduler(p);
+       if (retval)
+               goto out_unlock;
+
+       raw_spin_lock_irqsave(&p->pi_lock, flags);
+       cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
+       raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+out_unlock:
+       rcu_read_unlock();
+       put_online_cpus();
+
+       return retval;
+}
+
+/**
+ * sys_sched_getaffinity - get the cpu affinity of a process
+ * @pid: pid of the process
+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr
+ * @user_mask_ptr: user-space pointer to hold the current cpu mask
+ */
+SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
+               unsigned long __user *, user_mask_ptr)
+{
+       int ret;
+       cpumask_var_t mask;
+
+       if ((len * BITS_PER_BYTE) < nr_cpu_ids)
+               return -EINVAL;
+       if (len & (sizeof(unsigned long)-1))
+               return -EINVAL;
+
+       if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+               return -ENOMEM;
+
+       ret = sched_getaffinity(pid, mask);
+       if (ret == 0) {
+               size_t retlen = min_t(size_t, len, cpumask_size());
+
+               if (copy_to_user(user_mask_ptr, mask, retlen))
+                       ret = -EFAULT;
+               else
+                       ret = retlen;
+       }
+       free_cpumask_var(mask);
+
+       return ret;
+}
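
Note the asymmetry visible above: the raw syscall returns the number of bytes copied on success, and the glibc wrapper turns that into 0. A small user-space sketch pinning the caller to CPU 0 and reading the mask back:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
        cpu_set_t set;

        CPU_ZERO(&set);
        CPU_SET(0, &set);
        if (sched_setaffinity(0, sizeof(set), &set) == -1) {
                perror("sched_setaffinity");
                return 1;
        }

        CPU_ZERO(&set);
        if (sched_getaffinity(0, sizeof(set), &set) == -1) {
                perror("sched_getaffinity");
                return 1;
        }
        printf("allowed CPUs: %d\n", CPU_COUNT(&set));
        return 0;
}
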
+
+/**
+ * sys_sched_yield - yield the current processor to other threads.
+ *
+ * This function yields the current CPU to other tasks. If there are no
+ * other threads running on this CPU then this function will return.
+ */
+SYSCALL_DEFINE0(sched_yield)
+{
+       struct rq *rq = this_rq_lock();
+
+       schedstat_inc(rq, yld_count);
+       current->sched_class->yield_task(rq);
+
+       /*
+        * Since we are going to call schedule() anyway, there's
+        * no need to preempt or enable interrupts:
+        */
+       __release(rq->lock);
+       spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
+       do_raw_spin_unlock(&rq->lock);
+       preempt_enable_no_resched();
+
+       schedule();
+
+       return 0;
+}
+
+static inline int should_resched(void)
+{
+       return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
+}
+
+static void __cond_resched(void)
+{
+       add_preempt_count(PREEMPT_ACTIVE);
+       __schedule();
+       sub_preempt_count(PREEMPT_ACTIVE);
+}
+
+int __sched _cond_resched(void)
+{
+       if (should_resched()) {
+               __cond_resched();
+               return 1;
+       }
+       return 0;
+}
+EXPORT_SYMBOL(_cond_resched);
+
+/*
+ * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
+ * call schedule, and on return reacquire the lock.
+ *
+ * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
+ * operations here to prevent schedule() from being called twice (once via
+ * spin_unlock(), once by hand).
+ */
+int __cond_resched_lock(spinlock_t *lock)
+{
+       int resched = should_resched();
+       int ret = 0;
+
+       lockdep_assert_held(lock);
+
+       if (spin_needbreak(lock) || resched) {
+               spin_unlock(lock);
+               if (resched)
+                       __cond_resched();
+               else
+                       cpu_relax();
+               ret = 1;
+               spin_lock(lock);
+       }
+       return ret;
+}
+EXPORT_SYMBOL(__cond_resched_lock);
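
The usual caller reaches this through the cond_resched_lock() wrapper while walking a long list under a spinlock; a hedged sketch where the lock, list and item type are hypothetical:

#include <linux/list.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(my_lock);        /* hypothetical */
static LIST_HEAD(my_items);             /* hypothetical */

struct my_item {                        /* hypothetical */
        struct list_head node;
};

static void scan_items(void)
{
        struct my_item *it;

        spin_lock(&my_lock);
        list_for_each_entry(it, &my_items, node) {
                /* ... cheap per-item work ... */

                /*
                 * If a reschedule is due or someone is spinning on the
                 * lock, this drops it, possibly schedules, and retakes
                 * it.  The list may have changed meanwhile, so bail out
                 * and let the caller restart the scan.
                 */
                if (cond_resched_lock(&my_lock))
                        break;
        }
        spin_unlock(&my_lock);
}
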
+
+int __sched __cond_resched_softirq(void)
+{
+       BUG_ON(!in_softirq());
+
+       if (should_resched()) {
+               local_bh_enable();
+               __cond_resched();
+               local_bh_disable();
+               return 1;
+       }
+       return 0;
+}
+EXPORT_SYMBOL(__cond_resched_softirq);
+
+/**
+ * yield - yield the current processor to other threads.
+ *
+ * This is a shortcut for kernel-space yielding - it marks the
+ * thread runnable and calls sys_sched_yield().
+ */
+void __sched yield(void)
+{
+       set_current_state(TASK_RUNNING);
+       sys_sched_yield();
+}
+EXPORT_SYMBOL(yield);
+
+/**
+ * yield_to - yield the current processor to another thread in
+ * your thread group, or accelerate that thread toward the
+ * processor it's on.
+ * @p: target task
+ * @preempt: whether task preemption is allowed or not
+ *
+ * It's the caller's job to ensure that the target task struct
+ * can't go away on us before we can do any checks.
+ *
+ * Returns true if we indeed boosted the target task.
+ */
+bool __sched yield_to(struct task_struct *p, bool preempt)
+{
+       struct task_struct *curr = current;
+       struct rq *rq, *p_rq;
+       unsigned long flags;
+       bool yielded = 0;
+
+       local_irq_save(flags);
+       rq = this_rq();
+
+again:
+       p_rq = task_rq(p);
+       double_rq_lock(rq, p_rq);
+       while (task_rq(p) != p_rq) {
+               double_rq_unlock(rq, p_rq);
+               goto again;
+       }
+
+       if (!curr->sched_class->yield_to_task)
+               goto out;
+
+       if (curr->sched_class != p->sched_class)
+               goto out;
+
+       if (task_running(p_rq, p) || p->state)
+               goto out;
+
+       yielded = curr->sched_class->yield_to_task(rq, p, preempt);
+       if (yielded) {
+               schedstat_inc(rq, yld_count);
+               /*
+                * Make p's CPU reschedule; pick_next_entity takes care of
+                * fairness.
+                */
+               if (preempt && rq != p_rq)
+                       resched_task(p_rq->curr);
+       } else {
+               /*
+                * We might have set it in task_yield_fair(), but are
+                * not going to schedule(), so don't want to skip
+                * the next update.
+                */
+               rq->skip_clock_update = 0;
+       }
+
+out:
+       double_rq_unlock(rq, p_rq);
+       local_irq_restore(flags);
+
+       if (yielded)
+               schedule();
+
+       return yielded;
+}
+EXPORT_SYMBOL_GPL(yield_to);
+
+/*
+ * This task is about to go to sleep on IO. Increment rq->nr_iowait so
+ * that process accounting knows that this is a task in IO wait state.
+ */
+void __sched io_schedule(void)
+{
+       struct rq *rq = raw_rq();
+
+       delayacct_blkio_start();
+       atomic_inc(&rq->nr_iowait);
+       blk_flush_plug(current);
+       current->in_iowait = 1;
+       schedule();
+       current->in_iowait = 0;
+       atomic_dec(&rq->nr_iowait);
+       delayacct_blkio_end();
+}
+EXPORT_SYMBOL(io_schedule);
+
+long __sched io_schedule_timeout(long timeout)
+{
+       struct rq *rq = raw_rq();
+       long ret;
+
+       delayacct_blkio_start();
+       atomic_inc(&rq->nr_iowait);
+       blk_flush_plug(current);
+       current->in_iowait = 1;
+       ret = schedule_timeout(timeout);
+       current->in_iowait = 0;
+       atomic_dec(&rq->nr_iowait);
+       delayacct_blkio_end();
+       return ret;
+}
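
For illustration, the usual shape of an io_schedule() caller: set the task state, re-check the condition, then sleep so the time is charged to iowait rather than plain sleep. The wait queue and flag below are hypothetical:

#include <linux/sched.h>
#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(io_waitq);       /* hypothetical */
static int io_done;                             /* hypothetical, set from the completion path */

static void wait_for_my_io(void)
{
        DEFINE_WAIT(wait);

        for (;;) {
                prepare_to_wait(&io_waitq, &wait, TASK_UNINTERRUPTIBLE);
                if (io_done)
                        break;
                io_schedule();  /* accounted as iowait, unlike plain schedule() */
        }
        finish_wait(&io_waitq, &wait);
}
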
+
+/**
+ * sys_sched_get_priority_max - return maximum RT priority.
+ * @policy: scheduling class.
+ *
+ * This syscall returns the maximum rt_priority that can be used
+ * by a given scheduling class.
+ */
+SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
+{
+       int ret = -EINVAL;
+
+       switch (policy) {
+       case SCHED_FIFO:
+       case SCHED_RR:
+               ret = MAX_USER_RT_PRIO-1;
+               break;
+       case SCHED_NORMAL:
+       case SCHED_BATCH:
+       case SCHED_IDLE:
+               ret = 0;
+               break;
+       }
+       return ret;
+}
+
+/**
+ * sys_sched_get_priority_min - return minimum RT priority.
+ * @policy: scheduling class.
+ *
+ * This syscall returns the minimum rt_priority that can be used
+ * by a given scheduling class.
+ */
+SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
+{
+       int ret = -EINVAL;
+
+       switch (policy) {
+       case SCHED_FIFO:
+       case SCHED_RR:
+               ret = 1;
+               break;
+       case SCHED_NORMAL:
+       case SCHED_BATCH:
+       case SCHED_IDLE:
+               ret = 0;
+       }
+       return ret;
+}
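
User space normally queries these bounds before choosing an RT priority; a short sketch:

#include <sched.h>
#include <stdio.h>

int main(void)
{
        int lo = sched_get_priority_min(SCHED_FIFO);
        int hi = sched_get_priority_max(SCHED_FIFO);

        if (lo == -1 || hi == -1) {
                perror("sched_get_priority_min/max");
                return 1;
        }
        printf("SCHED_FIFO priority range: %d..%d\n", lo, hi);
        return 0;
}
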
+
+/**
+ * sys_sched_rr_get_interval - return the default timeslice of a process.
+ * @pid: pid of the process.
+ * @interval: userspace pointer to the timeslice value.
+ *
+ * This syscall writes the default timeslice value of a given process
+ * into the user-space timespec buffer. A value of '0' means infinity.
+ */
+SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
+               struct timespec __user *, interval)
+{
+       struct task_struct *p;
+       unsigned int time_slice;
+       unsigned long flags;
+       struct rq *rq;
+       int retval;
+       struct timespec t;
+
+       if (pid < 0)
+               return -EINVAL;
+
+       retval = -ESRCH;
+       rcu_read_lock();
+       p = find_process_by_pid(pid);
+       if (!p)
+               goto out_unlock;
+
+       retval = security_task_getscheduler(p);
+       if (retval)
+               goto out_unlock;
+
+       rq = task_rq_lock(p, &flags);
+       time_slice = p->sched_class->get_rr_interval(rq, p);
+       task_rq_unlock(rq, p, &flags);
+
+       rcu_read_unlock();
+       jiffies_to_timespec(time_slice, &t);
+       retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
+       return retval;
+
+out_unlock:
+       rcu_read_unlock();
+       return retval;
+}
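
And the matching user-space query for the timeslice; per the comment above, an all-zero timespec means an infinite slice (e.g. SCHED_FIFO):

#include <sched.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
        struct timespec ts;

        if (sched_rr_get_interval(0, &ts) == -1) {      /* 0 = calling thread */
                perror("sched_rr_get_interval");
                return 1;
        }
        printf("timeslice: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
        return 0;
}
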
+
+static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
+
+void sched_show_task(struct task_struct *p)
+{
+       unsigned long free = 0;
+       unsigned state;
+
+       state = p->state ? __ffs(p->state) + 1 : 0;
+       printk(KERN_INFO "%-15.15s %c", p->comm,
+               state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
+#if BITS_PER_LONG == 32
+       if (state == TASK_RUNNING)
+               printk(KERN_CONT " running  ");
+       else
+               printk(KERN_CONT " %08lx ", thread_saved_pc(p));
+#else
+       if (state == TASK_RUNNING)
+               printk(KERN_CONT "  running task    ");
+       else
+               printk(KERN_CONT " %016lx ", thread_saved_pc(p));
+#endif
+#ifdef CONFIG_DEBUG_STACK_USAGE
+       free = stack_not_used(p);
+#endif
+       printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
+               task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)),
+               (unsigned long)task_thread_info(p)->flags);
+
+       show_stack(p, NULL);
+}
+
+void show_state_filter(unsigned long state_filter)
+{
+       struct task_struct *g, *p;
+
+#if BITS_PER_LONG == 32
+       printk(KERN_INFO
+               "  task                PC stack   pid father\n");
+#else
+       printk(KERN_INFO
+               "  task                        PC stack   pid father\n");
+#endif
+       rcu_read_lock();
+       do_each_thread(g, p) {
+               /*
+                * reset the NMI-timeout, listing all tasks on a slow
+                * console might take a lot of time:
+                */
+               touch_nmi_watchdog();
+               if (!state_filter || (p->state & state_filter))
+                       sched_show_task(p);
+       } while_each_thread(g, p);
+
+       touch_all_softlockup_watchdogs();
+
+#ifdef CONFIG_SCHED_DEBUG
+       sysrq_sched_debug_show();
+#endif
+       rcu_read_unlock();
+       /*
+        * Only show locks if all tasks are dumped:
+        */
+       if (!state_filter)
+               debug_show_all_locks();
+}
+
+void __cpuinit init_idle_bootup_task(struct task_struct *idle)
+{
+       idle->sched_class = &idle_sched_class;
+}
+
+/**
+ * init_idle - set up an idle thread for a given CPU
+ * @idle: task in question
+ * @cpu: cpu the idle task belongs to
+ *
+ * NOTE: this function does not set the idle thread's NEED_RESCHED
+ * flag, to make booting more robust.
+ */
+void __cpuinit init_idle(struct task_struct *idle, int cpu)
+{
+       struct rq *rq = cpu_rq(cpu);
+       unsigned long flags;
+
+       raw_spin_lock_irqsave(&rq->lock, flags);
+
+       __sched_fork(idle);
+       idle->state = TASK_RUNNING;
+       idle->se.exec_start = sched_clock();
+
+       do_set_cpus_allowed(idle, cpumask_of(cpu));
+       /*
+        * We're having a chicken-and-egg problem: even though we are
+        * holding rq->lock, the cpu isn't yet set to this cpu, so the
+        * lockdep check in task_group() will fail.
+        *
+        * Similar case to sched_fork(). Alternatively we could
+        * use task_rq_lock() here and obtain the other rq->lock.
+        *
+        * Silence PROVE_RCU
+        */
+       rcu_read_lock();
+       __set_task_cpu(idle, cpu);
+       rcu_read_unlock();
+
+       rq->curr = rq->idle = idle;
+#if defined(CONFIG_SMP)
+       idle->on_cpu = 1;
+#endif
+       raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+       /* Set the preempt count _outside_ the spinlocks! */
+       task_thread_info(idle)->preempt_count = 0;
+
+       /*
+        * The idle tasks have their own, simple scheduling class:
+        */
+       idle->sched_class = &idle_sched_class;
+       ftrace_graph_init_idle_task(idle, cpu);
+#if defined(CONFIG_SMP)
+       sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
+#endif
+}
+
+#ifdef CONFIG_SMP
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+       if (p->sched_class && p->sched_class->set_cpus_allowed)
+               p->sched_class->set_cpus_allowed(p, new_mask);
+
+       cpumask_copy(&p->cpus_allowed, new_mask);
+       p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
+}
+
+/*
+ * This is how migration works:
+ *
+ * 1) we invoke migration_cpu_stop() on the target CPU using
+ *    stop_one_cpu().
+ * 2) stopper starts to run (implicitly forcing the migrated thread
+ *    off the CPU)
+ * 3) it checks whether the migrated task is still in the wrong runqueue.
+ * 4) if it's in the wrong runqueue then the migration thread removes
+ *    it and puts it into the right queue.
+ * 5) stopper completes and stop_one_cpu() returns and the migration
+ *    is done.
+ */
+
+/*
+ * Change a given task's CPU affinity. Migrate the thread to a
+ * proper CPU and schedule it away if the CPU it's executing on
+ * is removed from the allowed bitmask.
+ *
+ * NOTE: the caller must have a valid reference to the task, the
+ * task must not exit() & deallocate itself prematurely. The
+ * call is not atomic; no spinlocks may be held.
+ */
+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+{
+       unsigned long flags;
+       struct rq *rq;
+       unsigned int dest_cpu;
+       int ret = 0;
+
+       rq = task_rq_lock(p, &flags);
+
+       if (cpumask_equal(&p->cpus_allowed, new_mask))
+               goto out;
+
+       if (!cpumask_intersects(new_mask, cpu_active_mask)) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       do_set_cpus_allowed(p, new_mask);
+
+       /* Can the task run on the task's current CPU? If so, we're done */
+       if (cpumask_test_cpu(task_cpu(p), new_mask))
+               goto out;
+
+       dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
+       if (p->on_rq) {
+               struct migration_arg arg = { p, dest_cpu };
+               /* Need help from migration thread: drop lock and wait. */
+               task_rq_unlock(rq, p, &flags);
+               stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
+               tlb_migrate_finish(p->mm);
+               return 0;
+       }
+out:
+       task_rq_unlock(rq, p, &flags);
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
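
A sketch of a typical kernel-side caller: restrict a task (say, a driver's kthread) to a single CPU, falling back gracefully if that CPU is not active. The helper and the warning message are hypothetical:

#include <linux/cpumask.h>
#include <linux/printk.h>
#include <linux/sched.h>

/* Hypothetical helper: pin @tsk to @cpu if possible. */
static int pin_task_to_cpu(struct task_struct *tsk, int cpu)
{
        int ret = set_cpus_allowed_ptr(tsk, cpumask_of(cpu));

        if (ret)        /* e.g. -EINVAL if @cpu is not in cpu_active_mask */
                pr_warn("could not pin %s to CPU%d: %d\n",
                        tsk->comm, cpu, ret);
        return ret;
}
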
+
+/*
+ * Move (not current) task off this cpu, onto dest cpu. We're doing
+ * this because either it can't run here any more (set_cpus_allowed()
+ * away from this CPU, or CPU going down), or because we're
+ * attempting to rebalance this task on exec (sched_exec).
+ *
+ * So we race with normal scheduler movements, but that's OK, as long
+ * as the task is no longer on this CPU.
+ *
+ * Returns non-zero if task was successfully migrated.
+ */
+static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
+{
+       struct rq *rq_dest, *rq_src;
+       int ret = 0;
+
+       if (unlikely(!cpu_active(dest_cpu)))
+               return ret;
+
+       rq_src = cpu_rq(src_cpu);
+       rq_dest = cpu_rq(dest_cpu);
+
+       raw_spin_lock(&p->pi_lock);
+       double_rq_lock(rq_src, rq_dest);
+       /* Already moved. */
+       if (task_cpu(p) != src_cpu)
+               goto done;
+       /* Affinity changed (again). */
+       if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
+               goto fail;
+
+       /*
+        * If we're not on a rq, the next wake-up will ensure we're
+        * placed properly.
+        */
+       if (p->on_rq) {
+               deactivate_task(rq_src, p, 0);
+               set_task_cpu(p, dest_cpu);
+               activate_task(rq_dest, p, 0);
+               check_preempt_curr(rq_dest, p, 0);
+       }
+done:
+       ret = 1;
+fail:
+       double_rq_unlock(rq_src, rq_dest);
+       raw_spin_unlock(&p->pi_lock);
+       return ret;
+}
+
+/*
+ * migration_cpu_stop - this will be executed by a highprio stopper thread
+ * and performs thread migration by bumping the thread off its CPU and
+ * 'pushing' it onto another runqueue.
+ */
+static int migration_cpu_stop(void *data)
+{
+       struct migration_arg *arg = data;
+
+       /*
+        * The original target cpu might have gone down and we might
+        * be on another cpu but it doesn't matter.
+        */
+       local_irq_disable();
+       __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
+       local_irq_enable();
+       return 0;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Ensures that the idle task is using init_mm right before its cpu goes
+ * offline.
+ */
+void idle_task_exit(void)
+{
+       struct mm_struct *mm = current->active_mm;
+
+       BUG_ON(cpu_online(smp_processor_id()));
+
+       if (mm != &init_mm)
+               switch_mm(mm, &init_mm, current);
+       mmdrop(mm);
+}
+
+/*
+ * While a dead CPU has no uninterruptible tasks queued at this point,
+ * it might still have a nonzero ->nr_uninterruptible counter, because
+ * for performance reasons the counter is not strictly tracking tasks to
+ * their home CPUs. So we just add the counter to another CPU's counter,
+ * to keep the global sum constant after CPU-down:
+ */
+static void migrate_nr_uninterruptible(struct rq *rq_src)
+{
+       struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
+
+       rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
+       rq_src->nr_uninterruptible = 0;
+}
+
+/*
+ * remove the tasks which were accounted by rq from calc_load_tasks.
+ */
+static void calc_global_load_remove(struct rq *rq)
+{
+       atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
+       rq->calc_load_active = 0;
+}
+
+/*
+ * Migrate all tasks from the rq, sleeping tasks will be migrated by
+ * try_to_wake_up()->select_task_rq().
+ *
+ * Called with rq->lock held even though we're in stop_machine() and
+ * there's no concurrency possible, we hold the required locks anyway
+ * because of lock validation efforts.
+ */
+static void migrate_tasks(unsigned int dead_cpu)
+{
+       struct rq *rq = cpu_rq(dead_cpu);
+       struct task_struct *next, *stop = rq->stop;
+       int dest_cpu;
+
+       /*
+        * Fudge the rq selection such that the below task selection loop
+        * doesn't get stuck on the currently eligible stop task.
+        *
+        * We're currently inside stop_machine() and the rq is either stuck
+        * in the stop_machine_cpu_stop() loop, or we're executing this code;
+        * either way we should never end up calling schedule() until we're
+        * done here.
+        */
+       rq->stop = NULL;
+
+       /* Ensure any throttled groups are reachable by pick_next_task */
+       unthrottle_offline_cfs_rqs(rq);
+
+       for ( ; ; ) {
+               /*
+                * There's this thread running, bail when that's the only
+                * remaining thread.
+                */
+               if (rq->nr_running == 1)
+                       break;
+
+               next = pick_next_task(rq);
+               BUG_ON(!next);
+               next->sched_class->put_prev_task(rq, next);
+
+               /* Find suitable destination for @next, with force if needed. */
+               dest_cpu = select_fallback_rq(dead_cpu, next);
+               raw_spin_unlock(&rq->lock);
+
+               __migrate_task(next, dead_cpu, dest_cpu);
+
+               raw_spin_lock(&rq->lock);
+       }
+
+       rq->stop = stop;
+}
+
+#endif /* CONFIG_HOTPLUG_CPU */
+
+#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
+
+static struct ctl_table sd_ctl_dir[] = {
+       {
+               .procname       = "sched_domain",
+               .mode           = 0555,
+       },
+       {}
+};
+
+static struct ctl_table sd_ctl_root[] = {
+       {
+               .procname       = "kernel",
+               .mode           = 0555,
+               .child          = sd_ctl_dir,
+       },
+       {}
+};
+
+static struct ctl_table *sd_alloc_ctl_entry(int n)
+{
+       struct ctl_table *entry =
+               kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
+
+       return entry;
+}
+
+static void sd_free_ctl_entry(struct ctl_table **tablep)
+{
+       struct ctl_table *entry;
+
+       /*
+        * In the intermediate directories, both the child directory and
+        * procname are dynamically allocated and could fail but the mode
+        * will always be set. In the lowest directory the names are
+        * static strings and all have proc handlers.
+        */
+       for (entry = *tablep; entry->mode; entry++) {
+               if (entry->child)
+                       sd_free_ctl_entry(&entry->child);
+               if (entry->proc_handler == NULL)
+                       kfree(entry->procname);
+       }
+
+       kfree(*tablep);
+       *tablep = NULL;
+}
+
+static void
+set_table_entry(struct ctl_table *entry,
+               const char *procname, void *data, int maxlen,
+               mode_t mode, proc_handler *proc_handler)
+{
+       entry->procname = procname;
+       entry->data = data;
+       entry->maxlen = maxlen;
+       entry->mode = mode;
+       entry->proc_handler = proc_handler;
+}
+
+static struct ctl_table *
+sd_alloc_ctl_domain_table(struct sched_domain *sd)
+{
+       struct ctl_table *table = sd_alloc_ctl_entry(13);
+
+       if (table == NULL)
+               return NULL;
+
+       set_table_entry(&table[0], "min_interval", &sd->min_interval,
+               sizeof(long), 0644, proc_doulongvec_minmax);
+       set_table_entry(&table[1], "max_interval", &sd->max_interval,
+               sizeof(long), 0644, proc_doulongvec_minmax);
+       set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
+               sizeof(int), 0644, proc_dointvec_minmax);
+       set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
+               sizeof(int), 0644, proc_dointvec_minmax);
+       set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
+               sizeof(int), 0644, proc_dointvec_minmax);
+       set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
+               sizeof(int), 0644, proc_dointvec_minmax);
+       set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
+               sizeof(int), 0644, proc_dointvec_minmax);
+       set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
+               sizeof(int), 0644, proc_dointvec_minmax);
+       set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
+               sizeof(int), 0644, proc_dointvec_minmax);
+       set_table_entry(&table[9], "cache_nice_tries",
+               &sd->cache_nice_tries,
+               sizeof(int), 0644, proc_dointvec_minmax);
+       set_table_entry(&table[10], "flags", &sd->flags,
+               sizeof(int), 0644, proc_dointvec_minmax);
+       set_table_entry(&table[11], "name", sd->name,
+               CORENAME_MAX_SIZE, 0444, proc_dostring);
+       /* &table[12] is terminator */
+
+       return table;
+}
+
+static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
+{
+       struct ctl_table *entry, *table;
+       struct sched_domain *sd;
+       int domain_num = 0, i;
+       char buf[32];
+
+       for_each_domain(cpu, sd)
+               domain_num++;
+       entry = table = sd_alloc_ctl_entry(domain_num + 1);
+       if (table == NULL)
+               return NULL;
+
+       i = 0;
+       for_each_domain(cpu, sd) {
+               snprintf(buf, 32, "domain%d", i);
+               entry->procname = kstrdup(buf, GFP_KERNEL);
+               entry->mode = 0555;
+               entry->child = sd_alloc_ctl_domain_table(sd);
+               entry++;
+               i++;
+       }
+       return table;
+}
+
+static struct ctl_table_header *sd_sysctl_header;
+static void register_sched_domain_sysctl(void)
+{
+       int i, cpu_num = num_possible_cpus();
+       struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
+       char buf[32];
+
+       WARN_ON(sd_ctl_dir[0].child);
+       sd_ctl_dir[0].child = entry;
+
+       if (entry == NULL)
+               return;
+
+       for_each_possible_cpu(i) {
+               snprintf(buf, 32, "cpu%d", i);
+               entry->procname = kstrdup(buf, GFP_KERNEL);
+               entry->mode = 0555;
+               entry->child = sd_alloc_ctl_cpu_table(i);
+               entry++;
+       }
+
+       WARN_ON(sd_sysctl_header);
+       sd_sysctl_header = register_sysctl_table(sd_ctl_root);
+}
+
+/* may be called multiple times per register */
+static void unregister_sched_domain_sysctl(void)
+{
+       if (sd_sysctl_header)
+               unregister_sysctl_table(sd_sysctl_header);
+       sd_sysctl_header = NULL;
+       if (sd_ctl_dir[0].child)
+               sd_free_ctl_entry(&sd_ctl_dir[0].child);
+}
+#else
+static void register_sched_domain_sysctl(void)
+{
+}
+static void unregister_sched_domain_sysctl(void)
+{
+}
+#endif
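
Putting the two registration paths above together: with CONFIG_SCHED_DEBUG and CONFIG_SYSCTL enabled, the resulting tree has one cpu%d directory per possible CPU and one domain%d directory per domain level, each populated from sd_alloc_ctl_domain_table(), roughly:

/proc/sys/kernel/sched_domain/
        cpu0/
                domain0/
                        min_interval max_interval busy_idx idle_idx
                        newidle_idx wake_idx forkexec_idx busy_factor
                        imbalance_pct cache_nice_tries flags name
                domain1/
                        ...
        cpu1/
                ...
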
+
+static void set_rq_online(struct rq *rq)
+{
+       if (!rq->online) {
+               const struct sched_class *class;
+
+               cpumask_set_cpu(rq->cpu, rq->rd->online);
+               rq->online = 1;
+
+               for_each_class(class) {
+                       if (class->rq_online)
+                               class->rq_online(rq);
+               }
+       }
+}
+
+static void set_rq_offline(struct rq *rq)
+{
+       if (rq->online) {
+               const struct sched_class *class;
+
+               for_each_class(class) {
+                       if (class->rq_offline)
+                               class->rq_offline(rq);
+               }
+
+               cpumask_clear_cpu(rq->cpu, rq->rd->online);
+               rq->online = 0;
+       }
+}
+
+/*
+ * migration_call - callback that gets triggered when a CPU is added.
+ * Here we can start up the necessary migration thread for the new CPU.
+ */
+static int __cpuinit
+migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+       int cpu = (long)hcpu;
+       unsigned long flags;
+       struct rq *rq = cpu_rq(cpu);
+
+       switch (action & ~CPU_TASKS_FROZEN) {
+
+       case CPU_UP_PREPARE:
+               rq->calc_load_update = calc_load_update;
+               break;
+
+       case CPU_ONLINE:
+               /* Update our root-domain */
+               raw_spin_lock_irqsave(&rq->lock, flags);
+               if (rq->rd) {
+                       BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+
+                       set_rq_online(rq);
+               }
+               raw_spin_unlock_irqrestore(&rq->lock, flags);
+               break;
+
+#ifdef CONFIG_HOTPLUG_CPU
+       case CPU_DYING:
+               sched_ttwu_pending();
+               /* Update our root-domain */
+               raw_spin_lock_irqsave(&rq->lock, flags);
+               if (rq->rd) {
+                       BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+                       set_rq_offline(rq);
+               }
+               migrate_tasks(cpu);
+               BUG_ON(rq->nr_running != 1); /* the migration thread */
+               raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+               migrate_nr_uninterruptible(rq);
+               calc_global_load_remove(rq);
+               break;
+#endif
+       }
+
+       update_max_interval();
+
+       return NOTIFY_OK;
+}
+
+/*
+ * Register at high priority so that task migration (migrate_all_tasks)
+ * happens before everything else.  This has to be lower priority than
+ * the notifier in the perf_event subsystem, though.
+ */
+static struct notifier_block __cpuinitdata migration_notifier = {
+       .notifier_call = migration_call,
+       .priority = CPU_PRI_MIGRATION,
+};
+
+static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
+                                     unsigned long action, void *hcpu)
+{
+       switch (action & ~CPU_TASKS_FROZEN) {
+       case CPU_ONLINE:
+       case CPU_DOWN_FAILED:
+               set_cpu_active((long)hcpu, true);
+               return NOTIFY_OK;
+       default:
+               return NOTIFY_DONE;
+       }
+}
+
+static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
+                                       unsigned long action, void *hcpu)
+{
+       switch (action & ~CPU_TASKS_FROZEN) {
+       case CPU_DOWN_PREPARE:
+               set_cpu_active((long)hcpu, false);
+               return NOTIFY_OK;
+       default:
+               return NOTIFY_DONE;
+       }
+}
+
+static int __init migration_init(void)
+{
+       void *cpu = (void *)(long)smp_processor_id();
+       int err;
+
+       /* Initialize migration for the boot CPU */
+       err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
+       BUG_ON(err == NOTIFY_BAD);
+       migration_call(&migration_notifier, CPU_ONLINE, cpu);
+       register_cpu_notifier(&migration_notifier);
+
+       /* Register cpu active notifiers */
+       cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
+       cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
+
+       return 0;
+}
+early_initcall(migration_init);
+#endif
+
+#ifdef CONFIG_SMP
+
+static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
+
+#ifdef CONFIG_SCHED_DEBUG
+
+static __read_mostly int sched_domain_debug_enabled;
+
+static int __init sched_domain_debug_setup(char *str)
+{
+       sched_domain_debug_enabled = 1;
+
+       return 0;
+}
+early_param("sched_debug", sched_domain_debug_setup);
+
+static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
+                                 struct cpumask *groupmask)
+{
+       struct sched_group *group = sd->groups;
+       char str[256];
+
+       cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
+       cpumask_clear(groupmask);
+
+       printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
+
+       if (!(sd->flags & SD_LOAD_BALANCE)) {
+               printk("does not load-balance\n");
+               if (sd->parent)
+                       printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
+                                       " has parent");
+               return -1;
+       }
+
+       printk(KERN_CONT "span %s level %s\n", str, sd->name);
+
+       if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
+               printk(KERN_ERR "ERROR: domain->span does not contain "
+                               "CPU%d\n", cpu);
+       }
+       if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
+               printk(KERN_ERR "ERROR: domain->groups does not contain"
+                               " CPU%d\n", cpu);
+       }
+
+       printk(KERN_DEBUG "%*s groups:", level + 1, "");
+       do {
+               if (!group) {
+                       printk("\n");
+                       printk(KERN_ERR "ERROR: group is NULL\n");
+                       break;
+               }
+
+               if (!group->sgp->power) {
+                       printk(KERN_CONT "\n");
+                       printk(KERN_ERR "ERROR: domain->cpu_power not "
+                                       "set\n");
+                       break;
+               }
+
+               if (!cpumask_weight(sched_group_cpus(group))) {
+                       printk(KERN_CONT "\n");
+                       printk(KERN_ERR "ERROR: empty group\n");
+                       break;
+               }
+
+               if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
+                       printk(KERN_CONT "\n");
+                       printk(KERN_ERR "ERROR: repeated CPUs\n");
+                       break;
+               }
+
+               cpumask_or(groupmask, groupmask, sched_group_cpus(group));
+
+               cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
+
+               printk(KERN_CONT " %s", str);
+               if (group->sgp->power != SCHED_POWER_SCALE) {
+                       printk(KERN_CONT " (cpu_power = %d)",
+                               group->sgp->power);
+               }
+
+               group = group->next;
+       } while (group != sd->groups);
+       printk(KERN_CONT "\n");
+
+       if (!cpumask_equal(sched_domain_span(sd), groupmask))
+               printk(KERN_ERR "ERROR: groups don't span domain->span\n");
+
+       if (sd->parent &&
+           !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
+               printk(KERN_ERR "ERROR: parent span is not a superset "
+                       "of domain->span\n");
+       return 0;
+}
+
+static void sched_domain_debug(struct sched_domain *sd, int cpu)
+{
+       int level = 0;
+
+       if (!sched_domain_debug_enabled)
+               return;
+
+       if (!sd) {
+               printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
+               return;
+       }
+
+       printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
+
+       for (;;) {
+               if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
+                       break;
+               level++;
+               sd = sd->parent;
+               if (!sd)
+                       break;
+       }
+}
+#else /* !CONFIG_SCHED_DEBUG */
+# define sched_domain_debug(sd, cpu) do { } while (0)
+#endif /* CONFIG_SCHED_DEBUG */
+
+static int sd_degenerate(struct sched_domain *sd)
+{
+       if (cpumask_weight(sched_domain_span(sd)) == 1)
+               return 1;
+
+       /* Following flags need at least 2 groups */
+       if (sd->flags & (SD_LOAD_BALANCE |
+                        SD_BALANCE_NEWIDLE |
+                        SD_BALANCE_FORK |
+                        SD_BALANCE_EXEC |
+                        SD_SHARE_CPUPOWER |
+                        SD_SHARE_PKG_RESOURCES)) {
+               if (sd->groups != sd->groups->next)
+                       return 0;
+       }
+
+       /* Following flags don't use groups */
+       if (sd->flags & (SD_WAKE_AFFINE))
+               return 0;
+
+       return 1;
+}
+
+static int
+sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
+{
+       unsigned long cflags = sd->flags, pflags = parent->flags;
+
+       if (sd_degenerate(parent))
+               return 1;
+
+       if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
+               return 0;
+
+       /* Flags needing groups don't count if only 1 group in parent */
+       if (parent->groups == parent->groups->next) {
+               pflags &= ~(SD_LOAD_BALANCE |
+                               SD_BALANCE_NEWIDLE |
+                               SD_BALANCE_FORK |
+                               SD_BALANCE_EXEC |
+                               SD_SHARE_CPUPOWER |
+                               SD_SHARE_PKG_RESOURCES);
+               if (nr_node_ids == 1)
+                       pflags &= ~SD_SERIALIZE;
+       }
+       if (~cflags & pflags)
+               return 0;
+
+       return 1;
+}
+
+static void free_rootdomain(struct rcu_head *rcu)
+{
+       struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
+
+       cpupri_cleanup(&rd->cpupri);
+       free_cpumask_var(rd->rto_mask);
+       free_cpumask_var(rd->online);
+       free_cpumask_var(rd->span);
+       kfree(rd);
+}
+
+static void rq_attach_root(struct rq *rq, struct root_domain *rd)
+{
+       struct root_domain *old_rd = NULL;
+       unsigned long flags;
+
+       raw_spin_lock_irqsave(&rq->lock, flags);
+
+       if (rq->rd) {
+               old_rd = rq->rd;
+
+               if (cpumask_test_cpu(rq->cpu, old_rd->online))
+                       set_rq_offline(rq);
+
+               cpumask_clear_cpu(rq->cpu, old_rd->span);
+
+               /*
+                * If we don't want to free the old_rd yet then
+                * set old_rd to NULL to skip the freeing later
+                * in this function:
+                */
+               if (!atomic_dec_and_test(&old_rd->refcount))
+                       old_rd = NULL;
+       }
+
+       atomic_inc(&rd->refcount);
+       rq->rd = rd;
+
+       cpumask_set_cpu(rq->cpu, rd->span);
+       if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
+               set_rq_online(rq);
+
+       raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+       if (old_rd)
+               call_rcu_sched(&old_rd->rcu, free_rootdomain);
+}
+
+static int init_rootdomain(struct root_domain *rd)
+{
+       memset(rd, 0, sizeof(*rd));
+
+       if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
+               goto out;
+       if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
+               goto free_span;
+       if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+               goto free_online;
+
+       if (cpupri_init(&rd->cpupri) != 0)
+               goto free_rto_mask;
+       return 0;
+
+free_rto_mask:
+       free_cpumask_var(rd->rto_mask);
+free_online:
+       free_cpumask_var(rd->online);
+free_span:
+       free_cpumask_var(rd->span);
+out:
+       return -ENOMEM;
+}
+
+/*
+ * By default the system creates a single root-domain with all cpus as
+ * members (mimicking the global state we have today).
+ */
+struct root_domain def_root_domain;
+
+static void init_defrootdomain(void)
+{
+       init_rootdomain(&def_root_domain);
+
+       atomic_set(&def_root_domain.refcount, 1);
+}
+
+static struct root_domain *alloc_rootdomain(void)
+{
+       struct root_domain *rd;
+
+       rd = kmalloc(sizeof(*rd), GFP_KERNEL);
+       if (!rd)
+               return NULL;
+
+       if (init_rootdomain(rd) != 0) {
+               kfree(rd);
+               return NULL;
+       }
+
+       return rd;
+}
+
+static void free_sched_groups(struct sched_group *sg, int free_sgp)
+{
+       struct sched_group *tmp, *first;
+
+       if (!sg)
+               return;
+
+       first = sg;
+       do {
+               tmp = sg->next;
+
+               if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
+                       kfree(sg->sgp);
+
+               kfree(sg);
+               sg = tmp;
+       } while (sg != first);
+}
+
+static void free_sched_domain(struct rcu_head *rcu)
+{
+       struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
+
+       /*
+        * If it's an overlapping domain it has private groups, iterate and
+        * nuke them all.
+        */
+       if (sd->flags & SD_OVERLAP) {
+               free_sched_groups(sd->groups, 1);
+       } else if (atomic_dec_and_test(&sd->groups->ref)) {
+               kfree(sd->groups->sgp);
+               kfree(sd->groups);
+       }
+       kfree(sd);
+}
+
+static void destroy_sched_domain(struct sched_domain *sd, int cpu)
+{
+       call_rcu(&sd->rcu, free_sched_domain);
+}
+
+static void destroy_sched_domains(struct sched_domain *sd, int cpu)
+{
+       for (; sd; sd = sd->parent)
+               destroy_sched_domain(sd, cpu);
+}
+
+/*
+ * Keep a special pointer to the highest sched_domain that has
+ * SD_SHARE_PKG_RESOURCES set (Last Level Cache Domain) for this CPU; this
+ * allows us to avoid some pointer chasing in select_idle_sibling().
+ *
+ * Also keep a unique ID per domain (we use the first cpu number in
+ * the cpumask of the domain); this allows us to quickly tell if
+ * two cpus are in the same cache domain, see ttwu_share_cache().
+ */
+DEFINE_PER_CPU(struct sched_domain *, sd_llc);
+DEFINE_PER_CPU(int, sd_llc_id);
+
+static void update_top_cache_domain(int cpu)
+{
+       struct sched_domain *sd;
+       int id = cpu;
+
+       sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
+       if (sd)
+               id = cpumask_first(sched_domain_span(sd));
+
+       rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
+       per_cpu(sd_llc_id, cpu) = id;
+}
+
+/*
+ * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
+ * hold the hotplug lock.
+ */
+static void
+cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
+{
+       struct rq *rq = cpu_rq(cpu);
+       struct sched_domain *tmp;
+
+       /* Remove the sched domains which do not contribute to scheduling. */
+       for (tmp = sd; tmp; ) {
+               struct sched_domain *parent = tmp->parent;
+               if (!parent)
+                       break;
+
+               if (sd_parent_degenerate(tmp, parent)) {
+                       tmp->parent = parent->parent;
+                       if (parent->parent)
+                               parent->parent->child = tmp;
+                       destroy_sched_domain(parent, cpu);
+               } else
+                       tmp = tmp->parent;
+       }
+
+       if (sd && sd_degenerate(sd)) {
+               tmp = sd;
+               sd = sd->parent;
+               destroy_sched_domain(tmp, cpu);
+               if (sd)
+                       sd->child = NULL;
+       }
+
+       sched_domain_debug(sd, cpu);
+
+       rq_attach_root(rq, rd);
+       tmp = rq->sd;
+       rcu_assign_pointer(rq->sd, sd);
+       destroy_sched_domains(tmp, cpu);
+
+       update_top_cache_domain(cpu);
+}
+
+/* cpus with isolated domains */
+static cpumask_var_t cpu_isolated_map;
+
+/* Set up the mask of cpus configured for isolated domains */
+static int __init isolated_cpu_setup(char *str)
+{
+       alloc_bootmem_cpumask_var(&cpu_isolated_map);
+       cpulist_parse(str, cpu_isolated_map);
+       return 1;
+}
+
+__setup("isolcpus=", isolated_cpu_setup);
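
isolated_cpu_setup() hands the isolcpus= argument to cpulist_parse(), which accepts comma-separated CPU ranges such as "1,3-5". A toy userspace stand-in for that parsing, just to show the format (NR_CPUS, the buffer size and the helper name are invented for this sketch):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NR_CPUS 8

/* Toy parser for "a,b-c,..." lists; the real work is done by cpulist_parse(). */
static void parse_cpulist(const char *str, int mask[NR_CPUS])
{
        char buf[64], *tok, *save;

        strncpy(buf, str, sizeof(buf) - 1);
        buf[sizeof(buf) - 1] = '\0';
        for (tok = strtok_r(buf, ",", &save); tok; tok = strtok_r(NULL, ",", &save)) {
                int lo, hi;

                if (sscanf(tok, "%d-%d", &lo, &hi) != 2)
                        lo = hi = atoi(tok);            /* single cpu, no range */
                for (; lo <= hi && lo < NR_CPUS; lo++)
                        mask[lo] = 1;
        }
}

int main(void)
{
        int mask[NR_CPUS] = { 0 };
        int i;

        parse_cpulist("1,3-5", mask);
        for (i = 0; i < NR_CPUS; i++)
                if (mask[i])
                        printf("cpu %d isolated\n", i); /* 1, 3, 4, 5 */
        return 0;
}
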
+
+#ifdef CONFIG_NUMA
+
+/**
+ * find_next_best_node - find the next node to include in a sched_domain
+ * @node: node whose sched_domain we're building
+ * @used_nodes: nodes already in the sched_domain
+ *
+ * Find the next node to include in a given scheduling domain. Simply
+ * finds the closest node not already in the @used_nodes map.
+ *
+ * Should use nodemask_t.
+ */
+static int find_next_best_node(int node, nodemask_t *used_nodes)
+{
+       int i, n, val, min_val, best_node = -1;
+
+       min_val = INT_MAX;
+
+       for (i = 0; i < nr_node_ids; i++) {
+               /* Start at @node */
+               n = (node + i) % nr_node_ids;
+
+               if (!nr_cpus_node(n))
+                       continue;
+
+               /* Skip already used nodes */
+               if (node_isset(n, *used_nodes))
+                       continue;
+
+               /* Simple min distance search */
+               val = node_distance(node, n);
+
+               if (val < min_val) {
+                       min_val = val;
+                       best_node = n;
+               }
+       }
+
+       if (best_node != -1)
+               node_set(best_node, *used_nodes);
+       return best_node;
+}
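
find_next_best_node() is a greedy nearest-neighbour pick: skip nodes that are already used, remember the closest remaining one, then mark it used for the next round. The same loop run over a small hand-written distance matrix (the matrix and NR_NODES are made up for illustration):

#include <stdio.h>
#include <limits.h>

#define NR_NODES 4

/* Hypothetical symmetric node-distance table. */
static const int dist[NR_NODES][NR_NODES] = {
        { 10, 20, 30, 40 },
        { 20, 10, 20, 30 },
        { 30, 20, 10, 20 },
        { 40, 30, 20, 10 },
};

static int find_next_best(int node, int used[NR_NODES])
{
        int i, best = -1, min_val = INT_MAX;

        for (i = 0; i < NR_NODES; i++) {
                int n = (node + i) % NR_NODES;

                if (used[n])
                        continue;
                if (dist[node][n] < min_val) {
                        min_val = dist[node][n];
                        best = n;
                }
        }
        if (best != -1)
                used[best] = 1;
        return best;
}

int main(void)
{
        int used[NR_NODES] = { [0] = 1 };       /* building the span for node 0 */
        int n;

        while ((n = find_next_best(0, used)) != -1)
                printf("next best node: %d\n", n);      /* prints 1, then 2, then 3 */
        return 0;
}
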
+
+/**
+ * sched_domain_node_span - get a cpumask for a node's sched_domain
+ * @node: node whose cpumask we're constructing
+ * @span: resulting cpumask
+ *
+ * Given a node, construct a good cpumask for its sched_domain to span. It
+ * should be one that prevents unnecessary balancing, but also spreads tasks
+ * out optimally.
+ */
+static void sched_domain_node_span(int node, struct cpumask *span)
+{
+       nodemask_t used_nodes;
+       int i;
+
+       cpumask_clear(span);
+       nodes_clear(used_nodes);
+
+       cpumask_or(span, span, cpumask_of_node(node));
+       node_set(node, used_nodes);
+
+       for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
+               int next_node = find_next_best_node(node, &used_nodes);
+               if (next_node < 0)
+                       break;
+               cpumask_or(span, span, cpumask_of_node(next_node));
+       }
+}
+
+static const struct cpumask *cpu_node_mask(int cpu)
+{
+       lockdep_assert_held(&sched_domains_mutex);
+
+       sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
+
+       return sched_domains_tmpmask;
+}
+
+static const struct cpumask *cpu_allnodes_mask(int cpu)
+{
+       return cpu_possible_mask;
+}
+#endif /* CONFIG_NUMA */
+
+static const struct cpumask *cpu_cpu_mask(int cpu)
+{
+       return cpumask_of_node(cpu_to_node(cpu));
+}
+
+int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
+
+struct sd_data {
+       struct sched_domain **__percpu sd;
+       struct sched_group **__percpu sg;
+       struct sched_group_power **__percpu sgp;
+};
+
+struct s_data {
+       struct sched_domain ** __percpu sd;
+       struct root_domain      *rd;
+};
+
+enum s_alloc {
+       sa_rootdomain,
+       sa_sd,
+       sa_sd_storage,
+       sa_none,
+};
+
+struct sched_domain_topology_level;
+
+typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
+typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
+
+#define SDTL_OVERLAP   0x01
+
+struct sched_domain_topology_level {
+       sched_domain_init_f init;
+       sched_domain_mask_f mask;
+       int                 flags;
+       struct sd_data      data;
+};
+
+static int
+build_overlap_sched_groups(struct sched_domain *sd, int cpu)
+{
+       struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
+       const struct cpumask *span = sched_domain_span(sd);
+       struct cpumask *covered = sched_domains_tmpmask;
+       struct sd_data *sdd = sd->private;
+       struct sched_domain *child;
+       int i;
+
+       cpumask_clear(covered);
+
+       for_each_cpu(i, span) {
+               struct cpumask *sg_span;
+
+               if (cpumask_test_cpu(i, covered))
+                       continue;
+
+               sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
+                               GFP_KERNEL, cpu_to_node(cpu));
+
+               if (!sg)
+                       goto fail;
+
+               sg_span = sched_group_cpus(sg);
+
+               child = *per_cpu_ptr(sdd->sd, i);
+               if (child->child) {
+                       child = child->child;
+                       cpumask_copy(sg_span, sched_domain_span(child));
+               } else
+                       cpumask_set_cpu(i, sg_span);
+
+               cpumask_or(covered, covered, sg_span);
+
+               sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span));
+               atomic_inc(&sg->sgp->ref);
+
+               if (cpumask_test_cpu(cpu, sg_span))
+                       groups = sg;
+
+               if (!first)
+                       first = sg;
+               if (last)
+                       last->next = sg;
+               last = sg;
+               last->next = first;
+       }
+       sd->groups = groups;
+
+       return 0;
+
+fail:
+       free_sched_groups(first, 0);
+
+       return -ENOMEM;
+}
+
+static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
+{
+       struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
+       struct sched_domain *child = sd->child;
+
+       if (child)
+               cpu = cpumask_first(sched_domain_span(child));
+
+       if (sg) {
+               *sg = *per_cpu_ptr(sdd->sg, cpu);
+               (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
+               atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */
+       }
+
+       return cpu;
+}
+
+/*
+ * build_sched_groups will build a circular linked list of the groups
+ * covered by the given span, and will set each group's ->cpumask correctly,
+ * and ->cpu_power to 0.
+ *
+ * Assumes the sched_domain tree is fully constructed
+ */
+static int
+build_sched_groups(struct sched_domain *sd, int cpu)
+{
+       struct sched_group *first = NULL, *last = NULL;
+       struct sd_data *sdd = sd->private;
+       const struct cpumask *span = sched_domain_span(sd);
+       struct cpumask *covered;
+       int i;
+
+       get_group(cpu, sdd, &sd->groups);
+       atomic_inc(&sd->groups->ref);
+
+       if (cpu != cpumask_first(sched_domain_span(sd)))
+               return 0;
+
+       lockdep_assert_held(&sched_domains_mutex);
+       covered = sched_domains_tmpmask;
+
+       cpumask_clear(covered);
+
+       for_each_cpu(i, span) {
+               struct sched_group *sg;
+               int group = get_group(i, sdd, &sg);
+               int j;
+
+               if (cpumask_test_cpu(i, covered))
+                       continue;
+
+               cpumask_clear(sched_group_cpus(sg));
+               sg->sgp->power = 0;
+
+               for_each_cpu(j, span) {
+                       if (get_group(j, sdd, NULL) != group)
+                               continue;
+
+                       cpumask_set_cpu(j, covered);
+                       cpumask_set_cpu(j, sched_group_cpus(sg));
+               }
+
+               if (!first)
+                       first = sg;
+               if (last)
+                       last->next = sg;
+               last = sg;
+       }
+       last->next = first;
+
+       return 0;
+}
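
build_sched_groups() links the groups into a circular singly linked list: append at 'last', then close the ring with 'last->next = first'; later code walks it with a do/while until it is back at the first group. A small sketch of that construction and traversal (struct and values are invented):

#include <stdio.h>
#include <stdlib.h>

struct group {
        int id;
        struct group *next;
};

int main(void)
{
        struct group *first = NULL, *last = NULL, *g;
        int i;

        /* Append three groups, tracking first/last as build_sched_groups() does. */
        for (i = 0; i < 3; i++) {
                g = calloc(1, sizeof(*g));
                if (!g)
                        return 1;
                g->id = i;
                if (!first)
                        first = g;
                if (last)
                        last->next = g;
                last = g;
        }
        last->next = first;             /* close the ring */

        /* Walk the ring exactly once: stop when we are back at 'first'. */
        g = first;
        do {
                printf("group %d\n", g->id);
                g = g->next;
        } while (g != first);

        return 0;
}
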
+
+/*
+ * Initialize sched groups cpu_power.
+ *
+ * cpu_power indicates the capacity of sched group, which is used while
+ * distributing the load between different sched groups in a sched domain.
+ * Typically cpu_power for all the groups in a sched domain will be the same
+ * unless there are asymmetries in the topology. If there are asymmetries, the
+ * group having more cpu_power will pick up more load compared to the group
+ * having less cpu_power.
+ */
+static void init_sched_groups_power(int cpu, struct sched_domain *sd)
+{
+       struct sched_group *sg = sd->groups;
+
+       WARN_ON(!sd || !sg);
+
+       do {
+               sg->group_weight = cpumask_weight(sched_group_cpus(sg));
+               sg = sg->next;
+       } while (sg != sd->groups);
+
+       if (cpu != group_first_cpu(sg))
+               return;
+
+       update_group_power(sd, cpu);
+       atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
+}
+
+int __weak arch_sd_sibling_asym_packing(void)
+{
+       return 0*SD_ASYM_PACKING;
+}
+
+/*
+ * Initializers for scheduling domains
+ * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
+ */
+
+#ifdef CONFIG_SCHED_DEBUG
+# define SD_INIT_NAME(sd, type)                sd->name = #type
+#else
+# define SD_INIT_NAME(sd, type)                do { } while (0)
+#endif
+
+#define SD_INIT_FUNC(type)                                             \
+static noinline struct sched_domain *                                  \
+sd_init_##type(struct sched_domain_topology_level *tl, int cpu)        \
+{                                                                      \
+       struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);       \
+       *sd = SD_##type##_INIT;                                         \
+       SD_INIT_NAME(sd, type);                                         \
+       sd->private = &tl->data;                                        \
+       return sd;                                                      \
+}
+
+SD_INIT_FUNC(CPU)
+#ifdef CONFIG_NUMA
+ SD_INIT_FUNC(ALLNODES)
+ SD_INIT_FUNC(NODE)
+#endif
+#ifdef CONFIG_SCHED_SMT
+ SD_INIT_FUNC(SIBLING)
+#endif
+#ifdef CONFIG_SCHED_MC
+ SD_INIT_FUNC(MC)
+#endif
+#ifdef CONFIG_SCHED_BOOK
+ SD_INIT_FUNC(BOOK)
+#endif
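
SD_INIT_FUNC() stamps out one sd_init_<type>() function per topology level through token pasting (##) and stringification (#). The same preprocessor technique in standalone form (the struct, level numbers and names below are purely illustrative):

#include <stdio.h>

struct domain {
        const char *name;
        int level;
};

/* Generate one init_<type>() function per invocation via ## token pasting. */
#define DEFINE_INIT(type, lvl)                                          \
static struct domain init_##type(void)                                  \
{                                                                       \
        struct domain d = { .name = #type, .level = (lvl) };            \
        return d;                                                       \
}

DEFINE_INIT(SIBLING, 0)
DEFINE_INIT(MC, 1)
DEFINE_INIT(CPU, 2)

int main(void)
{
        struct domain levels[] = { init_SIBLING(), init_MC(), init_CPU() };
        int i;

        for (i = 0; i < 3; i++)
                printf("%s at level %d\n", levels[i].name, levels[i].level);
        return 0;
}
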
+
+static int default_relax_domain_level = -1;
+int sched_domain_level_max;
+
+static int __init setup_relax_domain_level(char *str)
+{
+       unsigned long val;
+
+       val = simple_strtoul(str, NULL, 0);
+       if (val < sched_domain_level_max)
+               default_relax_domain_level = val;
+
+       return 1;
+}
+__setup("relax_domain_level=", setup_relax_domain_level);
+
+static void set_domain_attribute(struct sched_domain *sd,
+                                struct sched_domain_attr *attr)
+{
+       int request;
+
+       if (!attr || attr->relax_domain_level < 0) {
+               if (default_relax_domain_level < 0)
+                       return;
+               else
+                       request = default_relax_domain_level;
+       } else
+               request = attr->relax_domain_level;
+       if (request < sd->level) {
+               /* turn off idle balance on this domain */
+               sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
+       } else {
+               /* turn on idle balance on this domain */
+               sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
+       }
+}
+
+static void __sdt_free(const struct cpumask *cpu_map);
+static int __sdt_alloc(const struct cpumask *cpu_map);
+
+static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
+                                const struct cpumask *cpu_map)
+{
+       switch (what) {
+       case sa_rootdomain:
+               if (!atomic_read(&d->rd->refcount))
+                       free_rootdomain(&d->rd->rcu); /* fall through */
+       case sa_sd:
+               free_percpu(d->sd); /* fall through */
+       case sa_sd_storage:
+               __sdt_free(cpu_map); /* fall through */
+       case sa_none:
+               break;
+       }
+}
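
__free_domain_allocs() relies on deliberate switch fall-through: passing the highest allocation state releases everything acquired below it as well. The same staged-teardown idiom in isolation (states and resources are invented):

#include <stdio.h>

enum alloc_state { GOT_NONE, GOT_BUF, GOT_FILE, GOT_ALL };

/* Free everything acquired up to and including 'state'.
 * Each case intentionally falls through to the one below it. */
static void unwind(enum alloc_state state)
{
        switch (state) {
        case GOT_ALL:
                printf("release final resource\n");
                /* fall through */
        case GOT_FILE:
                printf("close file\n");
                /* fall through */
        case GOT_BUF:
                printf("free buffer\n");
                /* fall through */
        case GOT_NONE:
                break;
        }
}

int main(void)
{
        unwind(GOT_FILE);       /* prints: close file, free buffer */
        return 0;
}
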
+
+static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
+                                                  const struct cpumask *cpu_map)
+{
+       memset(d, 0, sizeof(*d));
+
+       if (__sdt_alloc(cpu_map))
+               return sa_sd_storage;
+       d->sd = alloc_percpu(struct sched_domain *);
+       if (!d->sd)
+               return sa_sd_storage;
+       d->rd = alloc_rootdomain();
+       if (!d->rd)
+               return sa_sd;
+       return sa_rootdomain;
+}
+
+/*
+ * NULL the sd_data elements we've used to build the sched_domain and
+ * sched_group structure so that the subsequent __free_domain_allocs()
+ * will not free the data we're using.
+ */
+static void claim_allocations(int cpu, struct sched_domain *sd)
+{
+       struct sd_data *sdd = sd->private;
+
+       WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
+       *per_cpu_ptr(sdd->sd, cpu) = NULL;
+
+       if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
+               *per_cpu_ptr(sdd->sg, cpu) = NULL;
+
+       if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
+               *per_cpu_ptr(sdd->sgp, cpu) = NULL;
+}
+
+#ifdef CONFIG_SCHED_SMT
+static const struct cpumask *cpu_smt_mask(int cpu)
+{
+       return topology_thread_cpumask(cpu);
+}
+#endif
+
+/*
+ * Topology list, bottom-up.
+ */
+static struct sched_domain_topology_level default_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+       { sd_init_SIBLING, cpu_smt_mask, },
+#endif
+#ifdef CONFIG_SCHED_MC
+       { sd_init_MC, cpu_coregroup_mask, },
+#endif
+#ifdef CONFIG_SCHED_BOOK
+       { sd_init_BOOK, cpu_book_mask, },
+#endif
+       { sd_init_CPU, cpu_cpu_mask, },
+#ifdef CONFIG_NUMA
+       { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
+       { sd_init_ALLNODES, cpu_allnodes_mask, },
+#endif
+       { NULL, },
+};
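
default_topology[] is a bottom-up, NULL-terminated table; the allocation and build loops simply walk it until tl->init is NULL. The same table-driven pattern reduced to a standalone example (levels and the handler are invented):

#include <stdio.h>

struct level {
        const char *name;
        void (*init)(const char *name);
};

static void generic_init(const char *name)
{
        printf("init level %s\n", name);
}

/* Bottom-up list of levels, terminated by a NULL init pointer. */
static const struct level topology[] = {
        { "SMT",  generic_init },
        { "MC",   generic_init },
        { "CPU",  generic_init },
        { NULL,   NULL },
};

int main(void)
{
        const struct level *tl;

        for (tl = topology; tl->init; tl++)     /* stop at the terminator */
                tl->init(tl->name);
        return 0;
}
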
+
+static struct sched_domain_topology_level *sched_domain_topology = default_topology;
+
+static int __sdt_alloc(const struct cpumask *cpu_map)
+{
+       struct sched_domain_topology_level *tl;
+       int j;
+
+       for (tl = sched_domain_topology; tl->init; tl++) {
+               struct sd_data *sdd = &tl->data;
+
+               sdd->sd = alloc_percpu(struct sched_domain *);
+               if (!sdd->sd)
+                       return -ENOMEM;
+
+               sdd->sg = alloc_percpu(struct sched_group *);
+               if (!sdd->sg)
+                       return -ENOMEM;
+
+               sdd->sgp = alloc_percpu(struct sched_group_power *);
+               if (!sdd->sgp)
+                       return -ENOMEM;
+
+               for_each_cpu(j, cpu_map) {
+                       struct sched_domain *sd;
+                       struct sched_group *sg;
+                       struct sched_group_power *sgp;
+
+                       sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
+                                       GFP_KERNEL, cpu_to_node(j));
+                       if (!sd)
+                               return -ENOMEM;
+
+                       *per_cpu_ptr(sdd->sd, j) = sd;
+
+                       sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
+                                       GFP_KERNEL, cpu_to_node(j));
+                       if (!sg)
+                               return -ENOMEM;
+
+                       *per_cpu_ptr(sdd->sg, j) = sg;
+
+                       sgp = kzalloc_node(sizeof(struct sched_group_power),
+                                       GFP_KERNEL, cpu_to_node(j));
+                       if (!sgp)
+                               return -ENOMEM;
+
+                       *per_cpu_ptr(sdd->sgp, j) = sgp;
+               }
+       }
+
+       return 0;
+}
+
+static void __sdt_free(const struct cpumask *cpu_map)
+{
+       struct sched_domain_topology_level *tl;
+       int j;
+
+       for (tl = sched_domain_topology; tl->init; tl++) {
+               struct sd_data *sdd = &tl->data;
+
+               for_each_cpu(j, cpu_map) {
+                       struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j);
+                       if (sd && (sd->flags & SD_OVERLAP))
+                               free_sched_groups(sd->groups, 0);
+                       kfree(*per_cpu_ptr(sdd->sd, j));
+                       kfree(*per_cpu_ptr(sdd->sg, j));
+                       kfree(*per_cpu_ptr(sdd->sgp, j));
+               }
+               free_percpu(sdd->sd);
+               free_percpu(sdd->sg);
+               free_percpu(sdd->sgp);
+       }
+}
+
+struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
+               struct s_data *d, const struct cpumask *cpu_map,
+               struct sched_domain_attr *attr, struct sched_domain *child,
+               int cpu)
+{
+       struct sched_domain *sd = tl->init(tl, cpu);
+       if (!sd)
+               return child;
+
+       set_domain_attribute(sd, attr);
+       cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
+       if (child) {
+               sd->level = child->level + 1;
+               sched_domain_level_max = max(sched_domain_level_max, sd->level);
+               child->parent = sd;
+       }
+       sd->child = child;
+
+       return sd;
+}
+
+/*
+ * Build sched domains for a given set of cpus and attach the sched domains
+ * to the individual cpus
+ */
+static int build_sched_domains(const struct cpumask *cpu_map,
+                              struct sched_domain_attr *attr)
+{
+       enum s_alloc alloc_state = sa_none;
+       struct sched_domain *sd;
+       struct s_data d;
+       int i, ret = -ENOMEM;
+
+       alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
+       if (alloc_state != sa_rootdomain)
+               goto error;
+
+       /* Set up domains for cpus specified by the cpu_map. */
+       for_each_cpu(i, cpu_map) {
+               struct sched_domain_topology_level *tl;
+
+               sd = NULL;
+               for (tl = sched_domain_topology; tl->init; tl++) {
+                       sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
+                       if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
+                               sd->flags |= SD_OVERLAP;
+                       if (cpumask_equal(cpu_map, sched_domain_span(sd)))
+                               break;
+               }
+
+               while (sd->child)
+                       sd = sd->child;
+
+               *per_cpu_ptr(d.sd, i) = sd;
+       }
+
+       /* Build the groups for the domains */
+       for_each_cpu(i, cpu_map) {
+               for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+                       sd->span_weight = cpumask_weight(sched_domain_span(sd));
+                       if (sd->flags & SD_OVERLAP) {
+                               if (build_overlap_sched_groups(sd, i))
+                                       goto error;
+                       } else {
+                               if (build_sched_groups(sd, i))
+                                       goto error;
+                       }
+               }
+       }
+
+       /* Calculate CPU power for physical packages and nodes */
+       for (i = nr_cpumask_bits-1; i >= 0; i--) {
+               if (!cpumask_test_cpu(i, cpu_map))
+                       continue;
+
+               for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+                       claim_allocations(i, sd);
+                       init_sched_groups_power(i, sd);
+               }
+       }
+
+       /* Attach the domains */
+       rcu_read_lock();
+       for_each_cpu(i, cpu_map) {
+               sd = *per_cpu_ptr(d.sd, i);
+               cpu_attach_domain(sd, d.rd, i);
+       }
+       rcu_read_unlock();
+
+       ret = 0;
+error:
+       __free_domain_allocs(&d, alloc_state, cpu_map);
+       return ret;
+}
+
+static cpumask_var_t *doms_cur;        /* current sched domains */
+static int ndoms_cur;          /* number of sched domains in 'doms_cur' */
+static struct sched_domain_attr *dattr_cur;
+                               /* attributes of custom domains in 'doms_cur' */
+
+/*
+ * Special case: If a kmalloc of a doms_cur partition (array of
+ * cpumask) fails, then fallback to a single sched domain,
+ * as determined by the single cpumask fallback_doms.
+ */
+static cpumask_var_t fallback_doms;
+
+/*
+ * arch_update_cpu_topology lets virtualized architectures update the
+ * cpu core maps. It is supposed to return 1 if the topology changed
+ * or 0 if it stayed the same.
+ */
+int __attribute__((weak)) arch_update_cpu_topology(void)
+{
+       return 0;
+}
+
+cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
+{
+       int i;
+       cpumask_var_t *doms;
+
+       doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
+       if (!doms)
+               return NULL;
+       for (i = 0; i < ndoms; i++) {
+               if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
+                       free_sched_domains(doms, i);
+                       return NULL;
+               }
+       }
+       return doms;
+}
+
+void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
+{
+       unsigned int i;
+       for (i = 0; i < ndoms; i++)
+               free_cpumask_var(doms[i]);
+       kfree(doms);
+}
+
+/*
+ * Set up scheduler domains and groups. Callers must hold the hotplug lock.
+ * For now this just excludes isolated cpus, but could be used to
+ * exclude other special cases in the future.
+ */
+static int init_sched_domains(const struct cpumask *cpu_map)
+{
+       int err;
+
+       arch_update_cpu_topology();
+       ndoms_cur = 1;
+       doms_cur = alloc_sched_domains(ndoms_cur);
+       if (!doms_cur)
+               doms_cur = &fallback_doms;
+       cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
+       dattr_cur = NULL;
+       err = build_sched_domains(doms_cur[0], NULL);
+       register_sched_domain_sysctl();
+
+       return err;
+}
+
+/*
+ * Detach sched domains from a group of cpus specified in cpu_map
+ * These cpus will now be attached to the NULL domain
+ */
+static void detach_destroy_domains(const struct cpumask *cpu_map)
+{
+       int i;
+
+       rcu_read_lock();
+       for_each_cpu(i, cpu_map)
+               cpu_attach_domain(NULL, &def_root_domain, i);
+       rcu_read_unlock();
+}
+
+/* handle null as "default" */
+static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
+                       struct sched_domain_attr *new, int idx_new)
+{
+       struct sched_domain_attr tmp;
+
+       /* fast path */
+       if (!new && !cur)
+               return 1;
+
+       tmp = SD_ATTR_INIT;
+       return !memcmp(cur ? (cur + idx_cur) : &tmp,
+                       new ? (new + idx_new) : &tmp,
+                       sizeof(struct sched_domain_attr));
+}
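
dattrs_equal() treats a NULL attribute array as "all defaults" by comparing the requested entry against a stack-local SD_ATTR_INIT copy with memcmp(). A sketch of that null-as-default comparison (the struct and its default are invented; the memcmp() shortcut assumes a struct without padding):

#include <stdio.h>
#include <string.h>

struct attr {
        int relax_level;
};

/* Compare entry ia of 'a' with entry ib of 'b', treating NULL as defaults. */
static int attrs_equal(const struct attr *a, int ia,
                       const struct attr *b, int ib)
{
        struct attr def = { .relax_level = -1 };        /* invented default */

        if (!a && !b)
                return 1;
        return !memcmp(a ? &a[ia] : &def, b ? &b[ib] : &def, sizeof(def));
}

int main(void)
{
        struct attr cur[] = { { .relax_level = -1 }, { .relax_level = 2 } };

        printf("%d\n", attrs_equal(cur, 0, NULL, 0));   /* 1: matches default */
        printf("%d\n", attrs_equal(cur, 1, NULL, 0));   /* 0: differs */
        return 0;
}
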
+
+/*
+ * Partition sched domains as specified by the 'ndoms_new'
+ * cpumasks in the array doms_new[] of cpumasks. This compares
+ * doms_new[] to the current sched domain partitioning, doms_cur[].
+ * It destroys each deleted domain and builds each new domain.
+ *
+ * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
+ * The masks don't intersect (don't overlap). We should set up one
+ * sched domain for each mask. CPUs not in any of the cpumasks will
+ * not be load balanced. If the same cpumask appears both in the
+ * current 'doms_cur' domains and in the new 'doms_new', we can leave
+ * it as it is.
+ *
+ * The passed-in 'doms_new' should be allocated using
+ * alloc_sched_domains.  This routine takes ownership of it and will
+ * free_sched_domains it when done with it. If the caller failed the
+ * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
+ * and partition_sched_domains() will fall back to the single partition
+ * 'fallback_doms'; this also forces the domains to be rebuilt.
+ *
+ * If doms_new == NULL it will be replaced with cpu_online_mask.
+ * ndoms_new == 0 is a special case for destroying existing domains,
+ * and it will not create the default domain.
+ *
+ * Call with hotplug lock held
+ */
+void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
+                            struct sched_domain_attr *dattr_new)
+{
+       int i, j, n;
+       int new_topology;
+
+       mutex_lock(&sched_domains_mutex);
+
+       /* always unregister in case we don't destroy any domains */
+       unregister_sched_domain_sysctl();
+
+       /* Let architecture update cpu core mappings. */
+       new_topology = arch_update_cpu_topology();
+
+       n = doms_new ? ndoms_new : 0;
+
+       /* Destroy deleted domains */
+       for (i = 0; i < ndoms_cur; i++) {
+               for (j = 0; j < n && !new_topology; j++) {
+                       if (cpumask_equal(doms_cur[i], doms_new[j])
+                           && dattrs_equal(dattr_cur, i, dattr_new, j))
+                               goto match1;
+               }
+               /* no match - a current sched domain not in new doms_new[] */
+               detach_destroy_domains(doms_cur[i]);
+match1:
+               ;
+       }
+
+       if (doms_new == NULL) {
+               ndoms_cur = 0;
+               doms_new = &fallback_doms;
+               cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
+               WARN_ON_ONCE(dattr_new);
+       }
+
+       /* Build new domains */
+       for (i = 0; i < ndoms_new; i++) {
+               for (j = 0; j < ndoms_cur && !new_topology; j++) {
+                       if (cpumask_equal(doms_new[i], doms_cur[j])
+                           && dattrs_equal(dattr_new, i, dattr_cur, j))
+                               goto match2;
+               }
+               /* no match - add a new doms_new */
+               build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
+match2:
+               ;
+       }
+
+       /* Remember the new sched domains */
+       if (doms_cur != &fallback_doms)
+               free_sched_domains(doms_cur, ndoms_cur);
+       kfree(dattr_cur);       /* kfree(NULL) is safe */
+       doms_cur = doms_new;
+       dattr_cur = dattr_new;
+       ndoms_cur = ndoms_new;
+
+       register_sched_domain_sysctl();
+
+       mutex_unlock(&sched_domains_mutex);
+}
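
The destroy/build passes above amount to a pairwise set difference: every current partition with no equal entry in doms_new[] is torn down, every requested partition with no equal entry in doms_cur[] is built, and matching pairs are left untouched. The same two-pass match on toy integer "masks" (sets and helper name are invented):

#include <stdio.h>

/* Treat each unsigned int as a toy cpumask. */
static int find_match(unsigned int m, const unsigned int *set, int n)
{
        int i;

        for (i = 0; i < n; i++)
                if (set[i] == m)
                        return 1;
        return 0;
}

int main(void)
{
        unsigned int cur[]  = { 0x0f, 0xf0 };           /* current partitions */
        unsigned int want[] = { 0x0f, 0xff00 };         /* requested partitions */
        int i;

        /* Pass 1: destroy current partitions that are not requested any more. */
        for (i = 0; i < 2; i++)
                if (!find_match(cur[i], want, 2))
                        printf("destroy 0x%x\n", cur[i]);       /* 0xf0 */

        /* Pass 2: build requested partitions that do not exist yet. */
        for (i = 0; i < 2; i++)
                if (!find_match(want[i], cur, 2))
                        printf("build 0x%x\n", want[i]);        /* 0xff00 */
        return 0;
}
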
+
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+static void reinit_sched_domains(void)
+{
+       get_online_cpus();
+
+       /* Destroy domains first to force the rebuild */
+       partition_sched_domains(0, NULL, NULL);
+
+       rebuild_sched_domains();
+       put_online_cpus();
+}
+
+static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
+{
+       unsigned int level = 0;
+
+       if (sscanf(buf, "%u", &level) != 1)
+               return -EINVAL;
+
+       /*
+        * level is always positive, so don't check for
+        * level < POWERSAVINGS_BALANCE_NONE, which is 0.
+        * It is unclear what happens on a 0 or 1 byte write;
+        * do we need to check count as well?
+        */
+
+       if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
+               return -EINVAL;
+
+       if (smt)
+               sched_smt_power_savings = level;
+       else
+               sched_mc_power_savings = level;
+
+       reinit_sched_domains();
+
+       return count;
+}
+
+#ifdef CONFIG_SCHED_MC
+static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
+                                          struct sysdev_class_attribute *attr,
+                                          char *page)
+{
+       return sprintf(page, "%u\n", sched_mc_power_savings);
+}
+static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
+                                           struct sysdev_class_attribute *attr,
+                                           const char *buf, size_t count)
+{
+       return sched_power_savings_store(buf, count, 0);
+}
+static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
+                        sched_mc_power_savings_show,
+                        sched_mc_power_savings_store);
+#endif
+
+#ifdef CONFIG_SCHED_SMT
+static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
+                                           struct sysdev_class_attribute *attr,
+                                           char *page)
+{
+       return sprintf(page, "%u\n", sched_smt_power_savings);
+}
+static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
+                                            struct sysdev_class_attribute *attr,
+                                            const char *buf, size_t count)
+{
+       return sched_power_savings_store(buf, count, 1);
+}
+static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
+                  sched_smt_power_savings_show,
+                  sched_smt_power_savings_store);
+#endif
+
+int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
+{
+       int err = 0;
+
+#ifdef CONFIG_SCHED_SMT
+       if (smt_capable())
+               err = sysfs_create_file(&cls->kset.kobj,
+                                       &attr_sched_smt_power_savings.attr);
+#endif
+#ifdef CONFIG_SCHED_MC
+       if (!err && mc_capable())
+               err = sysfs_create_file(&cls->kset.kobj,
+                                       &attr_sched_mc_power_savings.attr);
+#endif
+       return err;
+}
+#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
+
+/*
+ * Update cpusets according to cpu_active mask.  If cpusets are
+ * disabled, cpuset_update_active_cpus() becomes a simple wrapper
+ * around partition_sched_domains().
+ */
+static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
+                            void *hcpu)
+{
+       switch (action & ~CPU_TASKS_FROZEN) {
+       case CPU_ONLINE:
+       case CPU_DOWN_FAILED:
+               cpuset_update_active_cpus();
+               return NOTIFY_OK;
+       default:
+               return NOTIFY_DONE;
+       }
+}
+
+static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
+                              void *hcpu)
+{
+       switch (action & ~CPU_TASKS_FROZEN) {
+       case CPU_DOWN_PREPARE:
+               cpuset_update_active_cpus();
+               return NOTIFY_OK;
+       default:
+               return NOTIFY_DONE;
+       }
+}
+
+void __init sched_init_smp(void)
+{
+       cpumask_var_t non_isolated_cpus;
+
+       alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
+       alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
+
+       get_online_cpus();
+       mutex_lock(&sched_domains_mutex);
+       init_sched_domains(cpu_active_mask);
+       cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
+       if (cpumask_empty(non_isolated_cpus))
+               cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
+       mutex_unlock(&sched_domains_mutex);
+       put_online_cpus();
+
+       hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
+       hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
+
+       /* RT runtime code needs to handle some hotplug events */
+       hotcpu_notifier(update_runtime, 0);
+
+       init_hrtick();
+
+       /* Move init over to a non-isolated CPU */
+       if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
+               BUG();
+       sched_init_granularity();
+       free_cpumask_var(non_isolated_cpus);
+
+       init_sched_rt_class();
+}
+#else
+void __init sched_init_smp(void)
+{
+       sched_init_granularity();
+}
+#endif /* CONFIG_SMP */
+
+const_debug unsigned int sysctl_timer_migration = 1;
+
+int in_sched_functions(unsigned long addr)
+{
+       return in_lock_functions(addr) ||
+               (addr >= (unsigned long)__sched_text_start
+               && addr < (unsigned long)__sched_text_end);
+}
+
+#ifdef CONFIG_CGROUP_SCHED
+struct task_group root_task_group;
+#endif
+
+DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
+
+void __init sched_init(void)
+{
+       int i, j;
+       unsigned long alloc_size = 0, ptr;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+       alloc_size += 2 * nr_cpu_ids * sizeof(void **);
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
+       alloc_size += 2 * nr_cpu_ids * sizeof(void **);
+#endif
+#ifdef CONFIG_CPUMASK_OFFSTACK
+       alloc_size += num_possible_cpus() * cpumask_size();
+#endif
+       if (alloc_size) {
+               ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+               root_task_group.se = (struct sched_entity **)ptr;
+               ptr += nr_cpu_ids * sizeof(void **);
+
+               root_task_group.cfs_rq = (struct cfs_rq **)ptr;
+               ptr += nr_cpu_ids * sizeof(void **);
+
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+#ifdef CONFIG_RT_GROUP_SCHED
+               root_task_group.rt_se = (struct sched_rt_entity **)ptr;
+               ptr += nr_cpu_ids * sizeof(void **);
+
+               root_task_group.rt_rq = (struct rt_rq **)ptr;
+               ptr += nr_cpu_ids * sizeof(void **);
+
+#endif /* CONFIG_RT_GROUP_SCHED */
+#ifdef CONFIG_CPUMASK_OFFSTACK
+               for_each_possible_cpu(i) {
+                       per_cpu(load_balance_tmpmask, i) = (void *)ptr;
+                       ptr += cpumask_size();
+               }
+#endif /* CONFIG_CPUMASK_OFFSTACK */
+       }
+
+#ifdef CONFIG_SMP
+       init_defrootdomain();
+#endif
+
+       init_rt_bandwidth(&def_rt_bandwidth,
+                       global_rt_period(), global_rt_runtime());
+
+#ifdef CONFIG_RT_GROUP_SCHED
+       init_rt_bandwidth(&root_task_group.rt_bandwidth,
+                       global_rt_period(), global_rt_runtime());
+#endif /* CONFIG_RT_GROUP_SCHED */
+
+#ifdef CONFIG_CGROUP_SCHED
+       list_add(&root_task_group.list, &task_groups);
+       INIT_LIST_HEAD(&root_task_group.children);
+       INIT_LIST_HEAD(&root_task_group.siblings);
+       autogroup_init(&init_task);
+
+#endif /* CONFIG_CGROUP_SCHED */
+
+#ifdef CONFIG_CGROUP_CPUACCT
+       root_cpuacct.cpustat = &kernel_cpustat;
+       root_cpuacct.cpuusage = alloc_percpu(u64);
+       /* Too early, not expected to fail */
+       BUG_ON(!root_cpuacct.cpuusage);
+#endif
+       for_each_possible_cpu(i) {
+               struct rq *rq;
+
+               rq = cpu_rq(i);
+               raw_spin_lock_init(&rq->lock);
+               rq->nr_running = 0;
+               rq->calc_load_active = 0;
+               rq->calc_load_update = jiffies + LOAD_FREQ;
+               init_cfs_rq(&rq->cfs);
+               init_rt_rq(&rq->rt, rq);
+#ifdef CONFIG_FAIR_GROUP_SCHED
+               root_task_group.shares = ROOT_TASK_GROUP_LOAD;
+               INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
+               /*
+                * How much cpu bandwidth does root_task_group get?
+                *
+                * In case of task-groups formed through the cgroup filesystem, it
+                * gets 100% of the cpu resources in the system. This overall
+                * system cpu resource is divided among the tasks of
+                * root_task_group and its child task-groups in a fair manner,
+                * based on each entity's (task or task-group's) weight
+                * (se->load.weight).
+                *
+                * In other words, if root_task_group has 10 tasks of weight
+                * 1024 and two child groups A0 and A1 (of weight 1024 each),
+                * then A0's share of the cpu resource is:
+                *
+                *      A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
+                *
+                * We achieve this by letting root_task_group's tasks sit
+                * directly in rq->cfs (i.e root_task_group->se[] = NULL).
+                */
+               init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
+               init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
+               rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
+#ifdef CONFIG_RT_GROUP_SCHED
+               INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
+               init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
+#endif
+
+               for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
+                       rq->cpu_load[j] = 0;
+
+               rq->last_load_update_tick = jiffies;
+
+#ifdef CONFIG_SMP
+               rq->sd = NULL;
+               rq->rd = NULL;
+               rq->cpu_power = SCHED_POWER_SCALE;
+               rq->post_schedule = 0;
+               rq->active_balance = 0;
+               rq->next_balance = jiffies;
+               rq->push_cpu = 0;
+               rq->cpu = i;
+               rq->online = 0;
+               rq->idle_stamp = 0;
+               rq->avg_idle = 2*sysctl_sched_migration_cost;
+               rq_attach_root(rq, &def_root_domain);
+#ifdef CONFIG_NO_HZ
+               rq->nohz_flags = 0;
+#endif
+#endif
+               init_rq_hrtick(rq);
+               atomic_set(&rq->nr_iowait, 0);
+       }
+
+       set_load_weight(&init_task);
+
+#ifdef CONFIG_PREEMPT_NOTIFIERS
+       INIT_HLIST_HEAD(&init_task.preempt_notifiers);
+#endif
+
+#ifdef CONFIG_RT_MUTEXES
+       plist_head_init(&init_task.pi_waiters);
+#endif
+
+       /*
+        * The boot idle thread does lazy MMU switching as well:
+        */
+       atomic_inc(&init_mm.mm_count);
+       enter_lazy_tlb(&init_mm, current);
+
+       /*
+        * Make us the idle thread. Technically, schedule() should not be
+        * called from this thread; however, somewhere below it might be.
+        * Because we are the idle thread, we just pick up running again
+        * when this runqueue becomes "idle".
+        */
+       init_idle(current, smp_processor_id());
+
+       calc_load_update = jiffies + LOAD_FREQ;
+
+       /*
+        * During early bootup we pretend to be a normal task:
+        */
+       current->sched_class = &fair_sched_class;
+
+#ifdef CONFIG_SMP
+       zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
+       /* May be allocated at isolcpus cmdline parse time */
+       if (cpu_isolated_map == NULL)
+               zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
+#endif
+       init_sched_fair_class();
+
+       scheduler_running = 1;
+}
+
+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
+static inline int preempt_count_equals(int preempt_offset)
+{
+       int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
+
+       return (nested == preempt_offset);
+}
+
+void __might_sleep(const char *file, int line, int preempt_offset)
+{
+       static unsigned long prev_jiffy;        /* ratelimiting */
+
+       rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
+       if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
+           system_state != SYSTEM_RUNNING || oops_in_progress)
+               return;
+       if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
+               return;
+       prev_jiffy = jiffies;
+
+       printk(KERN_ERR
+               "BUG: sleeping function called from invalid context at %s:%d\n",
+                       file, line);
+       printk(KERN_ERR
+               "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
+                       in_atomic(), irqs_disabled(),
+                       current->pid, current->comm);
+
+       debug_show_held_locks(current);
+       if (irqs_disabled())
+               print_irqtrace_events(current);
+       dump_stack();
+}
+EXPORT_SYMBOL(__might_sleep);
+#endif
+
+#ifdef CONFIG_MAGIC_SYSRQ
+static void normalize_task(struct rq *rq, struct task_struct *p)
+{
+       const struct sched_class *prev_class = p->sched_class;
+       int old_prio = p->prio;
+       int on_rq;
+
+       on_rq = p->on_rq;
+       if (on_rq)
+               deactivate_task(rq, p, 0);
+       __setscheduler(rq, p, SCHED_NORMAL, 0);
+       if (on_rq) {
+               activate_task(rq, p, 0);
+               resched_task(rq->curr);
+       }
+
+       check_class_changed(rq, p, prev_class, old_prio);
+}
+
+void normalize_rt_tasks(void)
+{
+       struct task_struct *g, *p;
+       unsigned long flags;
+       struct rq *rq;
+
+       read_lock_irqsave(&tasklist_lock, flags);
+       do_each_thread(g, p) {
+               /*
+                * Only normalize user tasks:
+                */
+               if (!p->mm)
+                       continue;
+
+               p->se.exec_start                = 0;
+#ifdef CONFIG_SCHEDSTATS
+               p->se.statistics.wait_start     = 0;
+               p->se.statistics.sleep_start    = 0;
+               p->se.statistics.block_start    = 0;
+#endif
+
+               if (!rt_task(p)) {
+                       /*
+                        * Renice negative nice level userspace
+                        * tasks back to 0:
+                        */
+                       if (TASK_NICE(p) < 0 && p->mm)
+                               set_user_nice(p, 0);
+                       continue;
+               }
+
+               raw_spin_lock(&p->pi_lock);
+               rq = __task_rq_lock(p);
+
+               normalize_task(rq, p);
+
+               __task_rq_unlock(rq);
+               raw_spin_unlock(&p->pi_lock);
+       } while_each_thread(g, p);
+
+       read_unlock_irqrestore(&tasklist_lock, flags);
+}
+
+#endif /* CONFIG_MAGIC_SYSRQ */
+
+#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
+/*
+ * These functions are only useful for the IA64 MCA handling, or kdb.
+ *
+ * They can only be called when the whole system has been
+ * stopped - every CPU needs to be quiescent, and no scheduling
+ * activity can take place. Using them for anything else would
+ * be a serious bug, and as a result, they aren't even visible
+ * under any other configuration.
+ */
+
+/**
+ * curr_task - return the current task for a given cpu.
+ * @cpu: the processor in question.
+ *
+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
+ */
+struct task_struct *curr_task(int cpu)
+{
+       return cpu_curr(cpu);
+}
+
+#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
+
+#ifdef CONFIG_IA64
+/**
+ * set_curr_task - set the current task for a given cpu.
+ * @cpu: the processor in question.
+ * @p: the task pointer to set.
+ *
+ * Description: This function must only be used when non-maskable interrupts
+ * are serviced on a separate stack. It allows the architecture to switch the
+ * notion of the current task on a cpu in a non-blocking manner. This function
+ * must be called with all CPUs synchronized and interrupts disabled; the
+ * caller must save the original value of the current task (see
+ * curr_task() above) and restore that value before reenabling interrupts and
+ * re-starting the system.
+ *
+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
+ */
+void set_curr_task(int cpu, struct task_struct *p)
+{
+       cpu_curr(cpu) = p;
+}
+
+#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
+#else /* !CONFIG_RT_GROUP_SCHED */
+#endif /* CONFIG_RT_GROUP_SCHED */
+
+#ifdef CONFIG_CGROUP_SCHED
+/* task_group_lock serializes the addition/removal of task groups */
+static DEFINE_SPINLOCK(task_group_lock);
+
+static void free_sched_group(struct task_group *tg)
+{
+       free_fair_sched_group(tg);
+       free_rt_sched_group(tg);
+       autogroup_free(tg);
+       kfree(tg);
+}
+
+/* allocate runqueue etc for a new task group */
+struct task_group *sched_create_group(struct task_group *parent)
+{
+       struct task_group *tg;
+       unsigned long flags;
+
+       tg = kzalloc(sizeof(*tg), GFP_KERNEL);
+       if (!tg)
+               return ERR_PTR(-ENOMEM);
+
+       if (!alloc_fair_sched_group(tg, parent))
+               goto err;
+
+       if (!alloc_rt_sched_group(tg, parent))
+               goto err;
+
+       spin_lock_irqsave(&task_group_lock, flags);
+       list_add_rcu(&tg->list, &task_groups);
+
+       WARN_ON(!parent); /* root should already exist */
+
+       tg->parent = parent;
+       INIT_LIST_HEAD(&tg->children);
+       list_add_rcu(&tg->siblings, &parent->children);
+       spin_unlock_irqrestore(&task_group_lock, flags);
+
+       return tg;
+
+err:
+       free_sched_group(tg);
+       return ERR_PTR(-ENOMEM);
+}
+
+/* rcu callback to free various structures associated with a task group */
+static void free_sched_group_rcu(struct rcu_head *rhp)
+{
+       /* now it should be safe to free those cfs_rqs */
+       free_sched_group(container_of(rhp, struct task_group, rcu));
+}
+
+/* Destroy runqueue etc associated with a task group */
+void sched_destroy_group(struct task_group *tg)
+{
+       unsigned long flags;
+       int i;
+
+       /* end participation in shares distribution */
+       for_each_possible_cpu(i)
+               unregister_fair_sched_group(tg, i);
+
+       spin_lock_irqsave(&task_group_lock, flags);
+       list_del_rcu(&tg->list);
+       list_del_rcu(&tg->siblings);
+       spin_unlock_irqrestore(&task_group_lock, flags);
+
+       /* wait for possible concurrent references to cfs_rqs to complete */
+       call_rcu(&tg->rcu, free_sched_group_rcu);
+}
+
+/* Change a task's runqueue when it moves between groups.
+ *     The caller of this function should have put the task in its new group
+ *     by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
+ *     reflect its new group.
+ */
+void sched_move_task(struct task_struct *tsk)
+{
+       int on_rq, running;
+       unsigned long flags;
+       struct rq *rq;
+
+       rq = task_rq_lock(tsk, &flags);
+
+       running = task_current(rq, tsk);
+       on_rq = tsk->on_rq;
+
+       if (on_rq)
+               dequeue_task(rq, tsk, 0);
+       if (unlikely(running))
+               tsk->sched_class->put_prev_task(rq, tsk);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+       if (tsk->sched_class->task_move_group)
+               tsk->sched_class->task_move_group(tsk, on_rq);
+       else
+#endif
+               set_task_rq(tsk, task_cpu(tsk));
+
+       if (unlikely(running))
+               tsk->sched_class->set_curr_task(rq);
+       if (on_rq)
+               enqueue_task(rq, tsk, 0);
+
+       task_rq_unlock(rq, tsk, &flags);
+}
+#endif /* CONFIG_CGROUP_SCHED */
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+#endif
+
+#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
+static unsigned long to_ratio(u64 period, u64 runtime)
+{
+       if (runtime == RUNTIME_INF)
+               return 1ULL << 20;
+
+       return div64_u64(runtime << 20, period);
+}
+#endif
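
to_ratio() expresses runtime/period as a fraction scaled by 2^20, so 1 << 20 corresponds to 100% of one CPU (and RUNTIME_INF is capped at exactly that). A quick userspace check of the arithmetic (the period and runtime values are just examples):

#include <stdio.h>
#include <stdint.h>

/* runtime/period scaled by 2^20, mirroring the shape of to_ratio() above. */
static uint64_t ratio(uint64_t period, uint64_t runtime)
{
        return (runtime << 20) / period;
}

int main(void)
{
        /* 950ms of runtime per 1s period, the usual RT throttling default. */
        uint64_t r = ratio(1000000000ULL, 950000000ULL);

        printf("ratio = %llu (%.2f%% of 1 << 20)\n",
               (unsigned long long)r, 100.0 * r / (1 << 20));
        return 0;
}
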
+
+#ifdef CONFIG_RT_GROUP_SCHED
+/*
+ * Ensure that the real time constraints are schedulable.
+ */
+static DEFINE_MUTEX(rt_constraints_mutex);
+
+/* Must be called with tasklist_lock held */
+static inline int tg_has_rt_tasks(struct task_group *tg)
+{
+       struct task_struct *g, *p;
+
+       do_each_thread(g, p) {
+               if (rt_task(p) && task_rq(p)->rt.tg == tg)
+                       return 1;
+       } while_each_thread(g, p);
+
+       return 0;
+}
+
+struct rt_schedulable_data {
+       struct task_group *tg;
+       u64 rt_period;
+       u64 rt_runtime;
+};
+
+static int tg_rt_schedulable(struct task_group *tg, void *data)
+{
+       struct rt_schedulable_data *d = data;
+       struct task_group *child;
+       unsigned long total, sum = 0;
+       u64 period, runtime;
+
+       period = ktime_to_ns(tg->rt_bandwidth.rt_period);
+       runtime = tg->rt_bandwidth.rt_runtime;
+
+       if (tg == d->tg) {
+               period = d->rt_period;
+               runtime = d->rt_runtime;
+       }
+
+       /*
+        * Cannot have more runtime than the period.
+        */
+       if (runtime > period && runtime != RUNTIME_INF)
+               return -EINVAL;
+
+       /*
+        * Ensure we don't starve existing RT tasks.
+        */
+       if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
+               return -EBUSY;
+
+       total = to_ratio(period, runtime);
+
+       /*
+        * Nobody can have more than the global setting allows.
+        */
+       if (total > to_ratio(global_rt_period(), global_rt_runtime()))
+               return -EINVAL;
+
+       /*
+        * The sum of our children's runtime should not exceed our own.
+        */
+       list_for_each_entry_rcu(child, &tg->children, siblings) {
+               period = ktime_to_ns(child->rt_bandwidth.rt_period);
+               runtime = child->rt_bandwidth.rt_runtime;
+
+               if (child == d->tg) {
+                       period = d->rt_period;
+                       runtime = d->rt_runtime;
+               }
+
+               sum += to_ratio(period, runtime);
+       }
+
+       if (sum > total)
+               return -EINVAL;
+
+       return 0;
+}
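
tg_rt_schedulable() enforces two invariants in to_ratio() units: a group's own ratio may not exceed the global ratio, and the sum of its children's ratios may not exceed the group's own. A standalone check of those two rules for one parent and two children (all of the numbers are invented):

#include <stdio.h>
#include <stdint.h>

static uint64_t ratio(uint64_t period, uint64_t runtime)
{
        return (runtime << 20) / period;
}

int main(void)
{
        uint64_t global = ratio(1000000000ULL, 950000000ULL);   /* 950ms / 1s */
        uint64_t parent = ratio(1000000000ULL, 500000000ULL);   /* 500ms / 1s */
        uint64_t sum    = ratio(1000000000ULL, 300000000ULL) +  /* child A    */
                          ratio(1000000000ULL, 250000000ULL);   /* child B    */

        if (parent > global)
                printf("parent exceeds the global limit\n");
        else if (sum > parent)
                printf("children overcommit the parent\n");     /* printed here */
        else
                printf("configuration is schedulable\n");
        return 0;
}
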
+
+static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
+{
+       int ret;
+
+       struct rt_schedulable_data data = {
+               .tg = tg,
+               .rt_period = period,
+               .rt_runtime = runtime,
+       };
+
+       rcu_read_lock();
+       ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
+       rcu_read_unlock();
+
+       return ret;
+}
+
+static int tg_set_rt_bandwidth(struct task_group *tg,
+               u64 rt_period, u64 rt_runtime)
+{
+       int i, err = 0;
+
+       mutex_lock(&rt_constraints_mutex);
+       read_lock(&tasklist_lock);
+       err = __rt_schedulable(tg, rt_period, rt_runtime);
+       if (err)
+               goto unlock;
+
+       raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
+       tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
+       tg->rt_bandwidth.rt_runtime = rt_runtime;
+
+       for_each_possible_cpu(i) {
+               struct rt_rq *rt_rq = tg->rt_rq[i];
+
+               raw_spin_lock(&rt_rq->rt_runtime_lock);
+               rt_rq->rt_runtime = rt_runtime;
+               raw_spin_unlock(&rt_rq->rt_runtime_lock);
+       }
+       raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
+unlock:
+       read_unlock(&tasklist_lock);
+       mutex_unlock(&rt_constraints_mutex);
+
+       return err;
+}
+
+int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
+{
+       u64 rt_runtime, rt_period;
+
+       rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
+       rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
+       if (rt_runtime_us < 0)
+               rt_runtime = RUNTIME_INF;
+
+       return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
+}
+
+long sched_group_rt_runtime(struct task_group *tg)
+{
+       u64 rt_runtime_us;
+
+       if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
+               return -1;
+
+       rt_runtime_us = tg->rt_bandwidth.rt_runtime;
+       do_div(rt_runtime_us, NSEC_PER_USEC);
+       return rt_runtime_us;
+}
+
+int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
+{
+       u64 rt_runtime, rt_period;
+
+       rt_period = (u64)rt_period_us * NSEC_PER_USEC;
+       rt_runtime = tg->rt_bandwidth.rt_runtime;
+
+       if (rt_period == 0)
+               return -EINVAL;
+
+       return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
+}
+
+long sched_group_rt_period(struct task_group *tg)
+{
+       u64 rt_period_us;
+
+       rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
+       do_div(rt_period_us, NSEC_PER_USEC);
+       return rt_period_us;
+}
+
+static int sched_rt_global_constraints(void)
+{
+       u64 runtime, period;
+       int ret = 0;
+
+       if (sysctl_sched_rt_period <= 0)
+               return -EINVAL;
+
+       runtime = global_rt_runtime();
+       period = global_rt_period();
+
+       /*
+        * Sanity check on the sysctl variables.
+        */
+       if (runtime > period && runtime != RUNTIME_INF)
+               return -EINVAL;
+
+       mutex_lock(&rt_constraints_mutex);
+       read_lock(&tasklist_lock);
+       ret = __rt_schedulable(NULL, 0, 0);
+       read_unlock(&tasklist_lock);
+       mutex_unlock(&rt_constraints_mutex);
+
+       return ret;
+}
+
+int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
+{
+       /* Don't accept realtime tasks when there is no way for them to run */
+       if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
+               return 0;
+
+       return 1;
+}
+
+#else /* !CONFIG_RT_GROUP_SCHED */
+static int sched_rt_global_constraints(void)
+{
+       unsigned long flags;
+       int i;
+
+       if (sysctl_sched_rt_period <= 0)
+               return -EINVAL;
+
+       /*
+        * There are always some RT tasks in the root group
+        * -- migration, kstopmachine etc.
+        */
+       if (sysctl_sched_rt_runtime == 0)
+               return -EBUSY;
+
+       raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
+       for_each_possible_cpu(i) {
+               struct rt_rq *rt_rq = &cpu_rq(i)->rt;
+
+               raw_spin_lock(&rt_rq->rt_runtime_lock);
+               rt_rq->rt_runtime = global_rt_runtime();
+               raw_spin_unlock(&rt_rq->rt_runtime_lock);
+       }
+       raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
+
+       return 0;
+}
+#endif /* CONFIG_RT_GROUP_SCHED */
+
+int sched_rt_handler(struct ctl_table *table, int write,
+               void __user *buffer, size_t *lenp,
+               loff_t *ppos)
+{
+       int ret;
+       int old_period, old_runtime;
+       static DEFINE_MUTEX(mutex);
+
+       mutex_lock(&mutex);
+       old_period = sysctl_sched_rt_period;
+       old_runtime = sysctl_sched_rt_runtime;
+
+       ret = proc_dointvec(table, write, buffer, lenp, ppos);
+
+       if (!ret && write) {
+               ret = sched_rt_global_constraints();
+               if (ret) {
+                       sysctl_sched_rt_period = old_period;
+                       sysctl_sched_rt_runtime = old_runtime;
+               } else {
+                       def_rt_bandwidth.rt_runtime = global_rt_runtime();
+                       def_rt_bandwidth.rt_period =
+                               ns_to_ktime(global_rt_period());
+               }
+       }
+       mutex_unlock(&mutex);
+
+       return ret;
+}
+
+#ifdef CONFIG_CGROUP_SCHED
+
+/* return corresponding task_group object of a cgroup */
+static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
+{
+       return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
+                           struct task_group, css);
+}
+
+static struct cgroup_subsys_state *
+cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+       struct task_group *tg, *parent;
+
+       if (!cgrp->parent) {
+               /* This is early initialization for the top cgroup */
+               return &root_task_group.css;
+       }
+
+       parent = cgroup_tg(cgrp->parent);
+       tg = sched_create_group(parent);
+       if (IS_ERR(tg))
+               return ERR_PTR(-ENOMEM);
+
+       return &tg->css;
+}
+
+static void
+cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+       struct task_group *tg = cgroup_tg(cgrp);
+
+       sched_destroy_group(tg);
+}
+
+static int
+cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
+{
+#ifdef CONFIG_RT_GROUP_SCHED
+       if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
+               return -EINVAL;
+#else
+       /* We don't support RT-tasks being in separate groups */
+       if (tsk->sched_class != &fair_sched_class)
+               return -EINVAL;
+#endif
+       return 0;
+}
+
+static void
+cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
+{
+       sched_move_task(tsk);
+}
+
+static void
+cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
+               struct cgroup *old_cgrp, struct task_struct *task)
+{
+       /*
+        * cgroup_exit() is called in the copy_process() failure path.
+        * Ignore this case since the task hasn't run yet; this avoids
+        * trying to poke half-freed task state from generic code.
+        */
+       if (!(task->flags & PF_EXITING))
+               return;
+
+       sched_move_task(task);
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
+                               u64 shareval)
+{
+       return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval));
+}
+
+static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
+{
+       struct task_group *tg = cgroup_tg(cgrp);
+
+       return (u64) scale_load_down(tg->shares);
+}
+
+#ifdef CONFIG_CFS_BANDWIDTH
+static DEFINE_MUTEX(cfs_constraints_mutex);
+
+const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
+const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
+
+static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
+
+static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
+{
+       int i, ret = 0, runtime_enabled, runtime_was_enabled;
+       struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+
+       if (tg == &root_task_group)
+               return -EINVAL;
+
+       /*
+        * Ensure we have at least some amount of bandwidth every period.  This is
+        * to prevent reaching a state of large arrears when throttled via
+        * entity_tick() resulting in prolonged exit starvation.
+        */
+       if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
+               return -EINVAL;
+
+       /*
+        * Likewise, bound things on the other side by preventing insane quota
+        * periods.  This also allows us to normalize in computing quota
+        * feasibility.
+        */
+       if (period > max_cfs_quota_period)
+               return -EINVAL;
+
+       mutex_lock(&cfs_constraints_mutex);
+       ret = __cfs_schedulable(tg, period, quota);
+       if (ret)
+               goto out_unlock;
+
+       runtime_enabled = quota != RUNTIME_INF;
+       runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
+       account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled);
+       raw_spin_lock_irq(&cfs_b->lock);
+       cfs_b->period = ns_to_ktime(period);
+       cfs_b->quota = quota;
+
+       __refill_cfs_bandwidth_runtime(cfs_b);
+       /* restart the period timer (if active) to handle new period expiry */
+       if (runtime_enabled && cfs_b->timer_active) {
+               /* force a reprogram */
+               cfs_b->timer_active = 0;
+               __start_cfs_bandwidth(cfs_b);
+       }
+       raw_spin_unlock_irq(&cfs_b->lock);
+
+       for_each_possible_cpu(i) {
+               struct cfs_rq *cfs_rq = tg->cfs_rq[i];
+               struct rq *rq = cfs_rq->rq;
+
+               raw_spin_lock_irq(&rq->lock);
+               cfs_rq->runtime_enabled = runtime_enabled;
+               cfs_rq->runtime_remaining = 0;
+
+               if (cfs_rq->throttled)
+                       unthrottle_cfs_rq(cfs_rq);
+               raw_spin_unlock_irq(&rq->lock);
+       }
+out_unlock:
+       mutex_unlock(&cfs_constraints_mutex);
+
+       return ret;
+}
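The checks above bound both knobs to the 1ms..1s range (quota may also be RUNTIME_INF). A worked example of the resulting semantics, stated in the microsecond units used by the cgroup interface below (illustrative only):

/*
 *   period = 100000 (100ms), quota =  50000 (50ms)  -> group capped at 0.5 CPU
 *   period = 100000 (100ms), quota = 200000 (200ms) -> group may use 2 CPUs
 *   quota written as -1 (RUNTIME_INF)               -> no cap
 * Requests with quota or period below 1ms, or with a period above 1s,
 * fail with -EINVAL per the checks in tg_set_cfs_bandwidth() above.
 */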
+
+int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
+{
+       u64 quota, period;
+
+       period = ktime_to_ns(tg->cfs_bandwidth.period);
+       if (cfs_quota_us < 0)
+               quota = RUNTIME_INF;
+       else
+               quota = (u64)cfs_quota_us * NSEC_PER_USEC;
+
+       return tg_set_cfs_bandwidth(tg, period, quota);
+}
+
+long tg_get_cfs_quota(struct task_group *tg)
+{
+       u64 quota_us;
+
+       if (tg->cfs_bandwidth.quota == RUNTIME_INF)
+               return -1;
+
+       quota_us = tg->cfs_bandwidth.quota;
+       do_div(quota_us, NSEC_PER_USEC);
+
+       return quota_us;
+}
+
+int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
+{
+       u64 quota, period;
+
+       period = (u64)cfs_period_us * NSEC_PER_USEC;
+       quota = tg->cfs_bandwidth.quota;
+
+       return tg_set_cfs_bandwidth(tg, period, quota);
+}
+
+long tg_get_cfs_period(struct task_group *tg)
+{
+       u64 cfs_period_us;
+
+       cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
+       do_div(cfs_period_us, NSEC_PER_USEC);
+
+       return cfs_period_us;
+}
+
+static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft)
+{
+       return tg_get_cfs_quota(cgroup_tg(cgrp));
+}
+
+static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype,
+                               s64 cfs_quota_us)
+{
+       return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us);
+}
+
+static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft)
+{
+       return tg_get_cfs_period(cgroup_tg(cgrp));
+}
+
+static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype,
+                               u64 cfs_period_us)
+{
+       return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
+}
+
+struct cfs_schedulable_data {
+       struct task_group *tg;
+       u64 period, quota;
+};
+
+/*
+ * normalize group quota/period to be quota/max_period
+ * note: units are usecs
+ */
+static u64 normalize_cfs_quota(struct task_group *tg,
+                              struct cfs_schedulable_data *d)
+{
+       u64 quota, period;
+
+       if (tg == d->tg) {
+               period = d->period;
+               quota = d->quota;
+       } else {
+               period = tg_get_cfs_period(tg);
+               quota = tg_get_cfs_quota(tg);
+       }
+
+       /* note: these should typically be equivalent */
+       if (quota == RUNTIME_INF || quota == -1)
+               return RUNTIME_INF;
+
+       return to_ratio(period, quota);
+}
+
+static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
+{
+       struct cfs_schedulable_data *d = data;
+       struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+       s64 quota = 0, parent_quota = -1;
+
+       if (!tg->parent) {
+               quota = RUNTIME_INF;
+       } else {
+               struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
+
+               quota = normalize_cfs_quota(tg, d);
+               parent_quota = parent_b->hierarchal_quota;
+
+               /*
+                * ensure max(child_quota) <= parent_quota, inherit when no
+                * limit is set
+                */
+               if (quota == RUNTIME_INF)
+                       quota = parent_quota;
+               else if (parent_quota != RUNTIME_INF && quota > parent_quota)
+                       return -EINVAL;
+       }
+       cfs_b->hierarchal_quota = quota;
+
+       return 0;
+}
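The top-down walk above guarantees that a child's quota/period ratio never exceeds its parent's. A worked example, in the microsecond units produced by __cfs_schedulable() (illustrative only):

/*
 *   parent: quota =  50000us, period = 100000us  -> ratio 0.5 CPU
 *   child:  quota =  40000us, period = 100000us  -> ratio 0.4, accepted
 *   child:  quota =  80000us, period = 100000us  -> ratio 0.8, rejected
 *           (quota > parent_quota after to_ratio() normalization -> -EINVAL)
 *   child:  quota unset (RUNTIME_INF)            -> treated as the parent's
 *           0.5 ratio for the feasibility check
 */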
+
+static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
+{
+       int ret;
+       struct cfs_schedulable_data data = {
+               .tg = tg,
+               .period = period,
+               .quota = quota,
+       };
+
+       if (quota != RUNTIME_INF) {
+               do_div(data.period, NSEC_PER_USEC);
+               do_div(data.quota, NSEC_PER_USEC);
+       }
+
+       rcu_read_lock();
+       ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
+       rcu_read_unlock();
+
+       return ret;
+}
+
+static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
+               struct cgroup_map_cb *cb)
+{
+       struct task_group *tg = cgroup_tg(cgrp);
+       struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+
+       cb->fill(cb, "nr_periods", cfs_b->nr_periods);
+       cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
+       cb->fill(cb, "throttled_time", cfs_b->throttled_time);
+
+       return 0;
+}
+#endif /* CONFIG_CFS_BANDWIDTH */
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
+#ifdef CONFIG_RT_GROUP_SCHED
+static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
+                               s64 val)
+{
+       return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
+}
+
+static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
+{
+       return sched_group_rt_runtime(cgroup_tg(cgrp));
+}
+
+static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
+               u64 rt_period_us)
+{
+       return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);
+}
+
+static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
+{
+       return sched_group_rt_period(cgroup_tg(cgrp));
+}
+#endif /* CONFIG_RT_GROUP_SCHED */
+
+static struct cftype cpu_files[] = {
+#ifdef CONFIG_FAIR_GROUP_SCHED
+       {
+               .name = "shares",
+               .read_u64 = cpu_shares_read_u64,
+               .write_u64 = cpu_shares_write_u64,
+       },
+#endif
+#ifdef CONFIG_CFS_BANDWIDTH
+       {
+               .name = "cfs_quota_us",
+               .read_s64 = cpu_cfs_quota_read_s64,
+               .write_s64 = cpu_cfs_quota_write_s64,
+       },
+       {
+               .name = "cfs_period_us",
+               .read_u64 = cpu_cfs_period_read_u64,
+               .write_u64 = cpu_cfs_period_write_u64,
+       },
+       {
+               .name = "stat",
+               .read_map = cpu_stats_show,
+       },
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
+       {
+               .name = "rt_runtime_us",
+               .read_s64 = cpu_rt_runtime_read,
+               .write_s64 = cpu_rt_runtime_write,
+       },
+       {
+               .name = "rt_period_us",
+               .read_u64 = cpu_rt_period_read_uint,
+               .write_u64 = cpu_rt_period_write_uint,
+       },
+#endif
+};
+
+static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
+{
+       return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
+}
+
+struct cgroup_subsys cpu_cgroup_subsys = {
+       .name           = "cpu",
+       .create         = cpu_cgroup_create,
+       .destroy        = cpu_cgroup_destroy,
+       .can_attach_task = cpu_cgroup_can_attach_task,
+       .attach_task    = cpu_cgroup_attach_task,
+       .exit           = cpu_cgroup_exit,
+       .populate       = cpu_cgroup_populate,
+       .subsys_id      = cpu_cgroup_subsys_id,
+       .early_init     = 1,
+};
+
+#endif /* CONFIG_CGROUP_SCHED */
+
+#ifdef CONFIG_CGROUP_CPUACCT
+
+/*
+ * CPU accounting code for task groups.
+ *
+ * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
+ * (balbir@in.ibm.com).
+ */
+
+/* create a new cpu accounting group */
+static struct cgroup_subsys_state *cpuacct_create(
+       struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+       struct cpuacct *ca;
+
+       if (!cgrp->parent)
+               return &root_cpuacct.css;
+
+       ca = kzalloc(sizeof(*ca), GFP_KERNEL);
+       if (!ca)
+               goto out;
+
+       ca->cpuusage = alloc_percpu(u64);
+       if (!ca->cpuusage)
+               goto out_free_ca;
+
+       ca->cpustat = alloc_percpu(struct kernel_cpustat);
+       if (!ca->cpustat)
+               goto out_free_cpuusage;
+
+       return &ca->css;
+
+out_free_cpuusage:
+       free_percpu(ca->cpuusage);
+out_free_ca:
+       kfree(ca);
+out:
+       return ERR_PTR(-ENOMEM);
+}
+
+/* destroy an existing cpu accounting group */
+static void
+cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+       struct cpuacct *ca = cgroup_ca(cgrp);
+
+       free_percpu(ca->cpustat);
+       free_percpu(ca->cpuusage);
+       kfree(ca);
+}
+
+static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
+{
+       u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+       u64 data;
+
+#ifndef CONFIG_64BIT
+       /*
+        * Take rq->lock to make 64-bit read safe on 32-bit platforms.
+        */
+       raw_spin_lock_irq(&cpu_rq(cpu)->lock);
+       data = *cpuusage;
+       raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
+#else
+       data = *cpuusage;
+#endif
+
+       return data;
+}
+
+static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
+{
+       u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+
+#ifndef CONFIG_64BIT
+       /*
+        * Take rq->lock to make 64-bit write safe on 32-bit platforms.
+        */
+       raw_spin_lock_irq(&cpu_rq(cpu)->lock);
+       *cpuusage = val;
+       raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
+#else
+       *cpuusage = val;
+#endif
+}
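The #ifndef CONFIG_64BIT branches above exist because a u64 load or store is not single-copy atomic on 32-bit machines, so an unlocked reader could observe half of an in-flight update. A hedged sketch of the same idea using a private lock (example_lock and example_counter are illustrative names, not part of this change set):

static DEFINE_SPINLOCK(example_lock);   /* hypothetical */
static u64 example_counter;             /* hypothetical */

static u64 example_counter_read(void)
{
        u64 val;

#ifndef CONFIG_64BIT
        /* two 32-bit loads: serialize against concurrent 64-bit stores */
        spin_lock(&example_lock);
        val = example_counter;
        spin_unlock(&example_lock);
#else
        /* a single aligned 64-bit load cannot be torn */
        val = example_counter;
#endif
        return val;
}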
+
+/* return total cpu usage (in nanoseconds) of a group */
+static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
+{
+       struct cpuacct *ca = cgroup_ca(cgrp);
+       u64 totalcpuusage = 0;
+       int i;
+
+       for_each_present_cpu(i)
+               totalcpuusage += cpuacct_cpuusage_read(ca, i);
+
+       return totalcpuusage;
+}
+
+static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
+                                                               u64 reset)
+{
+       struct cpuacct *ca = cgroup_ca(cgrp);
+       int err = 0;
+       int i;
+
+       if (reset) {
+               err = -EINVAL;
+               goto out;
+       }
+
+       for_each_present_cpu(i)
+               cpuacct_cpuusage_write(ca, i, 0);
+
+out:
+       return err;
+}
+
+static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
+                                  struct seq_file *m)
+{
+       struct cpuacct *ca = cgroup_ca(cgroup);
+       u64 percpu;
+       int i;
+
+       for_each_present_cpu(i) {
+               percpu = cpuacct_cpuusage_read(ca, i);
+               seq_printf(m, "%llu ", (unsigned long long) percpu);
+       }
+       seq_printf(m, "\n");
+       return 0;
+}
+
+static const char *cpuacct_stat_desc[] = {
+       [CPUACCT_STAT_USER] = "user",
+       [CPUACCT_STAT_SYSTEM] = "system",
+};
+
+static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
+                             struct cgroup_map_cb *cb)
+{
+       struct cpuacct *ca = cgroup_ca(cgrp);
+       int cpu;
+       s64 val = 0;
+
+       for_each_online_cpu(cpu) {
+               struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
+               val += kcpustat->cpustat[CPUTIME_USER];
+               val += kcpustat->cpustat[CPUTIME_NICE];
+       }
+       val = cputime64_to_clock_t(val);
+       cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
+
+       val = 0;
+       for_each_online_cpu(cpu) {
+               struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
+               val += kcpustat->cpustat[CPUTIME_SYSTEM];
+               val += kcpustat->cpustat[CPUTIME_IRQ];
+               val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
+       }
+
+       val = cputime64_to_clock_t(val);
+       cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
+
+       return 0;
+}
+
+static struct cftype files[] = {
+       {
+               .name = "usage",
+               .read_u64 = cpuusage_read,
+               .write_u64 = cpuusage_write,
+       },
+       {
+               .name = "usage_percpu",
+               .read_seq_string = cpuacct_percpu_seq_read,
+       },
+       {
+               .name = "stat",
+               .read_map = cpuacct_stats_show,
+       },
+};
+
+static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+       return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
+}
+
+/*
+ * charge this task's execution time to its accounting group.
+ *
+ * called with rq->lock held.
+ */
+void cpuacct_charge(struct task_struct *tsk, u64 cputime)
+{
+       struct cpuacct *ca;
+       int cpu;
+
+       if (unlikely(!cpuacct_subsys.active))
+               return;
+
+       cpu = task_cpu(tsk);
+
+       rcu_read_lock();
+
+       ca = task_ca(tsk);
+
+       for (; ca; ca = parent_ca(ca)) {
+               u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+               *cpuusage += cputime;
+       }
+
+       rcu_read_unlock();
+}
+
+struct cgroup_subsys cpuacct_subsys = {
+       .name = "cpuacct",
+       .create = cpuacct_create,
+       .destroy = cpuacct_destroy,
+       .populate = cpuacct_populate,
+       .subsys_id = cpuacct_subsys_id,
+};
+#endif /* CONFIG_CGROUP_CPUACCT */
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
new file mode 100644 (file)
index 0000000..b0d798e
--- /dev/null
@@ -0,0 +1,241 @@
+/*
+ *  kernel/sched/cpupri.c
+ *
+ *  CPU priority management
+ *
+ *  Copyright (C) 2007-2008 Novell
+ *
+ *  Author: Gregory Haskins <ghaskins@novell.com>
+ *
+ *  This code tracks the priority of each CPU so that global migration
+ *  decisions are easy to calculate.  Each CPU can be in a state as follows:
+ *
+ *                 (INVALID), IDLE, NORMAL, RT1, ... RT99
+ *
+ *  going from the lowest priority to the highest.  CPUs in the INVALID state
+ *  are not eligible for routing.  The system maintains this state with
+ *  a 2-dimensional bitmap (the first dimension for priority class, the second
+ *  for CPUs in that class).  Therefore a typical application without affinity
+ *  restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
+ *  searches).  For tasks with affinity restrictions, the algorithm has a
+ *  worst case complexity of O(min(102, nr_domcpus)), though the scenario that
+ *  yields the worst case search is fairly contrived.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; version 2
+ *  of the License.
+ */
+
+#include <linux/gfp.h>
+#include "cpupri.h"
+
+/* Convert between a 140-based task->prio and our 102-based cpupri */
+static int convert_prio(int prio)
+{
+       int cpupri;
+
+       if (prio == CPUPRI_INVALID)
+               cpupri = CPUPRI_INVALID;
+       else if (prio == MAX_PRIO)
+               cpupri = CPUPRI_IDLE;
+       else if (prio >= MAX_RT_PRIO)
+               cpupri = CPUPRI_NORMAL;
+       else
+               cpupri = MAX_RT_PRIO - prio + 1;
+
+       return cpupri;
+}
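A worked example of the mapping implemented by convert_prio(), with MAX_RT_PRIO = 100 and MAX_PRIO = 140 (illustrative comment only, derived from the code above):

/*
 *   prio == MAX_PRIO (140)            -> CPUPRI_IDLE   (0)
 *   prio in 100..139 (fair class)     -> CPUPRI_NORMAL (1)
 *   prio == 99 (lowest RT priority)   -> 2
 *   prio == 0  (highest RT priority)  -> 101
 * i.e. 102 distinct cpupri values (0..101), matching CPUPRI_NR_PRIORITIES,
 * with CPUPRI_INVALID (-1) reserved for CPUs that are not currently mapped.
 */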
+
+/**
+ * cpupri_find - find the best (lowest-pri) CPU in the system
+ * @cp: The cpupri context
+ * @p: The task
+ * @lowest_mask: A mask to fill in with selected CPUs (or NULL)
+ *
+ * Note: This function returns the recommended CPUs as calculated during the
+ * current invocation.  By the time the call returns, the CPUs may have in
+ * fact changed priorities any number of times.  While not ideal, it is not
+ * an issue of correctness since the normal rebalancer logic will correct
+ * any discrepancies created by racing against the uncertainty of the current
+ * priority configuration.
+ *
+ * Returns: (int)bool - CPUs were found
+ */
+int cpupri_find(struct cpupri *cp, struct task_struct *p,
+               struct cpumask *lowest_mask)
+{
+       int                  idx      = 0;
+       int                  task_pri = convert_prio(p->prio);
+
+       if (task_pri >= MAX_RT_PRIO)
+               return 0;
+
+       for (idx = 0; idx < task_pri; idx++) {
+               struct cpupri_vec *vec  = &cp->pri_to_cpu[idx];
+               int skip = 0;
+
+               if (!atomic_read(&(vec)->count))
+                       skip = 1;
+               /*
+                * When looking at the vector, we need to read the counter,
+                * do a memory barrier, then read the mask.
+                *
+                * Note: This is still all racy, but we can deal with it.
+                *  Ideally, we only want to look at masks that are set.
+                *
+                *  If a mask is not set, then the only thing wrong is that we
+                *  did a little more work than necessary.
+                *
+                *  If we read a zero count but the mask is set, because of the
+                *  memory barriers, that can only happen when the highest prio
+                *  task for a run queue has left the run queue, in which case,
+                *  it will be followed by a pull. If the task we are processing
+                *  fails to find a proper place to go, that pull request will
+                *  pull this task if the run queue is running at a lower
+                *  priority.
+                */
+               smp_rmb();
+
+               /* Need to do the rmb for every iteration */
+               if (skip)
+                       continue;
+
+               if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
+                       continue;
+
+               if (lowest_mask) {
+                       cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
+
+                       /*
+                        * We have to ensure that we have at least one bit
+                        * still set in the array, since the map could have
+                        * been concurrently emptied between the first and
+                        * second reads of vec->mask.  If we hit this
+                        * condition, simply act as though we never hit this
+                        * priority level and continue on.
+                        */
+                       if (cpumask_any(lowest_mask) >= nr_cpu_ids)
+                               continue;
+               }
+
+               return 1;
+       }
+
+       return 0;
+}
+
+/**
+ * cpupri_set - update the cpu priority setting
+ * @cp: The cpupri context
+ * @cpu: The target cpu
+ * @newpri: The priority (INVALID-RT99) to assign to this CPU
+ *
+ * Note: Assumes cpu_rq(cpu)->lock is locked
+ *
+ * Returns: (void)
+ */
+void cpupri_set(struct cpupri *cp, int cpu, int newpri)
+{
+       int                 *currpri = &cp->cpu_to_pri[cpu];
+       int                  oldpri  = *currpri;
+       int                  do_mb = 0;
+
+       newpri = convert_prio(newpri);
+
+       BUG_ON(newpri >= CPUPRI_NR_PRIORITIES);
+
+       if (newpri == oldpri)
+               return;
+
+       /*
+        * If the cpu is currently mapped to a different value, we
+        * need to map it to the new value then remove the old value.
+        * Note, we must add the new value first, otherwise we risk the
+        * cpu being missed by the priority loop in cpupri_find.
+        */
+       if (likely(newpri != CPUPRI_INVALID)) {
+               struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
+
+               cpumask_set_cpu(cpu, vec->mask);
+               /*
+                * When adding a new vector, we update the mask first,
+                * do a write memory barrier, and then update the count, to
+                * make sure the vector is visible when count is set.
+                */
+               smp_mb__before_atomic_inc();
+               atomic_inc(&(vec)->count);
+               do_mb = 1;
+       }
+       if (likely(oldpri != CPUPRI_INVALID)) {
+               struct cpupri_vec *vec  = &cp->pri_to_cpu[oldpri];
+
+               /*
+                * Because the order of modification of the vec->count
+                * is important, we must make sure that the update
+                * of the new prio is seen before we decrement the
+                * old prio. This makes sure that the loop sees
+                * one or the other when we raise the priority of
+                * the run queue. We don't care about when we lower the
+                * priority, as that will trigger an rt pull anyway.
+                *
+                * We only need to do a memory barrier if we updated
+                * the new priority vec.
+                */
+               if (do_mb)
+                       smp_mb__after_atomic_inc();
+
+               /*
+                * When removing from the vector, we decrement the counter first,
+                * do a memory barrier and then clear the mask.
+                */
+               atomic_dec(&(vec)->count);
+               smp_mb__after_atomic_inc();
+               cpumask_clear_cpu(cpu, vec->mask);
+       }
+
+       *currpri = newpri;
+}
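The barriers above pair with the smp_rmb() in cpupri_find(); an illustrative sketch of the ordering (not part of this change set):

/*
 *   cpupri_set() (writer)               cpupri_find() (reader)
 *   ---------------------               ----------------------
 *   cpumask_set_cpu(cpu, vec->mask)     atomic_read(&vec->count)
 *   smp_mb__before_atomic_inc()         smp_rmb()
 *   atomic_inc(&vec->count)             cpumask_any_and(..., vec->mask)
 *
 * A reader that observes a non-zero count is therefore guaranteed to also
 * observe the mask bit; the worst case is scanning a vector whose mask was
 * already cleared, which only costs a little extra work.
 */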
+
+/**
+ * cpupri_init - initialize the cpupri structure
+ * @cp: The cpupri context
+ *
+ * Returns: -ENOMEM on allocation failure, 0 on success.
+ */
+int cpupri_init(struct cpupri *cp)
+{
+       int i;
+
+       memset(cp, 0, sizeof(*cp));
+
+       for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
+               struct cpupri_vec *vec = &cp->pri_to_cpu[i];
+
+               atomic_set(&vec->count, 0);
+               if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
+                       goto cleanup;
+       }
+
+       for_each_possible_cpu(i)
+               cp->cpu_to_pri[i] = CPUPRI_INVALID;
+       return 0;
+
+cleanup:
+       for (i--; i >= 0; i--)
+               free_cpumask_var(cp->pri_to_cpu[i].mask);
+       return -ENOMEM;
+}
+
+/**
+ * cpupri_cleanup - clean up the cpupri structure
+ * @cp: The cpupri context
+ */
+void cpupri_cleanup(struct cpupri *cp)
+{
+       int i;
+
+       for (i = 0; i < CPUPRI_NR_PRIORITIES; i++)
+               free_cpumask_var(cp->pri_to_cpu[i].mask);
+}
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
new file mode 100644 (file)
index 0000000..f6d7561
--- /dev/null
@@ -0,0 +1,34 @@
+#ifndef _LINUX_CPUPRI_H
+#define _LINUX_CPUPRI_H
+
+#include <linux/sched.h>
+
+#define CPUPRI_NR_PRIORITIES   (MAX_RT_PRIO + 2)
+
+#define CPUPRI_INVALID -1
+#define CPUPRI_IDLE     0
+#define CPUPRI_NORMAL   1
+/* values 2-101 are RT priorities 0-99 */
+
+struct cpupri_vec {
+       atomic_t        count;
+       cpumask_var_t   mask;
+};
+
+struct cpupri {
+       struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
+       int               cpu_to_pri[NR_CPUS];
+};
+
+#ifdef CONFIG_SMP
+int  cpupri_find(struct cpupri *cp,
+                struct task_struct *p, struct cpumask *lowest_mask);
+void cpupri_set(struct cpupri *cp, int cpu, int pri);
+int cpupri_init(struct cpupri *cp);
+void cpupri_cleanup(struct cpupri *cp);
+#else
+#define cpupri_set(cp, cpu, pri) do { } while (0)
+#define cpupri_init(cp) do { } while (0)
+#endif
+
+#endif /* _LINUX_CPUPRI_H */
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
new file mode 100644 (file)
index 0000000..2a075e1
--- /dev/null
@@ -0,0 +1,510 @@
+/*
+ * kernel/sched/debug.c
+ *
+ * Print the CFS rbtree
+ *
+ * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/proc_fs.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/kallsyms.h>
+#include <linux/utsname.h>
+
+#include "sched.h"
+
+static DEFINE_SPINLOCK(sched_debug_lock);
+
+/*
+ * This allows printing both to /proc/sched_debug and
+ * to the console
+ */
+#define SEQ_printf(m, x...)                    \
+ do {                                          \
+       if (m)                                  \
+               seq_printf(m, x);               \
+       else                                    \
+               printk(x);                      \
+ } while (0)
+
+/*
+ * Ease the printing of nsec fields:
+ */
+static long long nsec_high(unsigned long long nsec)
+{
+       if ((long long)nsec < 0) {
+               nsec = -nsec;
+               do_div(nsec, 1000000);
+               return -nsec;
+       }
+       do_div(nsec, 1000000);
+
+       return nsec;
+}
+
+static unsigned long nsec_low(unsigned long long nsec)
+{
+       if ((long long)nsec < 0)
+               nsec = -nsec;
+
+       return do_div(nsec, 1000000);
+}
+
+#define SPLIT_NS(x) nsec_high(x), nsec_low(x)
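SPLIT_NS() expands to two printf arguments; a worked example of the formatting used throughout this file (illustrative only):

/*
 *   SEQ_printf(m, "%Ld.%06ld\n", SPLIT_NS(1234567890LL));
 *     -> "1234.567890"   (nanoseconds rendered as milliseconds with six
 *                          fractional digits)
 *   SEQ_printf(m, "%Ld.%06ld\n", SPLIT_NS(-1234567890LL));
 *     -> "-1234.567890"  (nsec_high() carries the sign, nsec_low() does not)
 */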
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
+{
+       struct sched_entity *se = tg->se[cpu];
+       if (!se)
+               return;
+
+#define P(F) \
+       SEQ_printf(m, "  .%-30s: %lld\n", #F, (long long)F)
+#define PN(F) \
+       SEQ_printf(m, "  .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
+
+       PN(se->exec_start);
+       PN(se->vruntime);
+       PN(se->sum_exec_runtime);
+#ifdef CONFIG_SCHEDSTATS
+       PN(se->statistics.wait_start);
+       PN(se->statistics.sleep_start);
+       PN(se->statistics.block_start);
+       PN(se->statistics.sleep_max);
+       PN(se->statistics.block_max);
+       PN(se->statistics.exec_max);
+       PN(se->statistics.slice_max);
+       PN(se->statistics.wait_max);
+       PN(se->statistics.wait_sum);
+       P(se->statistics.wait_count);
+#endif
+       P(se->load.weight);
+#undef PN
+#undef P
+}
+#endif
+
+#ifdef CONFIG_CGROUP_SCHED
+static char group_path[PATH_MAX];
+
+static char *task_group_path(struct task_group *tg)
+{
+       if (autogroup_path(tg, group_path, PATH_MAX))
+               return group_path;
+
+       /*
+        * May be NULL if the underlying cgroup isn't fully-created yet
+        */
+       if (!tg->css.cgroup) {
+               group_path[0] = '\0';
+               return group_path;
+       }
+       cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
+       return group_path;
+}
+#endif
+
+static void
+print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
+{
+       if (rq->curr == p)
+               SEQ_printf(m, "R");
+       else
+               SEQ_printf(m, " ");
+
+       SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ",
+               p->comm, p->pid,
+               SPLIT_NS(p->se.vruntime),
+               (long long)(p->nvcsw + p->nivcsw),
+               p->prio);
+#ifdef CONFIG_SCHEDSTATS
+       SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
+               SPLIT_NS(p->se.vruntime),
+               SPLIT_NS(p->se.sum_exec_runtime),
+               SPLIT_NS(p->se.statistics.sum_sleep_runtime));
+#else
+       SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
+               0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
+#endif
+#ifdef CONFIG_CGROUP_SCHED
+       SEQ_printf(m, " %s", task_group_path(task_group(p)));
+#endif
+
+       SEQ_printf(m, "\n");
+}
+
+static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
+{
+       struct task_struct *g, *p;
+       unsigned long flags;
+
+       SEQ_printf(m,
+       "\nrunnable tasks:\n"
+       "            task   PID         tree-key  switches  prio"
+       "     exec-runtime         sum-exec        sum-sleep\n"
+       "------------------------------------------------------"
+       "----------------------------------------------------\n");
+
+       read_lock_irqsave(&tasklist_lock, flags);
+
+       do_each_thread(g, p) {
+               if (!p->on_rq || task_cpu(p) != rq_cpu)
+                       continue;
+
+               print_task(m, rq, p);
+       } while_each_thread(g, p);
+
+       read_unlock_irqrestore(&tasklist_lock, flags);
+}
+
+void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
+{
+       s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
+               spread, rq0_min_vruntime, spread0;
+       struct rq *rq = cpu_rq(cpu);
+       struct sched_entity *last;
+       unsigned long flags;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+       SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg));
+#else
+       SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
+#endif
+       SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "exec_clock",
+                       SPLIT_NS(cfs_rq->exec_clock));
+
+       raw_spin_lock_irqsave(&rq->lock, flags);
+       if (cfs_rq->rb_leftmost)
+               MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime;
+       last = __pick_last_entity(cfs_rq);
+       if (last)
+               max_vruntime = last->vruntime;
+       min_vruntime = cfs_rq->min_vruntime;
+       rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime;
+       raw_spin_unlock_irqrestore(&rq->lock, flags);
+       SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "MIN_vruntime",
+                       SPLIT_NS(MIN_vruntime));
+       SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "min_vruntime",
+                       SPLIT_NS(min_vruntime));
+       SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "max_vruntime",
+                       SPLIT_NS(max_vruntime));
+       spread = max_vruntime - MIN_vruntime;
+       SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "spread",
+                       SPLIT_NS(spread));
+       spread0 = min_vruntime - rq0_min_vruntime;
+       SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "spread0",
+                       SPLIT_NS(spread0));
+       SEQ_printf(m, "  .%-30s: %d\n", "nr_spread_over",
+                       cfs_rq->nr_spread_over);
+       SEQ_printf(m, "  .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
+       SEQ_printf(m, "  .%-30s: %ld\n", "load", cfs_rq->load.weight);
+#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_SMP
+       SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "load_avg",
+                       SPLIT_NS(cfs_rq->load_avg));
+       SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "load_period",
+                       SPLIT_NS(cfs_rq->load_period));
+       SEQ_printf(m, "  .%-30s: %ld\n", "load_contrib",
+                       cfs_rq->load_contribution);
+       SEQ_printf(m, "  .%-30s: %d\n", "load_tg",
+                       atomic_read(&cfs_rq->tg->load_weight));
+#endif
+
+       print_cfs_group_stats(m, cpu, cfs_rq->tg);
+#endif
+}
+
+void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
+{
+#ifdef CONFIG_RT_GROUP_SCHED
+       SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg));
+#else
+       SEQ_printf(m, "\nrt_rq[%d]:\n", cpu);
+#endif
+
+#define P(x) \
+       SEQ_printf(m, "  .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
+#define PN(x) \
+       SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x))
+
+       P(rt_nr_running);
+       P(rt_throttled);
+       PN(rt_time);
+       PN(rt_runtime);
+
+#undef PN
+#undef P
+}
+
+extern __read_mostly int sched_clock_running;
+
+static void print_cpu(struct seq_file *m, int cpu)
+{
+       struct rq *rq = cpu_rq(cpu);
+       unsigned long flags;
+
+#ifdef CONFIG_X86
+       {
+               unsigned int freq = cpu_khz ? : 1;
+
+               SEQ_printf(m, "\ncpu#%d, %u.%03u MHz\n",
+                          cpu, freq / 1000, (freq % 1000));
+       }
+#else
+       SEQ_printf(m, "\ncpu#%d\n", cpu);
+#endif
+
+#define P(x) \
+       SEQ_printf(m, "  .%-30s: %Ld\n", #x, (long long)(rq->x))
+#define PN(x) \
+       SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x))
+
+       P(nr_running);
+       SEQ_printf(m, "  .%-30s: %lu\n", "load",
+                  rq->load.weight);
+       P(nr_switches);
+       P(nr_load_updates);
+       P(nr_uninterruptible);
+       PN(next_balance);
+       P(curr->pid);
+       PN(clock);
+       P(cpu_load[0]);
+       P(cpu_load[1]);
+       P(cpu_load[2]);
+       P(cpu_load[3]);
+       P(cpu_load[4]);
+#undef P
+#undef PN
+
+#ifdef CONFIG_SCHEDSTATS
+#define P(n) SEQ_printf(m, "  .%-30s: %d\n", #n, rq->n);
+#define P64(n) SEQ_printf(m, "  .%-30s: %Ld\n", #n, rq->n);
+
+       P(yld_count);
+
+       P(sched_switch);
+       P(sched_count);
+       P(sched_goidle);
+#ifdef CONFIG_SMP
+       P64(avg_idle);
+#endif
+
+       P(ttwu_count);
+       P(ttwu_local);
+
+#undef P
+#undef P64
+#endif
+       spin_lock_irqsave(&sched_debug_lock, flags);
+       print_cfs_stats(m, cpu);
+       print_rt_stats(m, cpu);
+
+       rcu_read_lock();
+       print_rq(m, rq, cpu);
+       rcu_read_unlock();
+       spin_unlock_irqrestore(&sched_debug_lock, flags);
+}
+
+static const char *sched_tunable_scaling_names[] = {
+       "none",
+       "logarithmic",
+       "linear"
+};
+
+static int sched_debug_show(struct seq_file *m, void *v)
+{
+       u64 ktime, sched_clk, cpu_clk;
+       unsigned long flags;
+       int cpu;
+
+       local_irq_save(flags);
+       ktime = ktime_to_ns(ktime_get());
+       sched_clk = sched_clock();
+       cpu_clk = local_clock();
+       local_irq_restore(flags);
+
+       SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n",
+               init_utsname()->release,
+               (int)strcspn(init_utsname()->version, " "),
+               init_utsname()->version);
+
+#define P(x) \
+       SEQ_printf(m, "%-40s: %Ld\n", #x, (long long)(x))
+#define PN(x) \
+       SEQ_printf(m, "%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
+       PN(ktime);
+       PN(sched_clk);
+       PN(cpu_clk);
+       P(jiffies);
+#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+       P(sched_clock_stable);
+#endif
+#undef PN
+#undef P
+
+       SEQ_printf(m, "\n");
+       SEQ_printf(m, "sysctl_sched\n");
+
+#define P(x) \
+       SEQ_printf(m, "  .%-40s: %Ld\n", #x, (long long)(x))
+#define PN(x) \
+       SEQ_printf(m, "  .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
+       PN(sysctl_sched_latency);
+       PN(sysctl_sched_min_granularity);
+       PN(sysctl_sched_wakeup_granularity);
+       P(sysctl_sched_child_runs_first);
+       P(sysctl_sched_features);
+#undef PN
+#undef P
+
+       SEQ_printf(m, "  .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling",
+               sysctl_sched_tunable_scaling,
+               sched_tunable_scaling_names[sysctl_sched_tunable_scaling]);
+
+       for_each_online_cpu(cpu)
+               print_cpu(m, cpu);
+
+       SEQ_printf(m, "\n");
+
+       return 0;
+}
+
+void sysrq_sched_debug_show(void)
+{
+       sched_debug_show(NULL, NULL);
+}
+
+static int sched_debug_open(struct inode *inode, struct file *filp)
+{
+       return single_open(filp, sched_debug_show, NULL);
+}
+
+static const struct file_operations sched_debug_fops = {
+       .open           = sched_debug_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = single_release,
+};
+
+static int __init init_sched_debug_procfs(void)
+{
+       struct proc_dir_entry *pe;
+
+       pe = proc_create("sched_debug", 0444, NULL, &sched_debug_fops);
+       if (!pe)
+               return -ENOMEM;
+       return 0;
+}
+
+__initcall(init_sched_debug_procfs);
+
+void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
+{
+       unsigned long nr_switches;
+
+       SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid,
+                                               get_nr_threads(p));
+       SEQ_printf(m,
+               "---------------------------------------------------------\n");
+#define __P(F) \
+       SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)F)
+#define P(F) \
+       SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)p->F)
+#define __PN(F) \
+       SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
+#define PN(F) \
+       SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
+
+       PN(se.exec_start);
+       PN(se.vruntime);
+       PN(se.sum_exec_runtime);
+
+       nr_switches = p->nvcsw + p->nivcsw;
+
+#ifdef CONFIG_SCHEDSTATS
+       PN(se.statistics.wait_start);
+       PN(se.statistics.sleep_start);
+       PN(se.statistics.block_start);
+       PN(se.statistics.sleep_max);
+       PN(se.statistics.block_max);
+       PN(se.statistics.exec_max);
+       PN(se.statistics.slice_max);
+       PN(se.statistics.wait_max);
+       PN(se.statistics.wait_sum);
+       P(se.statistics.wait_count);
+       PN(se.statistics.iowait_sum);
+       P(se.statistics.iowait_count);
+       P(se.nr_migrations);
+       P(se.statistics.nr_migrations_cold);
+       P(se.statistics.nr_failed_migrations_affine);
+       P(se.statistics.nr_failed_migrations_running);
+       P(se.statistics.nr_failed_migrations_hot);
+       P(se.statistics.nr_forced_migrations);
+       P(se.statistics.nr_wakeups);
+       P(se.statistics.nr_wakeups_sync);
+       P(se.statistics.nr_wakeups_migrate);
+       P(se.statistics.nr_wakeups_local);
+       P(se.statistics.nr_wakeups_remote);
+       P(se.statistics.nr_wakeups_affine);
+       P(se.statistics.nr_wakeups_affine_attempts);
+       P(se.statistics.nr_wakeups_passive);
+       P(se.statistics.nr_wakeups_idle);
+
+       {
+               u64 avg_atom, avg_per_cpu;
+
+               avg_atom = p->se.sum_exec_runtime;
+               if (nr_switches)
+                       do_div(avg_atom, nr_switches);
+               else
+                       avg_atom = -1LL;
+
+               avg_per_cpu = p->se.sum_exec_runtime;
+               if (p->se.nr_migrations) {
+                       avg_per_cpu = div64_u64(avg_per_cpu,
+                                               p->se.nr_migrations);
+               } else {
+                       avg_per_cpu = -1LL;
+               }
+
+               __PN(avg_atom);
+               __PN(avg_per_cpu);
+       }
+#endif
+       __P(nr_switches);
+       SEQ_printf(m, "%-35s:%21Ld\n",
+                  "nr_voluntary_switches", (long long)p->nvcsw);
+       SEQ_printf(m, "%-35s:%21Ld\n",
+                  "nr_involuntary_switches", (long long)p->nivcsw);
+
+       P(se.load.weight);
+       P(policy);
+       P(prio);
+#undef PN
+#undef __PN
+#undef P
+#undef __P
+
+       {
+               unsigned int this_cpu = raw_smp_processor_id();
+               u64 t0, t1;
+
+               t0 = cpu_clock(this_cpu);
+               t1 = cpu_clock(this_cpu);
+               SEQ_printf(m, "%-35s:%21Ld\n",
+                          "clock-delta", (long long)(t1-t0));
+       }
+}
+
+void proc_sched_set_task(struct task_struct *p)
+{
+#ifdef CONFIG_SCHEDSTATS
+       memset(&p->se.statistics, 0, sizeof(p->se.statistics));
+#endif
+}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
new file mode 100644 (file)
index 0000000..8e42de9
--- /dev/null
@@ -0,0 +1,5592 @@
+/*
+ * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
+ *
+ *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *
+ *  Interactivity improvements by Mike Galbraith
+ *  (C) 2007 Mike Galbraith <efault@gmx.de>
+ *
+ *  Various enhancements by Dmitry Adamushko.
+ *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
+ *
+ *  Group scheduling enhancements by Srivatsa Vaddagiri
+ *  Copyright IBM Corporation, 2007
+ *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
+ *
+ *  Scaled math optimizations by Thomas Gleixner
+ *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
+ *
+ *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
+ *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ */
+
+#include <linux/latencytop.h>
+#include <linux/sched.h>
+#include <linux/cpumask.h>
+#include <linux/slab.h>
+#include <linux/profile.h>
+#include <linux/interrupt.h>
+
+#include <trace/events/sched.h>
+
+#include "sched.h"
+
+/*
+ * Targeted preemption latency for CPU-bound tasks:
+ * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
+ *
+ * NOTE: this latency value is not the same as the concept of
+ * 'timeslice length' - timeslices in CFS are of variable length
+ * and have no persistent notion like in traditional, time-slice
+ * based scheduling concepts.
+ *
+ * (to see the precise effective timeslice length of your workload,
+ *  run vmstat and monitor the context-switches (cs) field)
+ */
+unsigned int sysctl_sched_latency = 6000000ULL;
+unsigned int normalized_sysctl_sched_latency = 6000000ULL;
+
+/*
+ * The initial- and re-scaling of tunables is configurable
+ * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
+ *
+ * Options are:
+ * SCHED_TUNABLESCALING_NONE - unscaled, always *1
+ * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
+ * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
+ */
+enum sched_tunable_scaling sysctl_sched_tunable_scaling
+       = SCHED_TUNABLESCALING_LOG;
+
+/*
+ * Minimal preemption granularity for CPU-bound tasks:
+ * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ */
+unsigned int sysctl_sched_min_granularity = 750000ULL;
+unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
+
+/*
+ * sched_nr_latency is kept at sysctl_sched_latency / sysctl_sched_min_granularity
+ */
+static unsigned int sched_nr_latency = 8;
+
+/*
+ * After fork, child runs first. If set to 0 (default) then
+ * parent will (try to) run first.
+ */
+unsigned int sysctl_sched_child_runs_first __read_mostly;
+
+/*
+ * SCHED_OTHER wake-up granularity.
+ * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ *
+ * This option delays the preemption effects of decoupled workloads
+ * and reduces their over-scheduling. Synchronous workloads will still
+ * have immediate wakeup/sleep latencies.
+ */
+unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
+unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
+
+const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
+
+/*
+ * The exponential sliding window over which load is averaged for shares
+ * distribution.
+ * (default: 10msec)
+ */
+unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
+
+#ifdef CONFIG_CFS_BANDWIDTH
+/*
+ * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
+ * each time a cfs_rq requests quota.
+ *
+ * Note: in the case that the slice exceeds the runtime remaining (either due
+ * to consumption or the quota being specified to be smaller than the slice)
+ * we only ever issue the remaining available time.
+ *
+ * default: 5 msec, units: microseconds
+  */
+unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
+#endif
+
+/*
+ * Increase the granularity value when there are more CPUs,
+ * because with more CPUs the 'effective latency' as visible
+ * to users decreases. But the relationship is not linear,
+ * so pick a second-best guess by going with the log2 of the
+ * number of CPUs.
+ *
+ * This idea comes from the SD scheduler of Con Kolivas:
+ */
+static int get_update_sysctl_factor(void)
+{
+       unsigned int cpus = min_t(int, num_online_cpus(), 8);
+       unsigned int factor;
+
+       switch (sysctl_sched_tunable_scaling) {
+       case SCHED_TUNABLESCALING_NONE:
+               factor = 1;
+               break;
+       case SCHED_TUNABLESCALING_LINEAR:
+               factor = cpus;
+               break;
+       case SCHED_TUNABLESCALING_LOG:
+       default:
+               factor = 1 + ilog2(cpus);
+               break;
+       }
+
+       return factor;
+}
+
+static void update_sysctl(void)
+{
+       unsigned int factor = get_update_sysctl_factor();
+
+#define SET_SYSCTL(name) \
+       (sysctl_##name = (factor) * normalized_sysctl_##name)
+       SET_SYSCTL(sched_min_granularity);
+       SET_SYSCTL(sched_latency);
+       SET_SYSCTL(sched_wakeup_granularity);
+#undef SET_SYSCTL
+}
+
+void sched_init_granularity(void)
+{
+       update_sysctl();
+}
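A worked example of the scaling performed by update_sysctl() with the default SCHED_TUNABLESCALING_LOG and at least 8 online CPUs, where the CPU count is clamped to 8 so factor = 1 + ilog2(8) = 4 (illustrative only):

/*
 *   sysctl_sched_latency            = 4 * 6000000 ns = 24 ms
 *   sysctl_sched_min_granularity    = 4 *  750000 ns =  3 ms
 *   sysctl_sched_wakeup_granularity = 4 * 1000000 ns =  4 ms
 * A single-CPU machine keeps the normalized defaults (factor = 1).
 */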
+
+#if BITS_PER_LONG == 32
+# define WMULT_CONST   (~0UL)
+#else
+# define WMULT_CONST   (1UL << 32)
+#endif
+
+#define WMULT_SHIFT    32
+
+/*
+ * Shift right and round:
+ */
+#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
+
+/*
+ * delta *= weight / lw
+ */
+static unsigned long
+calc_delta_mine(unsigned long delta_exec, unsigned long weight,
+               struct load_weight *lw)
+{
+       u64 tmp;
+
+       /*
+        * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
+        * entities since MIN_SHARES = 2. Treat weight as 1 if less than
+        * 2^SCHED_LOAD_RESOLUTION.
+        */
+       if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
+               tmp = (u64)delta_exec * scale_load_down(weight);
+       else
+               tmp = (u64)delta_exec;
+
+       if (!lw->inv_weight) {
+               unsigned long w = scale_load_down(lw->weight);
+
+               if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
+                       lw->inv_weight = 1;
+               else if (unlikely(!w))
+                       lw->inv_weight = WMULT_CONST;
+               else
+                       lw->inv_weight = WMULT_CONST / w;
+       }
+
+       /*
+        * Check whether we'd overflow the 64-bit multiplication:
+        */
+       if (unlikely(tmp > WMULT_CONST))
+               tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
+                       WMULT_SHIFT/2);
+       else
+               tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
+
+       return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
+}
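Ignoring the extra SCHED_LOAD_RESOLUTION scaling, calc_delta_mine() computes delta_exec * weight / lw->weight in fixed point, with inv_weight = 2^32 / lw->weight. A worked example (illustrative only):

/*
 *   delta_exec = 1000000 ns, weight = 1024 (nice-0), lw->weight = 2048
 *   inv_weight = 2^32 / 2048 = 2097152
 *   result     = (1000000 * 1024) * 2097152 >> 32 = 500000 ns
 * e.g. as used by calc_delta_fair(), an entity with twice the nice-0
 * weight accrues vruntime at half the wall-clock rate.
 */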
+
+
+const struct sched_class fair_sched_class;
+
+/**************************************************************
+ * CFS operations on generic schedulable entities:
+ */
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+/* cpu runqueue to which this cfs_rq is attached */
+static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
+{
+       return cfs_rq->rq;
+}
+
+/* An entity is a task if it doesn't "own" a runqueue */
+#define entity_is_task(se)     (!se->my_q)
+
+static inline struct task_struct *task_of(struct sched_entity *se)
+{
+#ifdef CONFIG_SCHED_DEBUG
+       WARN_ON_ONCE(!entity_is_task(se));
+#endif
+       return container_of(se, struct task_struct, se);
+}
+
+/* Walk up scheduling entities hierarchy */
+#define for_each_sched_entity(se) \
+               for (; se; se = se->parent)
+
+static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
+{
+       return p->se.cfs_rq;
+}
+
+/* runqueue on which this entity is (to be) queued */
+static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
+{
+       return se->cfs_rq;
+}
+
+/* runqueue "owned" by this group */
+static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
+{
+       return grp->my_q;
+}
+
+static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+       if (!cfs_rq->on_list) {
+               /*
+                * Ensure we either appear before our parent (if already
+                * enqueued) or force our parent to appear after us when it is
+                * enqueued.  The fact that we always enqueue bottom-up
+                * reduces this to two cases.
+                */
+               if (cfs_rq->tg->parent &&
+                   cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
+                       list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
+                               &rq_of(cfs_rq)->leaf_cfs_rq_list);
+               } else {
+                       list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
+                               &rq_of(cfs_rq)->leaf_cfs_rq_list);
+               }
+
+               cfs_rq->on_list = 1;
+       }
+}
+
+static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+       if (cfs_rq->on_list) {
+               list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
+               cfs_rq->on_list = 0;
+       }
+}
+
+/* Iterate through all leaf cfs_rq's on a runqueue */
+#define for_each_leaf_cfs_rq(rq, cfs_rq) \
+       list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
+
+/* Do the two (enqueued) entities belong to the same group ? */
+static inline int
+is_same_group(struct sched_entity *se, struct sched_entity *pse)
+{
+       if (se->cfs_rq == pse->cfs_rq)
+               return 1;
+
+       return 0;
+}
+
+static inline struct sched_entity *parent_entity(struct sched_entity *se)
+{
+       return se->parent;
+}
+
+/* return depth at which a sched entity is present in the hierarchy */
+static inline int depth_se(struct sched_entity *se)
+{
+       int depth = 0;
+
+       for_each_sched_entity(se)
+               depth++;
+
+       return depth;
+}
+
+static void
+find_matching_se(struct sched_entity **se, struct sched_entity **pse)
+{
+       int se_depth, pse_depth;
+
+       /*
+        * A preemption test can only be made between sibling entities that are
+        * in the same cfs_rq, i.e. that share a common parent. Walk up the
+        * hierarchy of both tasks until we find ancestors that are siblings
+        * under a common parent.
+        */
+
+       /* First walk up until both entities are at same depth */
+       se_depth = depth_se(*se);
+       pse_depth = depth_se(*pse);
+
+       while (se_depth > pse_depth) {
+               se_depth--;
+               *se = parent_entity(*se);
+       }
+
+       while (pse_depth > se_depth) {
+               pse_depth--;
+               *pse = parent_entity(*pse);
+       }
+
+       while (!is_same_group(*se, *pse)) {
+               *se = parent_entity(*se);
+               *pse = parent_entity(*pse);
+       }
+}
+
+#else  /* !CONFIG_FAIR_GROUP_SCHED */
+
+static inline struct task_struct *task_of(struct sched_entity *se)
+{
+       return container_of(se, struct task_struct, se);
+}
+
+static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
+{
+       return container_of(cfs_rq, struct rq, cfs);
+}
+
+#define entity_is_task(se)     1
+
+#define for_each_sched_entity(se) \
+               for (; se; se = NULL)
+
+static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
+{
+       return &task_rq(p)->cfs;
+}
+
+static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
+{
+       struct task_struct *p = task_of(se);
+       struct rq *rq = task_rq(p);
+
+       return &rq->cfs;
+}
+
+/* runqueue "owned" by this group */
+static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
+{
+       return NULL;
+}
+
+static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+}
+
+static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+}
+
+#define for_each_leaf_cfs_rq(rq, cfs_rq) \
+               for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
+
+static inline int
+is_same_group(struct sched_entity *se, struct sched_entity *pse)
+{
+       return 1;
+}
+
+static inline struct sched_entity *parent_entity(struct sched_entity *se)
+{
+       return NULL;
+}
+
+static inline void
+find_matching_se(struct sched_entity **se, struct sched_entity **pse)
+{
+}
+
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
+static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
+                                  unsigned long delta_exec);
+
+/**************************************************************
+ * Scheduling class tree data structure manipulation methods:
+ */
+
+static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime)
+{
+       s64 delta = (s64)(vruntime - min_vruntime);
+       if (delta > 0)
+               min_vruntime = vruntime;
+
+       return min_vruntime;
+}
+
+static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
+{
+       s64 delta = (s64)(vruntime - min_vruntime);
+       if (delta < 0)
+               min_vruntime = vruntime;
+
+       return min_vruntime;
+}
+
+static inline int entity_before(struct sched_entity *a,
+                               struct sched_entity *b)
+{
+       return (s64)(a->vruntime - b->vruntime) < 0;
+}
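These comparisons are made on the signed difference rather than on the raw u64 values so they keep working across vruntime wraparound. A worked example (illustrative only):

/*
 *   a->vruntime = ULLONG_MAX - 10   (about to wrap)
 *   b->vruntime = 5                 (already wrapped)
 *   raw compare:    a->vruntime < b->vruntime            -> false
 *   signed compare: (s64)(a->vruntime - b->vruntime) = -16 < 0
 * so entity_before(a, b) still correctly reports a as before b, provided
 * the two values are within 2^63 of each other.
 */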
+
+static void update_min_vruntime(struct cfs_rq *cfs_rq)
+{
+       u64 vruntime = cfs_rq->min_vruntime;
+
+       if (cfs_rq->curr)
+               vruntime = cfs_rq->curr->vruntime;
+
+       if (cfs_rq->rb_leftmost) {
+               struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
+                                                  struct sched_entity,
+                                                  run_node);
+
+               if (!cfs_rq->curr)
+                       vruntime = se->vruntime;
+               else
+                       vruntime = min_vruntime(vruntime, se->vruntime);
+       }
+
+       cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
+#ifndef CONFIG_64BIT
+       smp_wmb();
+       cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
+#endif
+}
+
+/*
+ * Enqueue an entity into the rb-tree:
+ */
+static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+       struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
+       struct rb_node *parent = NULL;
+       struct sched_entity *entry;
+       int leftmost = 1;
+
+       /*
+        * Find the right place in the rbtree:
+        */
+       while (*link) {
+               parent = *link;
+               entry = rb_entry(parent, struct sched_entity, run_node);
+               /*
+                * We don't care about collisions. Nodes with
+                * the same key stay together.
+                */
+               if (entity_before(se, entry)) {
+                       link = &parent->rb_left;
+               } else {
+                       link = &parent->rb_right;
+                       leftmost = 0;
+               }
+       }
+
+       /*
+        * Maintain a cache of leftmost tree entries (it is frequently
+        * used):
+        */
+       if (leftmost)
+               cfs_rq->rb_leftmost = &se->run_node;
+
+       rb_link_node(&se->run_node, parent, link);
+       rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
+}
+
+static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+       if (cfs_rq->rb_leftmost == &se->run_node) {
+               struct rb_node *next_node;
+
+               next_node = rb_next(&se->run_node);
+               cfs_rq->rb_leftmost = next_node;
+       }
+
+       rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
+}
+
+struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
+{
+       struct rb_node *left = cfs_rq->rb_leftmost;
+
+       if (!left)
+               return NULL;
+
+       return rb_entry(left, struct sched_entity, run_node);
+}
+
+static struct sched_entity *__pick_next_entity(struct sched_entity *se)
+{
+       struct rb_node *next = rb_next(&se->run_node);
+
+       if (!next)
+               return NULL;
+
+       return rb_entry(next, struct sched_entity, run_node);
+}
+
+#ifdef CONFIG_SCHED_DEBUG
+struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
+{
+       struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
+
+       if (!last)
+               return NULL;
+
+       return rb_entry(last, struct sched_entity, run_node);
+}
+
+/**************************************************************
+ * Scheduling class statistics methods:
+ */
+
+int sched_proc_update_handler(struct ctl_table *table, int write,
+               void __user *buffer, size_t *lenp,
+               loff_t *ppos)
+{
+       int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+       int factor = get_update_sysctl_factor();
+
+       if (ret || !write)
+               return ret;
+
+       sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
+                                       sysctl_sched_min_granularity);
+
+#define WRT_SYSCTL(name) \
+       (normalized_sysctl_##name = sysctl_##name / (factor))
+       WRT_SYSCTL(sched_min_granularity);
+       WRT_SYSCTL(sched_latency);
+       WRT_SYSCTL(sched_wakeup_granularity);
+#undef WRT_SYSCTL
+
+       return 0;
+}
+#endif
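The handler stores normalized copies so the CPU-count scaling can be re-applied consistently; a rough model of the arithmetic, assuming the default logarithmic tunable scaling where the factor is 1 + ilog2(nr_cpus) (an assumption about get_update_sysctl_factor(), which is not shown in this hunk):

#include <stdio.h>

int main(void)
{
        unsigned int ncpus = 8;
        unsigned int factor = 1;        /* assumed: 1 + ilog2(ncpus) */

        while (ncpus >>= 1)
                factor++;               /* factor == 4 for 8 CPUs    */

        /* writing a 24 ms latency on this box stores 24 / 4 = 6 ms */
        printf("normalized latency = %u ms\n", 24 / factor);
        return 0;
}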
+
+/*
+ * delta /= w
+ */
+static inline unsigned long
+calc_delta_fair(unsigned long delta, struct sched_entity *se)
+{
+       if (unlikely(se->load.weight != NICE_0_LOAD))
+               delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
+
+       return delta;
+}
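This division by weight is what turns vruntime into a weighted clock: heavier entities see it advance more slowly for the same wall-clock runtime. Illustrative numbers only, taking NICE_0_LOAD as 1024:

#include <stdio.h>

int main(void)
{
        unsigned long long delta_exec = 1000000ULL;     /* ran 1 ms       */
        unsigned long nice0 = 1024;                     /* NICE_0_LOAD    */
        unsigned long weight = 2048;                    /* twice as heavy */

        /* delta * NICE_0_LOAD / weight, as in calc_delta_mine() */
        printf("vruntime advances by %llu ns\n", delta_exec * nice0 / weight);
        return 0;
}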
+
+/*
+ * The idea is to set a period in which each task runs once.
+ *
+ * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch
+ * this period because otherwise the slices get too small.
+ *
+ * p = (nr <= nl) ? l : l*nr/nl
+ */
+static u64 __sched_period(unsigned long nr_running)
+{
+       u64 period = sysctl_sched_latency;
+       unsigned long nr_latency = sched_nr_latency;
+
+       if (unlikely(nr_running > nr_latency)) {
+               period = sysctl_sched_min_granularity;
+               period *= nr_running;
+       }
+
+       return period;
+}
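Putting numbers on __sched_period(), assuming the common pre-scaling defaults of a 6 ms target latency and 0.75 ms minimum granularity (both get scaled by CPU count at boot, so treat the figures as illustrative):

#include <stdio.h>

int main(void)
{
        unsigned long long latency  = 6000000ULL;       /* 6 ms in ns  */
        unsigned long long min_gran =  750000ULL;       /* 0.75 ms     */
        unsigned long nr_latency = latency / min_gran;  /* 8 tasks     */

        for (unsigned long nr = 4; nr <= 16; nr += 4) {
                unsigned long long p = (nr <= nr_latency) ?
                                latency : min_gran * nr;
                printf("nr_running=%2lu  period=%llu ns\n", nr, p);
        }
        return 0;
}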
+
+/*
+ * We calculate the wall-time slice from the period by taking a part
+ * proportional to the weight.
+ *
+ * s = p*P[w/rw]
+ */
+static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+       u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
+
+       for_each_sched_entity(se) {
+               struct load_weight *load;
+               struct load_weight lw;
+
+               cfs_rq = cfs_rq_of(se);
+               load = &cfs_rq->load;
+
+               if (unlikely(!se->on_rq)) {
+                       lw = cfs_rq->load;
+
+                       update_load_add(&lw, se->load.weight);
+                       load = &lw;
+               }
+               slice = calc_delta_mine(slice, se->load.weight, load);
+       }
+       return slice;
+}
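sched_slice() then hands each entity a weight-proportional share of that period. With a hypothetical two-task runqueue holding a nice-0 task (weight 1024) and a nice +5 task (weight 335, per the usual prio_to_weight table; treat the exact figures as illustrative), a 6 ms period splits roughly 4.5 ms / 1.5 ms:

#include <stdio.h>

int main(void)
{
        unsigned long long period = 6000000ULL;         /* 6 ms            */
        unsigned long w[2] = { 1024, 335 };             /* nice 0, nice +5 */
        unsigned long rw = w[0] + w[1];

        for (int i = 0; i < 2; i++)
                printf("task %d slice = %llu ns\n", i, period * w[i] / rw);
        return 0;
}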
+
+/*
+ * We calculate the vruntime slice of a to-be-inserted task
+ *
+ * vs = s/w
+ */
+static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+       return calc_delta_fair(sched_slice(cfs_rq, se), se);
+}
+
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
+static void update_cfs_shares(struct cfs_rq *cfs_rq);
+
+/*
+ * Update the current task's runtime statistics. Skip current tasks that
+ * are not in our scheduling class.
+ */
+static inline void
+__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
+             unsigned long delta_exec)
+{
+       unsigned long delta_exec_weighted;
+
+       schedstat_set(curr->statistics.exec_max,
+                     max((u64)delta_exec, curr->statistics.exec_max));
+
+       curr->sum_exec_runtime += delta_exec;
+       schedstat_add(cfs_rq, exec_clock, delta_exec);
+       delta_exec_weighted = calc_delta_fair(delta_exec, curr);
+
+       curr->vruntime += delta_exec_weighted;
+       update_min_vruntime(cfs_rq);
+
+#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+       cfs_rq->load_unacc_exec_time += delta_exec;
+#endif
+}
+
+static void update_curr(struct cfs_rq *cfs_rq)
+{
+       struct sched_entity *curr = cfs_rq->curr;
+       u64 now = rq_of(cfs_rq)->clock_task;
+       unsigned long delta_exec;
+
+       if (unlikely(!curr))
+               return;
+
+       /*
+        * Get the amount of time the current task was running
+        * since the last time we changed load (this cannot
+        * overflow on 32 bits):
+        */
+       delta_exec = (unsigned long)(now - curr->exec_start);
+       if (!delta_exec)
+               return;
+
+       __update_curr(cfs_rq, curr, delta_exec);
+       curr->exec_start = now;
+
+       if (entity_is_task(curr)) {
+               struct task_struct *curtask = task_of(curr);
+
+               trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
+               cpuacct_charge(curtask, delta_exec);
+               account_group_exec_runtime(curtask, delta_exec);
+       }
+
+       account_cfs_rq_runtime(cfs_rq, delta_exec);
+}
+
+static inline void
+update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+       schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
+}
+
+/*
+ * Task is being enqueued - update stats:
+ */
+static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+       /*
+        * Are we enqueueing a waiting task? (for current tasks
+        * a dequeue/enqueue event is a NOP)
+        */
+       if (se != cfs_rq->curr)
+               update_stats_wait_start(cfs_rq, se);
+}
+
+static void
+update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+       schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
+                       rq_of(cfs_rq)->clock - se->statistics.wait_start));
+       schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
+       schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
+                       rq_of(cfs_rq)->clock - se->statistics.wait_start);
+#ifdef CONFIG_SCHEDSTATS
+       if (entity_is_task(se)) {
+               trace_sched_stat_wait(task_of(se),
+                       rq_of(cfs_rq)->clock - se->statistics.wait_start);
+       }
+#endif
+       schedstat_set(se->statistics.wait_start, 0);
+}
+
+static inline void
+update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+       /*
+        * Mark the end of the wait period if dequeueing a
+        * waiting task:
+        */
+       if (se != cfs_rq->curr)
+               update_stats_wait_end(cfs_rq, se);
+}
+
+/*
+ * We are picking a new current task - update its stats:
+ */
+static inline void
+update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+       /*
+        * We are starting a new run period:
+        */
+       se->exec_start = rq_of(cfs_rq)->clock_task;
+}
+
+/**************************************************
+ * Scheduling class queueing methods:
+ */
+
+#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+static void
+add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
+{
+       cfs_rq->task_weight += weight;
+}
+#else
+static inline void
+add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
+{
+}
+#endif
+
+static void
+account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+       update_load_add(&cfs_rq->load, se->load.weight);
+       if (!parent_entity(se))
+               update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
+       if (entity_is_task(se)) {
+               add_cfs_task_weight(cfs_rq, se->load.weight);
+               list_add(&se->group_node, &cfs_rq->tasks);
+       }
+       cfs_rq->nr_running++;
+}
+
+static void
+account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+       update_load_sub(&cfs_rq->load, se->load.weight);
+       if (!parent_entity(se))
+               update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
+       if (entity_is_task(se)) {
+               add_cfs_task_weight(cfs_rq, -se->load.weight);
+               list_del_init(&se->group_node);
+       }
+       cfs_rq->nr_running--;
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/* we need this in update_cfs_load and load-balance functions below */
+static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
+# ifdef CONFIG_SMP
+static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
+                                           int global_update)
+{
+       struct task_group *tg = cfs_rq->tg;
+       long load_avg;
+
+       load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
+       load_avg -= cfs_rq->load_contribution;
+
+       if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
+               atomic_add(load_avg, &tg->load_weight);
+               cfs_rq->load_contribution += load_avg;
+       }
+}
+
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
+{
+       u64 period = sysctl_sched_shares_window;
+       u64 now, delta;
+       unsigned long load = cfs_rq->load.weight;
+
+       if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq))
+               return;
+
+       now = rq_of(cfs_rq)->clock_task;
+       delta = now - cfs_rq->load_stamp;
+
+       /* truncate load history at 4 idle periods */
+       if (cfs_rq->load_stamp > cfs_rq->load_last &&
+           now - cfs_rq->load_last > 4 * period) {
+               cfs_rq->load_period = 0;
+               cfs_rq->load_avg = 0;
+               delta = period - 1;
+       }
+
+       cfs_rq->load_stamp = now;
+       cfs_rq->load_unacc_exec_time = 0;
+       cfs_rq->load_period += delta;
+       if (load) {
+               cfs_rq->load_last = now;
+               cfs_rq->load_avg += delta * load;
+       }
+
+       /* consider updating load contribution on each fold or truncate */
+       if (global_update || cfs_rq->load_period > period
+           || !cfs_rq->load_period)
+               update_cfs_rq_load_contribution(cfs_rq, global_update);
+
+       while (cfs_rq->load_period > period) {
+               /*
+                * Inline assembly required to prevent the compiler
+                * optimising this loop into a divmod call.
+                * See __iter_div_u64_rem() for another example of this.
+                */
+               asm("" : "+rm" (cfs_rq->load_period));
+               cfs_rq->load_period /= 2;
+               cfs_rq->load_avg /= 2;
+       }
+
+       if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
+               list_del_leaf_cfs_rq(cfs_rq);
+}
+
+static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
+{
+       long tg_weight;
+
+       /*
+        * Use this CPU's actual weight instead of the last load_contribution
+        * to gain a more accurate current total weight. See
+        * update_cfs_rq_load_contribution().
+        */
+       tg_weight = atomic_read(&tg->load_weight);
+       tg_weight -= cfs_rq->load_contribution;
+       tg_weight += cfs_rq->load.weight;
+
+       return tg_weight;
+}
+
+static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
+{
+       long tg_weight, load, shares;
+
+       tg_weight = calc_tg_weight(tg, cfs_rq);
+       load = cfs_rq->load.weight;
+
+       shares = (tg->shares * load);
+       if (tg_weight)
+               shares /= tg_weight;
+
+       if (shares < MIN_SHARES)
+               shares = MIN_SHARES;
+       if (shares > tg->shares)
+               shares = tg->shares;
+
+       return shares;
+}
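A worked example for calc_cfs_shares(), with made-up numbers: a group configured with tg->shares = 1024 whose runqueue on this CPU carries 2048 out of the group's total weight of 8192 receives 1024 * 2048 / 8192 = 256 here, clamped between MIN_SHARES and tg->shares (MIN_SHARES taken as 2 in the sketch):

#include <stdio.h>

#define MIN_SHARES 2    /* assumed lower clamp */

static long calc_shares(long tg_shares, long cfs_load, long tg_weight)
{
        long shares = tg_shares * cfs_load;

        if (tg_weight)
                shares /= tg_weight;
        if (shares < MIN_SHARES)
                shares = MIN_SHARES;
        if (shares > tg_shares)
                shares = tg_shares;
        return shares;
}

int main(void)
{
        printf("per-cpu shares = %ld\n", calc_shares(1024, 2048, 8192));
        return 0;
}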
+
+static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
+{
+       if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
+               update_cfs_load(cfs_rq, 0);
+               update_cfs_shares(cfs_rq);
+       }
+}
+# else /* CONFIG_SMP */
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
+{
+}
+
+static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
+{
+       return tg->shares;
+}
+
+static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
+{
+}
+# endif /* CONFIG_SMP */
+static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
+                           unsigned long weight)
+{
+       if (se->on_rq) {
+               /* commit outstanding execution time */
+               if (cfs_rq->curr == se)
+                       update_curr(cfs_rq);
+               account_entity_dequeue(cfs_rq, se);
+       }
+
+       update_load_set(&se->load, weight);
+
+       if (se->on_rq)
+               account_entity_enqueue(cfs_rq, se);
+}
+
+static void update_cfs_shares(struct cfs_rq *cfs_rq)
+{
+       struct task_group *tg;
+       struct sched_entity *se;
+       long shares;
+
+       tg = cfs_rq->tg;
+       se = tg->se[cpu_of(rq_of(cfs_rq))];
+       if (!se || throttled_hierarchy(cfs_rq))
+               return;
+#ifndef CONFIG_SMP
+       if (likely(se->load.weight == tg->shares))
+               return;
+#endif
+       shares = calc_cfs_shares(cfs_rq, tg);
+
+       reweight_entity(cfs_rq_of(se), se, shares);
+}
+#else /* CONFIG_FAIR_GROUP_SCHED */
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
+{
+}
+
+static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
+{
+}
+
+static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
+{
+}
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
+static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+#ifdef CONFIG_SCHEDSTATS
+       struct task_struct *tsk = NULL;
+
+       if (entity_is_task(se))
+               tsk = task_of(se);
+
+       if (se->statistics.sleep_start) {
+               u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;
+
+               if ((s64)delta < 0)
+                       delta = 0;
+
+               if (unlikely(delta > se->statistics.sleep_max))
+                       se->statistics.sleep_max = delta;
+
+               se->statistics.sum_sleep_runtime += delta;
+
+               if (tsk) {
+                       account_scheduler_latency(tsk, delta >> 10, 1);
+                       trace_sched_stat_sleep(tsk, delta);
+               }
+       }
+       if (se->statistics.block_start) {
+               u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;
+
+               if ((s64)delta < 0)
+                       delta = 0;
+
+               if (unlikely(delta > se->statistics.block_max))
+                       se->statistics.block_max = delta;
+
+               se->statistics.sum_sleep_runtime += delta;
+
+               if (tsk) {
+                       if (tsk->in_iowait) {
+                               se->statistics.iowait_sum += delta;
+                               se->statistics.iowait_count++;
+                               trace_sched_stat_iowait(tsk, delta);
+                       }
+
+                       trace_sched_stat_blocked(tsk, delta);
+
+                       /*
+                        * Blocking time is in units of nanosecs, so shift by
+                        * 20 to get a milliseconds-range estimation of the
+                        * amount of time that the task spent sleeping:
+                        */
+                       if (unlikely(prof_on == SLEEP_PROFILING)) {
+                               profile_hits(SLEEP_PROFILING,
+                                               (void *)get_wchan(tsk),
+                                               delta >> 20);
+                       }
+                       account_scheduler_latency(tsk, delta >> 10, 0);
+               }
+       }
+#endif
+}
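The delta >> 10 and delta >> 20 conversions above trade accuracy for speed: they approximate ns-to-us and ns-to-ms by dividing by 1024 and 1048576 instead of 1000 and 1000000. A two-line check:

#include <stdio.h>

int main(void)
{
        unsigned long long delta = 5000000ULL;  /* 5 ms spent blocked */

        printf("~us: %llu  ~ms: %llu\n", delta >> 10, delta >> 20);
        return 0;
}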
+
+static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+#ifdef CONFIG_SCHED_DEBUG
+       s64 d = se->vruntime - cfs_rq->min_vruntime;
+
+       if (d < 0)
+               d = -d;
+
+       if (d > 3*sysctl_sched_latency)
+               schedstat_inc(cfs_rq, nr_spread_over);
+#endif
+}
+
+static void
+place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
+{
+       u64 vruntime = cfs_rq->min_vruntime;
+
+       /*
+        * The 'current' period is already promised to the current tasks;
+        * however, the extra weight of the new task will slow them down a
+        * little, so place the new task so that it fits in the slot that
+        * stays open at the end.
+        */
+       if (initial && sched_feat(START_DEBIT))
+               vruntime += sched_vslice(cfs_rq, se);
+
+       /* sleeps up to a single latency don't count. */
+       if (!initial) {
+               unsigned long thresh = sysctl_sched_latency;
+
+               /*
+                * Halve their sleep time's effect, to allow
+                * for a gentler effect of sleepers:
+                */
+               if (sched_feat(GENTLE_FAIR_SLEEPERS))
+                       thresh >>= 1;
+
+               vruntime -= thresh;
+       }
+
+       /* ensure we never gain time by being placed backwards. */
+       vruntime = max_vruntime(se->vruntime, vruntime);
+
+       se->vruntime = vruntime;
+}
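Putting hypothetical numbers on place_entity(): with min_vruntime at 10 ms and a 6 ms latency, a forked task under START_DEBIT is pushed ahead by one vslice (say 1.5 ms), while a waking sleeper is credited at most half a latency period under GENTLE_FAIR_SLEEPERS and is never placed behind its own stale vruntime:

#include <stdio.h>

int main(void)
{
        unsigned long long min_vr  = 10000000ULL;   /* cfs_rq->min_vruntime  */
        unsigned long long latency =  6000000ULL;   /* 6 ms                  */
        unsigned long long vslice  =  1500000ULL;   /* made-up sched_vslice  */
        unsigned long long old_vr  =  9000000ULL;   /* sleeper's stale value */

        unsigned long long forked = min_vr + vslice;        /* START_DEBIT   */
        unsigned long long woken  = min_vr - latency / 2;   /* gentle credit */

        if (woken < old_vr)     /* max_vruntime(): never move backwards */
                woken = old_vr;

        printf("fork placed at %llu ns, wakeup placed at %llu ns\n",
               forked, woken);
        return 0;
}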
+
+static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
+
+static void
+enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+{
+       /*
+        * Update the normalized vruntime before updating min_vruntime
+        * through calling update_curr().
+        */
+       if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
+               se->vruntime += cfs_rq->min_vruntime;
+
+       /*
+        * Update run-time statistics of the 'current'.
+        */
+       update_curr(cfs_rq);
+       update_cfs_load(cfs_rq, 0);
+       account_entity_enqueue(cfs_rq, se);
+       update_cfs_shares(cfs_rq);
+
+       if (flags & ENQUEUE_WAKEUP) {
+               place_entity(cfs_rq, se, 0);
+               enqueue_sleeper(cfs_rq, se);
+       }
+
+       update_stats_enqueue(cfs_rq, se);
+       check_spread(cfs_rq, se);
+       if (se != cfs_rq->curr)
+               __enqueue_entity(cfs_rq, se);
+       se->on_rq = 1;
+
+       if (cfs_rq->nr_running == 1) {
+               list_add_leaf_cfs_rq(cfs_rq);
+               check_enqueue_throttle(cfs_rq);
+       }
+}
+
+static void __clear_buddies_last(struct sched_entity *se)
+{
+       for_each_sched_entity(se) {
+               struct cfs_rq *cfs_rq = cfs_rq_of(se);
+               if (cfs_rq->last == se)
+                       cfs_rq->last = NULL;
+               else
+                       break;
+       }
+}
+
+static void __clear_buddies_next(struct sched_entity *se)
+{
+       for_each_sched_entity(se) {
+               struct cfs_rq *cfs_rq = cfs_rq_of(se);
+               if (cfs_rq->next == se)
+                       cfs_rq->next = NULL;
+               else
+                       break;
+       }
+}
+
+static void __clear_buddies_skip(struct sched_entity *se)
+{
+       for_each_sched_entity(se) {
+               struct cfs_rq *cfs_rq = cfs_rq_of(se);
+               if (cfs_rq->skip == se)
+                       cfs_rq->skip = NULL;
+               else
+                       break;
+       }
+}
+
+static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+       if (cfs_rq->last == se)
+               __clear_buddies_last(se);
+
+       if (cfs_rq->next == se)
+               __clear_buddies_next(se);
+
+       if (cfs_rq->skip == se)
+               __clear_buddies_skip(se);
+}
+
+static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
+
+static void
+dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+{
+       /*
+        * Update run-time statistics of the 'current'.
+        */
+       update_curr(cfs_rq);
+
+       update_stats_dequeue(cfs_rq, se);
+       if (flags & DEQUEUE_SLEEP) {
+#ifdef CONFIG_SCHEDSTATS
+               if (entity_is_task(se)) {
+                       struct task_struct *tsk = task_of(se);
+
+                       if (tsk->state & TASK_INTERRUPTIBLE)
+                               se->statistics.sleep_start = rq_of(cfs_rq)->clock;
+                       if (tsk->state & TASK_UNINTERRUPTIBLE)
+                               se->statistics.block_start = rq_of(cfs_rq)->clock;
+               }
+#endif
+       }
+
+       clear_buddies(cfs_rq, se);
+
+       if (se != cfs_rq->curr)
+               __dequeue_entity(cfs_rq, se);
+       se->on_rq = 0;
+       update_cfs_load(cfs_rq, 0);
+       account_entity_dequeue(cfs_rq, se);
+
+       /*
+        * Normalize the entity after updating the min_vruntime because the
+        * update can refer to the ->curr item and we need to reflect this
+        * movement in our normalized position.
+        */
+       if (!(flags & DEQUEUE_SLEEP))
+               se->vruntime -= cfs_rq->min_vruntime;
+
+       /* return excess runtime on last dequeue */
+       return_cfs_rq_runtime(cfs_rq);
+
+       update_min_vruntime(cfs_rq);
+       update_cfs_shares(cfs_rq);
+}
+
+/*
+ * Preempt the current task with a newly woken task if needed:
+ */
+static void
+check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
+{
+       unsigned long ideal_runtime, delta_exec;
+       struct sched_entity *se;
+       s64 delta;
+
+       ideal_runtime = sched_slice(cfs_rq, curr);
+       delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
+       if (delta_exec > ideal_runtime) {
+               resched_task(rq_of(cfs_rq)->curr);
+               /*
+                * The current task ran long enough; ensure it doesn't get
+                * re-elected due to buddy favours.
+                */
+               clear_buddies(cfs_rq, curr);
+               return;
+       }
+
+       /*
+        * Ensure that a task that missed wakeup preemption by a
+        * narrow margin doesn't have to wait for a full slice.
+        * This also mitigates buddy induced latencies under load.
+        */
+       if (delta_exec < sysctl_sched_min_granularity)
+               return;
+
+       se = __pick_first_entity(cfs_rq);
+       delta = curr->vruntime - se->vruntime;
+
+       if (delta < 0)
+               return;
+
+       if (delta > ideal_runtime)
+               resched_task(rq_of(cfs_rq)->curr);
+}
+
+static void
+set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+       /* 'current' is not kept within the tree. */
+       if (se->on_rq) {
+               /*
+                * Any task has to be enqueued before it gets to execute on
+                * a CPU. So account for the time it spent waiting on the
+                * runqueue.
+                */
+               update_stats_wait_end(cfs_rq, se);
+               __dequeue_entity(cfs_rq, se);
+       }
+
+       update_stats_curr_start(cfs_rq, se);
+       cfs_rq->curr = se;
+#ifdef CONFIG_SCHEDSTATS
+       /*
+        * Track our maximum slice length, if the CPU's load is at
+        * least twice that of our own weight (i.e. don't track it
+        * when there are only lesser-weight tasks around):
+        */
+       if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
+               se->statistics.slice_max = max(se->statistics.slice_max,
+                       se->sum_exec_runtime - se->prev_sum_exec_runtime);
+       }
+#endif
+       se->prev_sum_exec_runtime = se->sum_exec_runtime;
+}
+
+static int
+wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
+
+/*
+ * Pick the next process, keeping these things in mind, in this order:
+ * 1) keep things fair between processes/task groups
+ * 2) pick the "next" process, since someone really wants that to run
+ * 3) pick the "last" process, for cache locality
+ * 4) do not run the "skip" process, if something else is available
+ */
+static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
+{
+       struct sched_entity *se = __pick_first_entity(cfs_rq);
+       struct sched_entity *left = se;
+
+       /*
+        * Avoid running the skip buddy if running something else can
+        * be done without getting too unfair.
+        */
+       if (cfs_rq->skip == se) {
+               struct sched_entity *second = __pick_next_entity(se);
+               if (second && wakeup_preempt_entity(second, left) < 1)
+                       se = second;
+       }
+
+       /*
+        * Prefer last buddy, try to return the CPU to a preempted task.
+        */
+       if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
+               se = cfs_rq->last;
+
+       /*
+        * Someone really wants this to run. If it's not unfair, run it.
+        */
+       if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
+               se = cfs_rq->next;
+
+       clear_buddies(cfs_rq, se);
+
+       return se;
+}
+
+static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
+
+static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
+{
+       /*
+        * If still on the runqueue then deactivate_task()
+        * was not called and update_curr() has to be done:
+        */
+       if (prev->on_rq)
+               update_curr(cfs_rq);
+
+       /* throttle cfs_rqs exceeding runtime */
+       check_cfs_rq_runtime(cfs_rq);
+
+       check_spread(cfs_rq, prev);
+       if (prev->on_rq) {
+               update_stats_wait_start(cfs_rq, prev);
+               /* Put 'current' back into the tree. */
+               __enqueue_entity(cfs_rq, prev);
+       }
+       cfs_rq->curr = NULL;
+}
+
+static void
+entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
+{
+       /*
+        * Update run-time statistics of the 'current'.
+        */
+       update_curr(cfs_rq);
+
+       /*
+        * Update share accounting for long-running entities.
+        */
+       update_entity_shares_tick(cfs_rq);
+
+#ifdef CONFIG_SCHED_HRTICK
+       /*
+        * queued ticks are scheduled to match the slice, so don't bother
+        * validating it and just reschedule.
+        */
+       if (queued) {
+               resched_task(rq_of(cfs_rq)->curr);
+               return;
+       }
+       /*
+        * don't let the period tick interfere with the hrtick preemption
+        */
+       if (!sched_feat(DOUBLE_TICK) &&
+                       hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
+               return;
+#endif
+
+       if (cfs_rq->nr_running > 1)
+               check_preempt_tick(cfs_rq, curr);
+}
+
+
+/**************************************************
+ * CFS bandwidth control machinery
+ */
+
+#ifdef CONFIG_CFS_BANDWIDTH
+
+#ifdef HAVE_JUMP_LABEL
+static struct jump_label_key __cfs_bandwidth_used;
+
+static inline bool cfs_bandwidth_used(void)
+{
+       return static_branch(&__cfs_bandwidth_used);
+}
+
+void account_cfs_bandwidth_used(int enabled, int was_enabled)
+{
+       /* only need to count groups transitioning between enabled/!enabled */
+       if (enabled && !was_enabled)
+               jump_label_inc(&__cfs_bandwidth_used);
+       else if (!enabled && was_enabled)
+               jump_label_dec(&__cfs_bandwidth_used);
+}
+#else /* HAVE_JUMP_LABEL */
+static bool cfs_bandwidth_used(void)
+{
+       return true;
+}
+
+void account_cfs_bandwidth_used(int enabled, int was_enabled) {}
+#endif /* HAVE_JUMP_LABEL */
+
+/*
+ * default period for cfs group bandwidth.
+ * default: 0.1s, units: nanoseconds
+ */
+static inline u64 default_cfs_period(void)
+{
+       return 100000000ULL;
+}
+
+static inline u64 sched_cfs_bandwidth_slice(void)
+{
+       return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
+}
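For orientation: the quota/period pair (exposed to cgroups via cpu.cfs_quota_us and cpu.cfs_period_us) caps a group at quota/period of one CPU per period, so 25 ms of quota against the 100 ms default period above is a 25% cap, handed out to runqueues in slices of sysctl_sched_cfs_bandwidth_slice. A trivial back-of-the-envelope helper (not from the patch):

#include <stdio.h>

int main(void)
{
        long long period_us = 100000;   /* 100 ms, the default above   */
        long long quota_us  = 25000;    /* 25 ms of runtime per period */

        printf("cap = %.0f%% of one CPU per period\n",
               100.0 * (double)quota_us / (double)period_us);
        return 0;
}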
+
+/*
+ * Replenish runtime according to assigned quota and update expiration time.
+ * We use sched_clock_cpu directly instead of rq->clock to avoid adding
+ * additional synchronization around rq->lock.
+ *
+ * requires cfs_b->lock
+ */
+void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
+{
+       u64 now;
+
+       if (cfs_b->quota == RUNTIME_INF)
+               return;
+
+       now = sched_clock_cpu(smp_processor_id());
+       cfs_b->runtime = cfs_b->quota;
+       cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
+}
+
+static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
+{
+       return &tg->cfs_bandwidth;
+}
+
+/* returns 0 on failure to allocate runtime */
+static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+       struct task_group *tg = cfs_rq->tg;
+       struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
+       u64 amount = 0, min_amount, expires;
+
+       /* note: this is a positive sum as runtime_remaining <= 0 */
+       min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
+
+       raw_spin_lock(&cfs_b->lock);
+       if (cfs_b->quota == RUNTIME_INF)
+               amount = min_amount;
+       else {
+               /*
+                * If the bandwidth pool has become inactive, then at least one
+                * period must have elapsed since the last consumption.
+                * Refresh the global state and ensure the bandwidth timer
+                * becomes active.
+                */
+               if (!cfs_b->timer_active) {
+                       __refill_cfs_bandwidth_runtime(cfs_b);
+                       __start_cfs_bandwidth(cfs_b);
+               }
+
+               if (cfs_b->runtime > 0) {
+                       amount = min(cfs_b->runtime, min_amount);
+                       cfs_b->runtime -= amount;
+                       cfs_b->idle = 0;
+               }
+       }
+       expires = cfs_b->runtime_expires;
+       raw_spin_unlock(&cfs_b->lock);
+
+       cfs_rq->runtime_remaining += amount;
+       /*
+        * we may have advanced our local expiration to account for allowed
+        * spread between our sched_clock and the one on which runtime was
+        * issued.
+        */
+       if ((s64)(expires - cfs_rq->runtime_expires) > 0)
+               cfs_rq->runtime_expires = expires;
+
+       return cfs_rq->runtime_remaining > 0;
+}
+
+/*
+ * Note: This depends on the synchronization provided by sched_clock and the
+ * fact that rq->clock snapshots this value.
+ */
+static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+       struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+       struct rq *rq = rq_of(cfs_rq);
+
+       /* if the deadline is ahead of our clock, nothing to do */
+       if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0))
+               return;
+
+       if (cfs_rq->runtime_remaining < 0)
+               return;
+
+       /*
+        * If the local deadline has passed we have to consider the
+        * possibility that our sched_clock is 'fast' and the global deadline
+        * has not truly expired.
+        *
+        * Fortunately we can determine whether this is the case by checking
+        * whether the global deadline has advanced.
+        */
+
+       if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
+               /* extend local deadline, drift is bounded above by 2 ticks */
+               cfs_rq->runtime_expires += TICK_NSEC;
+       } else {
+               /* global deadline is ahead, expiration has passed */
+               cfs_rq->runtime_remaining = 0;
+       }
+}
+
+static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
+                                    unsigned long delta_exec)
+{
+       /* dock delta_exec before expiring quota (as it could span periods) */
+       cfs_rq->runtime_remaining -= delta_exec;
+       expire_cfs_rq_runtime(cfs_rq);
+
+       if (likely(cfs_rq->runtime_remaining > 0))
+               return;
+
+       /*
+        * if we're unable to extend our runtime we resched so that the active
+        * hierarchy can be throttled
+        */
+       if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
+               resched_task(rq_of(cfs_rq)->curr);
+}
+
+static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
+                                                  unsigned long delta_exec)
+{
+       if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
+               return;
+
+       __account_cfs_rq_runtime(cfs_rq, delta_exec);
+}
+
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
+{
+       return cfs_bandwidth_used() && cfs_rq->throttled;
+}
+
+/* check whether cfs_rq, or any parent, is throttled */
+static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
+{
+       return cfs_bandwidth_used() && cfs_rq->throttle_count;
+}
+
+/*
+ * Ensure that neither of the group entities corresponding to src_cpu or
+ * dest_cpu are members of a throttled hierarchy when performing group
+ * load-balance operations.
+ */
+static inline int throttled_lb_pair(struct task_group *tg,
+                                   int src_cpu, int dest_cpu)
+{
+       struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
+
+       src_cfs_rq = tg->cfs_rq[src_cpu];
+       dest_cfs_rq = tg->cfs_rq[dest_cpu];
+
+       return throttled_hierarchy(src_cfs_rq) ||
+              throttled_hierarchy(dest_cfs_rq);
+}
+
+/* updated child weight may affect parent so we have to do this bottom up */
+static int tg_unthrottle_up(struct task_group *tg, void *data)
+{
+       struct rq *rq = data;
+       struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+
+       cfs_rq->throttle_count--;
+#ifdef CONFIG_SMP
+       if (!cfs_rq->throttle_count) {
+               u64 delta = rq->clock_task - cfs_rq->load_stamp;
+
+               /* leaving throttled state, advance shares averaging windows */
+               cfs_rq->load_stamp += delta;
+               cfs_rq->load_last += delta;
+
+               /* update entity weight now that we are on_rq again */
+               update_cfs_shares(cfs_rq);
+       }
+#endif
+
+       return 0;
+}
+
+static int tg_throttle_down(struct task_group *tg, void *data)
+{
+       struct rq *rq = data;
+       struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+
+       /* group is entering throttled state, record last load */
+       if (!cfs_rq->throttle_count)
+               update_cfs_load(cfs_rq, 0);
+       cfs_rq->throttle_count++;
+
+       return 0;
+}
+
+static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
+{
+       struct rq *rq = rq_of(cfs_rq);
+       struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+       struct sched_entity *se;
+       long task_delta, dequeue = 1;
+
+       se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
+
+       /* account load preceding throttle */
+       rcu_read_lock();
+       walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
+       rcu_read_unlock();
+
+       task_delta = cfs_rq->h_nr_running;
+       for_each_sched_entity(se) {
+               struct cfs_rq *qcfs_rq = cfs_rq_of(se);
+               /* throttled entity or throttle-on-deactivate */
+               if (!se->on_rq)
+                       break;
+
+               if (dequeue)
+                       dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
+               qcfs_rq->h_nr_running -= task_delta;
+
+               if (qcfs_rq->load.weight)
+                       dequeue = 0;
+       }
+
+       if (!se)
+               rq->nr_running -= task_delta;
+
+       cfs_rq->throttled = 1;
+       cfs_rq->throttled_timestamp = rq->clock;
+       raw_spin_lock(&cfs_b->lock);
+       list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+       raw_spin_unlock(&cfs_b->lock);
+}
+
+void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
+{
+       struct rq *rq = rq_of(cfs_rq);
+       struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+       struct sched_entity *se;
+       int enqueue = 1;
+       long task_delta;
+
+       se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
+
+       cfs_rq->throttled = 0;
+       raw_spin_lock(&cfs_b->lock);
+       cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp;
+       list_del_rcu(&cfs_rq->throttled_list);
+       raw_spin_unlock(&cfs_b->lock);
+       cfs_rq->throttled_timestamp = 0;
+
+       update_rq_clock(rq);
+       /* update hierarchical throttle state */
+       walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
+
+       if (!cfs_rq->load.weight)
+               return;
+
+       task_delta = cfs_rq->h_nr_running;
+       for_each_sched_entity(se) {
+               if (se->on_rq)
+                       enqueue = 0;
+
+               cfs_rq = cfs_rq_of(se);
+               if (enqueue)
+                       enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
+               cfs_rq->h_nr_running += task_delta;
+
+               if (cfs_rq_throttled(cfs_rq))
+                       break;
+       }
+
+       if (!se)
+               rq->nr_running += task_delta;
+
+       /* determine whether we need to wake up potentially idle cpu */
+       if (rq->curr == rq->idle && rq->cfs.nr_running)
+               resched_task(rq->curr);
+}
+
+static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
+               u64 remaining, u64 expires)
+{
+       struct cfs_rq *cfs_rq;
+       u64 runtime = remaining;
+
+       rcu_read_lock();
+       list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
+                               throttled_list) {
+               struct rq *rq = rq_of(cfs_rq);
+
+               raw_spin_lock(&rq->lock);
+               if (!cfs_rq_throttled(cfs_rq))
+                       goto next;
+
+               runtime = -cfs_rq->runtime_remaining + 1;
+               if (runtime > remaining)
+                       runtime = remaining;
+               remaining -= runtime;
+
+               cfs_rq->runtime_remaining += runtime;
+               cfs_rq->runtime_expires = expires;
+
+               /* we check whether we're throttled above */
+               if (cfs_rq->runtime_remaining > 0)
+                       unthrottle_cfs_rq(cfs_rq);
+
+next:
+               raw_spin_unlock(&rq->lock);
+
+               if (!remaining)
+                       break;
+       }
+       rcu_read_unlock();
+
+       return remaining;
+}
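A toy model of distribute_cfs_runtime() with hypothetical deficits: each throttled cfs_rq is topped up to just above zero, in list order, until the freshly refilled pool runs dry:

#include <stdio.h>

int main(void)
{
        long long deficits[3] = { -300000, -2000000, -150000 };  /* ns   */
        long long pool = 1000000;                                /* 1 ms */

        for (int i = 0; i < 3 && pool; i++) {
                long long want = -deficits[i] + 1;      /* reach +1 ns   */
                long long give = want < pool ? want : pool;

                deficits[i] += give;
                pool -= give;
                printf("cfs_rq %d gets %lld ns, pool now %lld ns\n",
                       i, give, pool);
        }
        return 0;
}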
+
+/*
+ * Responsible for refilling a task_group's bandwidth and unthrottling its
+ * cfs_rqs as appropriate. If there has been no activity within the last
+ * period the timer is deactivated until scheduling resumes; cfs_b->idle is
+ * used to track this state.
+ */
+static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
+{
+       u64 runtime, runtime_expires;
+       int idle = 1, throttled;
+
+       raw_spin_lock(&cfs_b->lock);
+       /* no need to continue the timer with no bandwidth constraint */
+       if (cfs_b->quota == RUNTIME_INF)
+               goto out_unlock;
+
+       throttled = !list_empty(&cfs_b->throttled_cfs_rq);
+       /* idle depends on !throttled (for the case of a large deficit) */
+       idle = cfs_b->idle && !throttled;
+       cfs_b->nr_periods += overrun;
+
+       /* if we're going inactive then everything else can be deferred */
+       if (idle)
+               goto out_unlock;
+
+       __refill_cfs_bandwidth_runtime(cfs_b);
+
+       if (!throttled) {
+               /* mark as potentially idle for the upcoming period */
+               cfs_b->idle = 1;
+               goto out_unlock;
+       }
+
+       /* account preceding periods in which throttling occurred */
+       cfs_b->nr_throttled += overrun;
+
+       /*
+        * There are throttled entities so we must first use the new bandwidth
+        * to unthrottle them before making it generally available.  This
+        * ensures that all existing debts will be paid before a new cfs_rq is
+        * allowed to run.
+        */
+       runtime = cfs_b->runtime;
+       runtime_expires = cfs_b->runtime_expires;
+       cfs_b->runtime = 0;
+
+       /*
+        * This check is repeated as we are holding onto the new bandwidth
+        * while we unthrottle.  This can potentially race with an unthrottled
+        * group trying to acquire new bandwidth from the global pool.
+        */
+       while (throttled && runtime > 0) {
+               raw_spin_unlock(&cfs_b->lock);
+               /* we can't nest cfs_b->lock while distributing bandwidth */
+               runtime = distribute_cfs_runtime(cfs_b, runtime,
+                                                runtime_expires);
+               raw_spin_lock(&cfs_b->lock);
+
+               throttled = !list_empty(&cfs_b->throttled_cfs_rq);
+       }
+
+       /* return (any) remaining runtime */
+       cfs_b->runtime = runtime;
+       /*
+        * While we are ensured activity in the period following an
+        * unthrottle, this also covers the case in which the new bandwidth is
+        * insufficient to cover the existing bandwidth deficit.  (Forcing the
+        * timer to remain active while there are any throttled entities.)
+        */
+       cfs_b->idle = 0;
+out_unlock:
+       if (idle)
+               cfs_b->timer_active = 0;
+       raw_spin_unlock(&cfs_b->lock);
+
+       return idle;
+}
+
+/* a cfs_rq won't donate quota below this amount */
+static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
+/* minimum remaining period time to redistribute slack quota */
+static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
+/* how long we wait to gather additional slack before distributing */
+static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
+
+/* are we near the end of the current quota period? */
+static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
+{
+       struct hrtimer *refresh_timer = &cfs_b->period_timer;
+       u64 remaining;
+
+       /* if the call-back is running, a quota refresh is already occurring */
+       if (hrtimer_callback_running(refresh_timer))
+               return 1;
+
+       /* is a quota refresh about to occur? */
+       remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
+       if (remaining < min_expire)
+               return 1;
+
+       return 0;
+}
+
+static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
+{
+       u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
+
+       /* if there's a quota refresh soon don't bother with slack */
+       if (runtime_refresh_within(cfs_b, min_left))
+               return;
+
+       start_bandwidth_timer(&cfs_b->slack_timer,
+                               ns_to_ktime(cfs_bandwidth_slack_period));
+}
+
+/* we know any runtime found here is valid as update_curr() precedes return */
+static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+       struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+       s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
+
+       if (slack_runtime <= 0)
+               return;
+
+       raw_spin_lock(&cfs_b->lock);
+       if (cfs_b->quota != RUNTIME_INF &&
+           cfs_rq->runtime_expires == cfs_b->runtime_expires) {
+               cfs_b->runtime += slack_runtime;
+
+               /* we are under rq->lock, defer unthrottling using a timer */
+               if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
+                   !list_empty(&cfs_b->throttled_cfs_rq))
+                       start_cfs_slack_bandwidth(cfs_b);
+       }
+       raw_spin_unlock(&cfs_b->lock);
+
+       /* even if it's not valid for return we don't want to try again */
+       cfs_rq->runtime_remaining -= slack_runtime;
+}
+
+static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+       if (!cfs_bandwidth_used())
+               return;
+
+       if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
+               return;
+
+       __return_cfs_rq_runtime(cfs_rq);
+}
+
+/*
+ * This is done with a timer (instead of inline with bandwidth return) since
+ * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
+ */
+static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
+{
+       u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
+       u64 expires;
+
+       /* confirm we're still not at a refresh boundary */
+       if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
+               return;
+
+       raw_spin_lock(&cfs_b->lock);
+       if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
+               runtime = cfs_b->runtime;
+               cfs_b->runtime = 0;
+       }
+       expires = cfs_b->runtime_expires;
+       raw_spin_unlock(&cfs_b->lock);
+
+       if (!runtime)
+               return;
+
+       runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
+
+       raw_spin_lock(&cfs_b->lock);
+       if (expires == cfs_b->runtime_expires)
+               cfs_b->runtime = runtime;
+       raw_spin_unlock(&cfs_b->lock);
+}
+
+/*
+ * When a group wakes up we want to make sure that its quota is not already
+ * expired/exceeded; otherwise it may be allowed to steal additional ticks of
+ * runtime, as update_curr() throttling cannot trigger until it's on-rq.
+ */
+static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
+{
+       if (!cfs_bandwidth_used())
+               return;
+
+       /* an active group must be handled by the update_curr()->put() path */
+       if (!cfs_rq->runtime_enabled || cfs_rq->curr)
+               return;
+
+       /* ensure the group is not already throttled */
+       if (cfs_rq_throttled(cfs_rq))
+               return;
+
+       /* update runtime allocation */
+       account_cfs_rq_runtime(cfs_rq, 0);
+       if (cfs_rq->runtime_remaining <= 0)
+               throttle_cfs_rq(cfs_rq);
+}
+
+/* conditionally throttle active cfs_rq's from put_prev_entity() */
+static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+       if (!cfs_bandwidth_used())
+               return;
+
+       if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
+               return;
+
+       /*
+        * it's possible for a throttled entity to be forced into a running
+        * state (e.g. set_curr_task); in this case we're finished.
+        */
+       if (cfs_rq_throttled(cfs_rq))
+               return;
+
+       throttle_cfs_rq(cfs_rq);
+}
+
+static inline u64 default_cfs_period(void);
+static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
+static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
+
+static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
+{
+       struct cfs_bandwidth *cfs_b =
+               container_of(timer, struct cfs_bandwidth, slack_timer);
+       do_sched_cfs_slack_timer(cfs_b);
+
+       return HRTIMER_NORESTART;
+}
+
+static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
+{
+       struct cfs_bandwidth *cfs_b =
+               container_of(timer, struct cfs_bandwidth, period_timer);
+       ktime_t now;
+       int overrun;
+       int idle = 0;
+
+       for (;;) {
+               now = hrtimer_cb_get_time(timer);
+               overrun = hrtimer_forward(timer, now, cfs_b->period);
+
+               if (!overrun)
+                       break;
+
+               idle = do_sched_cfs_period_timer(cfs_b, overrun);
+       }
+
+       return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
+}
+
+void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+{
+       raw_spin_lock_init(&cfs_b->lock);
+       cfs_b->runtime = 0;
+       cfs_b->quota = RUNTIME_INF;
+       cfs_b->period = ns_to_ktime(default_cfs_period());
+
+       INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
+       hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+       cfs_b->period_timer.function = sched_cfs_period_timer;
+       hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+       cfs_b->slack_timer.function = sched_cfs_slack_timer;
+}
+
+static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+       cfs_rq->runtime_enabled = 0;
+       INIT_LIST_HEAD(&cfs_rq->throttled_list);
+}
+
+/* requires cfs_b->lock, may release to reprogram timer */
+void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+{
+       /*
+        * The timer may be active because we're trying to set a new bandwidth
+        * period or because we're racing with the tear-down path
+        * (timer_active==0 becomes visible before the hrtimer call-back
+        * terminates).  In either case we ensure that it's re-programmed.
+        */
+       while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
+               raw_spin_unlock(&cfs_b->lock);
+               /* ensure cfs_b->lock is available while we wait */
+               hrtimer_cancel(&cfs_b->period_timer);
+
+               raw_spin_lock(&cfs_b->lock);
+               /* if someone else restarted the timer then we're done */
+               if (cfs_b->timer_active)
+                       return;
+       }
+
+       cfs_b->timer_active = 1;
+       start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
+}
+
+static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+{
+       hrtimer_cancel(&cfs_b->period_timer);
+       hrtimer_cancel(&cfs_b->slack_timer);
+}
+
+void unthrottle_offline_cfs_rqs(struct rq *rq)
+{
+       struct cfs_rq *cfs_rq;
+
+       for_each_leaf_cfs_rq(rq, cfs_rq) {
+               struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+
+               if (!cfs_rq->runtime_enabled)
+                       continue;
+
+               /*
+                * clock_task is not advancing so we just need to make sure
+                * there's some valid quota amount
+                */
+               cfs_rq->runtime_remaining = cfs_b->quota;
+               if (cfs_rq_throttled(cfs_rq))
+                       unthrottle_cfs_rq(cfs_rq);
+       }
+}
+
+#else /* CONFIG_CFS_BANDWIDTH */
+static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
+                                    unsigned long delta_exec) {}
+static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
+static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
+static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
+
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
+{
+       return 0;
+}
+
+static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
+{
+       return 0;
+}
+
+static inline int throttled_lb_pair(struct task_group *tg,
+                                   int src_cpu, int dest_cpu)
+{
+       return 0;
+}
+
+void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
+#endif
+
+static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
+{
+       return NULL;
+}
+static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
+void unthrottle_offline_cfs_rqs(struct rq *rq) {}
+
+#endif /* CONFIG_CFS_BANDWIDTH */
+
+/**************************************************
+ * CFS operations on tasks:
+ */
+
+#ifdef CONFIG_SCHED_HRTICK
+static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
+{
+       struct sched_entity *se = &p->se;
+       struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+       WARN_ON(task_rq(p) != rq);
+
+       if (cfs_rq->nr_running > 1) {
+               u64 slice = sched_slice(cfs_rq, se);
+               u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
+               s64 delta = slice - ran;
+
+               if (delta < 0) {
+                       if (rq->curr == p)
+                               resched_task(p);
+                       return;
+               }
+
+               /*
+                * Don't schedule slices shorter than 10000ns; that just
+                * doesn't make sense. Rely on vruntime for fairness.
+                */
+               if (rq->curr != p)
+                       delta = max_t(s64, 10000LL, delta);
+
+               hrtick_start(rq, delta);
+       }
+}
+
+/*
+ * called from enqueue/dequeue and updates the hrtick when the
+ * current task is from our class and nr_running is low enough
+ * to matter.
+ */
+static void hrtick_update(struct rq *rq)
+{
+       struct task_struct *curr = rq->curr;
+
+       if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
+               return;
+
+       if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
+               hrtick_start_fair(rq, curr);
+}
+#else /* !CONFIG_SCHED_HRTICK */
+static inline void
+hrtick_start_fair(struct rq *rq, struct task_struct *p)
+{
+}
+
+static inline void hrtick_update(struct rq *rq)
+{
+}
+#endif
+
+/*
+ * The enqueue_task method is called before nr_running is
+ * increased. Here we update the fair scheduling stats and
+ * then put the task into the rbtree:
+ */
+static void
+enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
+{
+       struct cfs_rq *cfs_rq;
+       struct sched_entity *se = &p->se;
+
+       for_each_sched_entity(se) {
+               if (se->on_rq)
+                       break;
+               cfs_rq = cfs_rq_of(se);
+               enqueue_entity(cfs_rq, se, flags);
+
+               /*
+                * end evaluation on encountering a throttled cfs_rq
+                *
+                * note: in the case of encountering a throttled cfs_rq we will
+                * post the final h_nr_running increment below.
+                */
+               if (cfs_rq_throttled(cfs_rq))
+                       break;
+               cfs_rq->h_nr_running++;
+
+               flags = ENQUEUE_WAKEUP;
+       }
+
+       for_each_sched_entity(se) {
+               cfs_rq = cfs_rq_of(se);
+               cfs_rq->h_nr_running++;
+
+               if (cfs_rq_throttled(cfs_rq))
+                       break;
+
+               update_cfs_load(cfs_rq, 0);
+               update_cfs_shares(cfs_rq);
+       }
+
+       if (!se)
+               inc_nr_running(rq);
+       hrtick_update(rq);
+}
+
+static void set_next_buddy(struct sched_entity *se);
+
+/*
+ * The dequeue_task method is called before nr_running is
+ * decreased. We remove the task from the rbtree and
+ * update the fair scheduling stats:
+ */
+static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
+{
+       struct cfs_rq *cfs_rq;
+       struct sched_entity *se = &p->se;
+       int task_sleep = flags & DEQUEUE_SLEEP;
+
+       for_each_sched_entity(se) {
+               cfs_rq = cfs_rq_of(se);
+               dequeue_entity(cfs_rq, se, flags);
+
+               /*
+                * end evaluation on encountering a throttled cfs_rq
+                *
+                * note: in the case of encountering a throttled cfs_rq we will
+                * post the final h_nr_running decrement below.
+                */
+               if (cfs_rq_throttled(cfs_rq))
+                       break;
+               cfs_rq->h_nr_running--;
+
+               /* Don't dequeue parent if it has other entities besides us */
+               if (cfs_rq->load.weight) {
+                       /*
+                        * Bias pick_next to pick a task from this cfs_rq, as
+                        * p is sleeping when it is within its sched_slice.
+                        */
+                       if (task_sleep && parent_entity(se))
+                               set_next_buddy(parent_entity(se));
+
+                       /* avoid re-evaluating load for this entity */
+                       se = parent_entity(se);
+                       break;
+               }
+               flags |= DEQUEUE_SLEEP;
+       }
+
+       for_each_sched_entity(se) {
+               cfs_rq = cfs_rq_of(se);
+               cfs_rq->h_nr_running--;
+
+               if (cfs_rq_throttled(cfs_rq))
+                       break;
+
+               update_cfs_load(cfs_rq, 0);
+               update_cfs_shares(cfs_rq);
+       }
+
+       if (!se)
+               dec_nr_running(rq);
+       hrtick_update(rq);
+}
+
+#ifdef CONFIG_SMP
+/* Used instead of source_load when we know the type == 0 */
+static unsigned long weighted_cpuload(const int cpu)
+{
+       return cpu_rq(cpu)->load.weight;
+}
+
+/*
+ * Return a low guess at the load of a migration-source cpu weighted
+ * according to the scheduling class and "nice" value.
+ *
+ * We want to under-estimate the load of migration sources, to
+ * balance conservatively.
+ */
+static unsigned long source_load(int cpu, int type)
+{
+       struct rq *rq = cpu_rq(cpu);
+       unsigned long total = weighted_cpuload(cpu);
+
+       if (type == 0 || !sched_feat(LB_BIAS))
+               return total;
+
+       return min(rq->cpu_load[type-1], total);
+}
+
+/*
+ * Return a high guess at the load of a migration-target cpu weighted
+ * according to the scheduling class and "nice" value.
+ */
+static unsigned long target_load(int cpu, int type)
+{
+       struct rq *rq = cpu_rq(cpu);
+       unsigned long total = weighted_cpuload(cpu);
+
+       if (type == 0 || !sched_feat(LB_BIAS))
+               return total;
+
+       return max(rq->cpu_load[type-1], total);
+}
+
+static unsigned long power_of(int cpu)
+{
+       return cpu_rq(cpu)->cpu_power;
+}
+
+static unsigned long cpu_avg_load_per_task(int cpu)
+{
+       struct rq *rq = cpu_rq(cpu);
+       unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
+
+       if (nr_running)
+               return rq->load.weight / nr_running;
+
+       return 0;
+}
+
+
+static void task_waking_fair(struct task_struct *p)
+{
+       struct sched_entity *se = &p->se;
+       struct cfs_rq *cfs_rq = cfs_rq_of(se);
+       u64 min_vruntime;
+
+#ifndef CONFIG_64BIT
+       u64 min_vruntime_copy;
+
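+       /*
+        * On 32-bit the 64-bit min_vruntime cannot be read atomically, so
+        * retry until the value and the copy written by the update side
+        * agree, i.e. until we have observed a consistent snapshot.
+        */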
+       do {
+               min_vruntime_copy = cfs_rq->min_vruntime_copy;
+               smp_rmb();
+               min_vruntime = cfs_rq->min_vruntime;
+       } while (min_vruntime != min_vruntime_copy);
+#else
+       min_vruntime = cfs_rq->min_vruntime;
+#endif
+
+       se->vruntime -= min_vruntime;
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * effective_load() calculates the load change as seen from the root_task_group
+ *
+ * Adding load to a group doesn't make a group heavier, but can cause movement
+ * of group shares between cpus. Assuming the shares were perfectly aligned one
+ * can calculate the shift in shares.
+ *
+ * Calculate the effective load difference if @wl is added (subtracted) to @tg
+ * on this @cpu and results in a total addition (subtraction) of @wg to the
+ * total group weight.
+ *
+ * Given a runqueue weight distribution (rw_i) we can compute a shares
+ * distribution (s_i) using:
+ *
+ *   s_i = rw_i / \Sum rw_j                                            (1)
+ *
+ * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
+ * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
+ * shares distribution (s_i):
+ *
+ *   rw_i = {   2,   4,   1,   0 }
+ *   s_i  = { 2/7, 4/7, 1/7,   0 }
+ *
+ * As per wake_affine() we're interested in the load of two CPUs (the CPU the
+ * task used to run on and the CPU the waker is running on), we need to
+ * compute the effect of waking a task on either CPU and, in case of a sync
+ * wakeup, compute the effect of the current task going to sleep.
+ *
+ * So for a change of @wl to the local @cpu with an overall group weight change
+ * of @wl we can compute the new shares distribution (s'_i) using:
+ *
+ *   s'_i = (rw_i + @wl) / (@wg + \Sum rw_j)                           (2)
+ *
+ * Suppose we're interested in CPUs 0 and 1, and want to compute the load
+ * differences in waking a task to CPU 0. The additional task changes the
+ * weight and shares distributions like:
+ *
+ *   rw'_i = {   3,   4,   1,   0 }
+ *   s'_i  = { 3/8, 4/8, 1/8,   0 }
+ *
+ * We can then compute the difference in effective weight by using:
+ *
+ *   dw_i = S * (s'_i - s_i)                                           (3)
+ *
+ * Where 'S' is the group weight as seen by its parent.
+ *
+ * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
+ * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
+ * 4/7) times the weight of the group.
+ */
+static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
+{
+       struct sched_entity *se = tg->se[cpu];
+
+       if (!tg->parent)        /* the trivial, non-cgroup case */
+               return wl;
+
+       for_each_sched_entity(se) {
+               long w, W;
+
+               tg = se->my_q->tg;
+
+               /*
+                * W = @wg + \Sum rw_j
+                */
+               W = wg + calc_tg_weight(tg, se->my_q);
+
+               /*
+                * w = rw_i + @wl
+                */
+               w = se->my_q->load.weight + wl;
+
+               /*
+                * wl = S * s'_i; see (2)
+                */
+               if (W > 0 && w < W)
+                       wl = (w * tg->shares) / W;
+               else
+                       wl = tg->shares;
+
+               /*
+                * Per the above, wl is the new se->load.weight value; since
+                * those are clipped to [MIN_SHARES, ...) do so now. See
+                * calc_cfs_shares().
+                */
+               if (wl < MIN_SHARES)
+                       wl = MIN_SHARES;
+
+               /*
+                * wl = dw_i = S * (s'_i - s_i); see (3)
+                */
+               wl -= se->load.weight;
+
+               /*
+                * Recursively apply this logic to all parent groups to compute
+                * the final effective load change on the root group. Since
+                * only the @tg group gets extra weight, all parent groups can
+                * only redistribute existing shares. @wl is the shift in shares
+                * resulting from this level per the above.
+                */
+               wg = 0;
+       }
+
+       return wl;
+}
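+
+/*
+ * Worked check of the example above (editorial illustration, not part of
+ * the original text): taking the parent-visible group weight S = 1024,
+ * equation (3) gives
+ *
+ *   dw_0 = 1024 * (3/8 - 2/7) = 1024 *  5/56 ~=  91
+ *   dw_1 = 1024 * (4/8 - 4/7) = 1024 * -4/56 ~= -73
+ *
+ * i.e. waking the extra task on CPU 0 shifts roughly 9% of the group's
+ * parent-level weight onto CPU 0 and takes roughly 7% away from CPU 1.
+ */
+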
+#else
+
+static inline unsigned long effective_load(struct task_group *tg, int cpu,
+               unsigned long wl, unsigned long wg)
+{
+       return wl;
+}
+
+#endif
+
+static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
+{
+       s64 this_load, load;
+       int idx, this_cpu, prev_cpu;
+       unsigned long tl_per_task;
+       struct task_group *tg;
+       unsigned long weight;
+       int balanced;
+
+       idx       = sd->wake_idx;
+       this_cpu  = smp_processor_id();
+       prev_cpu  = task_cpu(p);
+       load      = source_load(prev_cpu, idx);
+       this_load = target_load(this_cpu, idx);
+
+       /*
+        * If sync wakeup then subtract the (maximum possible)
+        * effect of the currently running task from the load
+        * of the current CPU:
+        */
+       if (sync) {
+               tg = task_group(current);
+               weight = current->se.load.weight;
+
+               this_load += effective_load(tg, this_cpu, -weight, -weight);
+               load += effective_load(tg, prev_cpu, 0, -weight);
+       }
+
+       tg = task_group(p);
+       weight = p->se.load.weight;
+
+       /*
+        * In low-load situations, where prev_cpu is idle and this_cpu is idle
+        * due to the sync cause above having dropped this_load to 0, we'll
+        * always have an imbalance, but there's really nothing you can do
+        * about that, so that's good too.
+        *
+        * Otherwise check if either cpus are near enough in load to allow this
+        * task to be woken on this_cpu.
+        */
+       if (this_load > 0) {
+               s64 this_eff_load, prev_eff_load;
+
+               this_eff_load = 100;
+               this_eff_load *= power_of(prev_cpu);
+               this_eff_load *= this_load +
+                       effective_load(tg, this_cpu, weight, weight);
+
+               prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
+               prev_eff_load *= power_of(this_cpu);
+               prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
+
+               balanced = this_eff_load <= prev_eff_load;
+       } else
+               balanced = true;
+
+       /*
+        * If the currently running task will sleep within
+        * a reasonable amount of time then attract this newly
+        * woken task:
+        */
+       if (sync && balanced)
+               return 1;
+
+       schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
+       tl_per_task = cpu_avg_load_per_task(this_cpu);
+
+       if (balanced ||
+           (this_load <= load &&
+            this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
+               /*
+                * This domain has SD_WAKE_AFFINE and
+                * p is cache cold in this domain, and
+                * there is no bad imbalance.
+                */
+               schedstat_inc(sd, ttwu_move_affine);
+               schedstat_inc(p, se.statistics.nr_wakeups_affine);
+
+               return 1;
+       }
+       return 0;
+}
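+
+/*
+ * Editorial illustration (an imbalance_pct of 125 is assumed, not taken
+ * from the original text): prev_eff_load is weighted by
+ * 100 + (125 - 100) / 2 = 112 against 100 for this_eff_load, so after
+ * cross-scaling by the two CPUs' cpu_power an affine wakeup still counts
+ * as "balanced" when the load this CPU would carry exceeds the load left
+ * on prev_cpu by up to ~12%.
+ */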
+
+/*
+ * find_idlest_group finds and returns the least busy CPU group within the
+ * domain.
+ */
+static struct sched_group *
+find_idlest_group(struct sched_domain *sd, struct task_struct *p,
+                 int this_cpu, int load_idx)
+{
+       struct sched_group *idlest = NULL, *group = sd->groups;
+       unsigned long min_load = ULONG_MAX, this_load = 0;
+       int imbalance = 100 + (sd->imbalance_pct-100)/2;
+
+       do {
+               unsigned long load, avg_load;
+               int local_group;
+               int i;
+
+               /* Skip over this group if it has no CPUs allowed */
+               if (!cpumask_intersects(sched_group_cpus(group),
+                                       tsk_cpus_allowed(p)))
+                       continue;
+
+               local_group = cpumask_test_cpu(this_cpu,
+                                              sched_group_cpus(group));
+
+               /* Tally up the load of all CPUs in the group */
+               avg_load = 0;
+
+               for_each_cpu(i, sched_group_cpus(group)) {
+                       /* Bias balancing toward cpus of our domain */
+                       if (local_group)
+                               load = source_load(i, load_idx);
+                       else
+                               load = target_load(i, load_idx);
+
+                       avg_load += load;
+               }
+
+               /* Adjust by relative CPU power of the group */
+               avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
+
+               if (local_group) {
+                       this_load = avg_load;
+               } else if (avg_load < min_load) {
+                       min_load = avg_load;
+                       idlest = group;
+               }
+       } while (group = group->next, group != sd->groups);
+
+       if (!idlest || 100*this_load < imbalance*min_load)
+               return NULL;
+       return idlest;
+}
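+
+/*
+ * Example of the cut-off above (editorial illustration, imbalance_pct of
+ * 125 assumed): the margin works out to 100 + (125 - 100) / 2 = 112, so a
+ * remote group is only returned when the local group's load is at least
+ * 12% higher than the remote group's minimum load; otherwise NULL is
+ * returned and selection stays within the local group.
+ */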
+
+/*
+ * find_idlest_cpu - find the idlest cpu among the cpus in group.
+ */
+static int
+find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
+{
+       unsigned long load, min_load = ULONG_MAX;
+       int idlest = -1;
+       int i;
+
+       /* Traverse only the allowed CPUs */
+       for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
+               load = weighted_cpuload(i);
+
+               if (load < min_load || (load == min_load && i == this_cpu)) {
+                       min_load = load;
+                       idlest = i;
+               }
+       }
+
+       return idlest;
+}
+
+/*
+ * Try and locate an idle CPU in the sched_domain.
+ */
+static int select_idle_sibling(struct task_struct *p, int target)
+{
+       int cpu = smp_processor_id();
+       int prev_cpu = task_cpu(p);
+       struct sched_domain *sd;
+       struct sched_group *sg;
+       int i;
+
+       /*
+        * If the task is going to be woken-up on this cpu and if it is
+        * already idle, then it is the right target.
+        */
+       if (target == cpu && idle_cpu(cpu))
+               return cpu;
+
+       /*
+        * If the task is going to be woken-up on the cpu where it previously
+        * ran and if it is currently idle, then it is the right target.
+        */
+       if (target == prev_cpu && idle_cpu(prev_cpu))
+               return prev_cpu;
+
+       /*
+        * Otherwise, iterate the domains and find an eligible idle cpu.
+        */
+       rcu_read_lock();
+
+       sd = rcu_dereference(per_cpu(sd_llc, target));
+       for_each_lower_domain(sd) {
+               sg = sd->groups;
+               do {
+                       if (!cpumask_intersects(sched_group_cpus(sg),
+                                               tsk_cpus_allowed(p)))
+                               goto next;
+
+                       for_each_cpu(i, sched_group_cpus(sg)) {
+                               if (!idle_cpu(i))
+                                       goto next;
+                       }
+
+                       target = cpumask_first_and(sched_group_cpus(sg),
+                                       tsk_cpus_allowed(p));
+                       goto done;
+next:
+                       sg = sg->next;
+               } while (sg != sd->groups);
+       }
+done:
+       rcu_read_unlock();
+
+       return target;
+}
+
+/*
+ * select_task_rq_fair: balance the current task (running on cpu) in domains
+ * that have the relevant sd_flag set. In practice, this is SD_BALANCE_FORK and
+ * SD_BALANCE_EXEC.
+ *
+ * Balance, ie. select the least loaded group.
+ *
+ * Returns the target CPU number, or the same CPU if no balancing is needed.
+ *
+ * preempt must be disabled.
+ */
+static int
+select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
+{
+       struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
+       int cpu = smp_processor_id();
+       int prev_cpu = task_cpu(p);
+       int new_cpu = cpu;
+       int want_affine = 0;
+       int want_sd = 1;
+       int sync = wake_flags & WF_SYNC;
+
+       if (p->rt.nr_cpus_allowed == 1)
+               return prev_cpu;
+
+       if (sd_flag & SD_BALANCE_WAKE) {
+               if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
+                       want_affine = 1;
+               new_cpu = prev_cpu;
+       }
+
+       rcu_read_lock();
+       for_each_domain(cpu, tmp) {
+               if (!(tmp->flags & SD_LOAD_BALANCE))
+                       continue;
+
+               /*
+                * If power savings logic is enabled for a domain, see if we
+                * are not overloaded; if so, don't balance wider.
+                */
+               if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) {
+                       unsigned long power = 0;
+                       unsigned long nr_running = 0;
+                       unsigned long capacity;
+                       int i;
+
+                       for_each_cpu(i, sched_domain_span(tmp)) {
+                               power += power_of(i);
+                               nr_running += cpu_rq(i)->cfs.nr_running;
+                       }
+
+                       capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
+
+                       if (tmp->flags & SD_POWERSAVINGS_BALANCE)
+                               nr_running /= 2;
+
+                       if (nr_running < capacity)
+                               want_sd = 0;
+               }
+
+               /*
+                * If both cpu and prev_cpu are part of this domain,
+                * cpu is a valid SD_WAKE_AFFINE target.
+                */
+               if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
+                   cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
+                       affine_sd = tmp;
+                       want_affine = 0;
+               }
+
+               if (!want_sd && !want_affine)
+                       break;
+
+               if (!(tmp->flags & sd_flag))
+                       continue;
+
+               if (want_sd)
+                       sd = tmp;
+       }
+
+       if (affine_sd) {
+               if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
+                       prev_cpu = cpu;
+
+               new_cpu = select_idle_sibling(p, prev_cpu);
+               goto unlock;
+       }
+
+       while (sd) {
+               int load_idx = sd->forkexec_idx;
+               struct sched_group *group;
+               int weight;
+
+               if (!(sd->flags & sd_flag)) {
+                       sd = sd->child;
+                       continue;
+               }
+
+               if (sd_flag & SD_BALANCE_WAKE)
+                       load_idx = sd->wake_idx;
+
+               group = find_idlest_group(sd, p, cpu, load_idx);
+               if (!group) {
+                       sd = sd->child;
+                       continue;
+               }
+
+               new_cpu = find_idlest_cpu(group, p, cpu);
+               if (new_cpu == -1 || new_cpu == cpu) {
+                       /* Now try balancing at a lower domain level of cpu */
+                       sd = sd->child;
+                       continue;
+               }
+
+               /* Now try balancing at a lower domain level of new_cpu */
+               cpu = new_cpu;
+               weight = sd->span_weight;
+               sd = NULL;
+               for_each_domain(cpu, tmp) {
+                       if (weight <= tmp->span_weight)
+                               break;
+                       if (tmp->flags & sd_flag)
+                               sd = tmp;
+               }
+               /* while loop will break here if sd == NULL */
+       }
+unlock:
+       rcu_read_unlock();
+
+       return new_cpu;
+}
+#endif /* CONFIG_SMP */
+
+static unsigned long
+wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
+{
+       unsigned long gran = sysctl_sched_wakeup_granularity;
+
+       /*
+        * Since it is curr that is running now, convert the gran from
+        * real-time to virtual-time in curr's units.
+        *
+        * By using 'se' instead of 'curr' we penalize light tasks, so
+        * they get preempted more easily. That is, if 'se' is lighter than
+        * 'curr' the resulting gran will be larger, penalizing the lighter
+        * 'se'; if, on the other hand, 'se' is heavier than 'curr' the
+        * resulting gran will be smaller, again penalizing the lighter
+        * task (now 'curr').
+        *
+        * This is especially important for buddies when the leftmost
+        * task is higher priority than the buddy.
+        */
+       return calc_delta_fair(gran, se);
+}
+
+/*
+ * Should 'se' preempt 'curr'.
+ *
+ *             |s1
+ *        |s2
+ *   |s3
+ *         g
+ *      |<--->|c
+ *
+ *  w(c, s1) = -1
+ *  w(c, s2) =  0
+ *  w(c, s3) =  1
+ *
+ */
+static int
+wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
+{
+       s64 gran, vdiff = curr->vruntime - se->vruntime;
+
+       if (vdiff <= 0)
+               return -1;
+
+       gran = wakeup_gran(curr, se);
+       if (vdiff > gran)
+               return 1;
+
+       return 0;
+}
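+
+/*
+ * Numerical illustration (not part of the original text): suppose the
+ * scaled wakeup granularity for 'se' comes out at 4ms of virtual time.
+ * If 'se' trails 'curr' by 5ms of vruntime the function returns 1
+ * (preempt), if it trails by only 3ms it returns 0, and if it is not
+ * behind at all it returns -1.  Because wakeup_gran() scales by 'se's
+ * weight, a task with half the nice-0 weight sees the granularity doubled
+ * and must be twice as far behind before it may preempt.
+ */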
+
+static void set_last_buddy(struct sched_entity *se)
+{
+       if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
+               return;
+
+       for_each_sched_entity(se)
+               cfs_rq_of(se)->last = se;
+}
+
+static void set_next_buddy(struct sched_entity *se)
+{
+       if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
+               return;
+
+       for_each_sched_entity(se)
+               cfs_rq_of(se)->next = se;
+}
+
+static void set_skip_buddy(struct sched_entity *se)
+{
+       for_each_sched_entity(se)
+               cfs_rq_of(se)->skip = se;
+}
+
+/*
+ * Preempt the current task with a newly woken task if needed:
+ */
+static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
+{
+       struct task_struct *curr = rq->curr;
+       struct sched_entity *se = &curr->se, *pse = &p->se;
+       struct cfs_rq *cfs_rq = task_cfs_rq(curr);
+       int scale = cfs_rq->nr_running >= sched_nr_latency;
+       int next_buddy_marked = 0;
+
+       if (unlikely(se == pse))
+               return;
+
+       /*
+        * This is possible from callers such as pull_task(), in which we
+        * unconditionally check_preempt_curr() after an enqueue (which may have
+        * led to a throttle).  This both saves work and prevents false
+        * next-buddy nomination below.
+        */
+       if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
+               return;
+
+       if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
+               set_next_buddy(pse);
+               next_buddy_marked = 1;
+       }
+
+       /*
+        * We can come here with TIF_NEED_RESCHED already set from new task
+        * wake up path.
+        *
+        * Note: this also catches the edge-case of curr being in a throttled
+        * group (e.g. via set_curr_task), since update_curr() (in the
+        * enqueue of curr) will have resulted in resched being set.  This
+        * prevents us from potentially nominating it as a false LAST_BUDDY
+        * below.
+        */
+       if (test_tsk_need_resched(curr))
+               return;
+
+       /* Idle tasks are by definition preempted by non-idle tasks. */
+       if (unlikely(curr->policy == SCHED_IDLE) &&
+           likely(p->policy != SCHED_IDLE))
+               goto preempt;
+
+       /*
+        * Batch and idle tasks do not preempt non-idle tasks (their preemption
+        * is driven by the tick):
+        */
+       if (unlikely(p->policy != SCHED_NORMAL))
+               return;
+
+       find_matching_se(&se, &pse);
+       update_curr(cfs_rq_of(se));
+       BUG_ON(!pse);
+       if (wakeup_preempt_entity(se, pse) == 1) {
+               /*
+                * Bias pick_next to pick the sched entity that is
+                * triggering this preemption.
+                */
+               if (!next_buddy_marked)
+                       set_next_buddy(pse);
+               goto preempt;
+       }
+
+       return;
+
+preempt:
+       resched_task(curr);
+       /*
+        * Only set the backward buddy when the current task is still
+        * on the rq. This can happen when a wakeup gets interleaved
+        * with schedule on the ->pre_schedule() or idle_balance()
+        * point, either of which can drop the rq lock.
+        *
+        * Also, during early boot the idle thread is in the fair class,
+        * for obvious reasons it's a bad idea to schedule back to it.
+        */
+       if (unlikely(!se->on_rq || curr == rq->idle))
+               return;
+
+       if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
+               set_last_buddy(se);
+}
+
+static struct task_struct *pick_next_task_fair(struct rq *rq)
+{
+       struct task_struct *p;
+       struct cfs_rq *cfs_rq = &rq->cfs;
+       struct sched_entity *se;
+
+       if (!cfs_rq->nr_running)
+               return NULL;
+
+       do {
+               se = pick_next_entity(cfs_rq);
+               set_next_entity(cfs_rq, se);
+               cfs_rq = group_cfs_rq(se);
+       } while (cfs_rq);
+
+       p = task_of(se);
+       if (hrtick_enabled(rq))
+               hrtick_start_fair(rq, p);
+
+       return p;
+}
+
+/*
+ * Account for a descheduled task:
+ */
+static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
+{
+       struct sched_entity *se = &prev->se;
+       struct cfs_rq *cfs_rq;
+
+       for_each_sched_entity(se) {
+               cfs_rq = cfs_rq_of(se);
+               put_prev_entity(cfs_rq, se);
+       }
+}
+
+/*
+ * sched_yield() is very simple
+ *
+ * The magic of dealing with the ->skip buddy is in pick_next_entity.
+ */
+static void yield_task_fair(struct rq *rq)
+{
+       struct task_struct *curr = rq->curr;
+       struct cfs_rq *cfs_rq = task_cfs_rq(curr);
+       struct sched_entity *se = &curr->se;
+
+       /*
+        * Are we the only task in the tree?
+        */
+       if (unlikely(rq->nr_running == 1))
+               return;
+
+       clear_buddies(cfs_rq, se);
+
+       if (curr->policy != SCHED_BATCH) {
+               update_rq_clock(rq);
+               /*
+                * Update run-time statistics of the 'current'.
+                */
+               update_curr(cfs_rq);
+               /*
+                * Tell update_rq_clock() that we've just updated,
+                * so we don't do microscopic update in schedule()
+                * and double the fastpath cost.
+                */
+               rq->skip_clock_update = 1;
+       }
+
+       set_skip_buddy(se);
+}
+
+static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
+{
+       struct sched_entity *se = &p->se;
+
+       /* throttled hierarchies are not runnable */
+       if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
+               return false;
+
+       /* Tell the scheduler that we'd really like pse to run next. */
+       set_next_buddy(se);
+
+       yield_task_fair(rq);
+
+       return true;
+}
+
+#ifdef CONFIG_SMP
+/**************************************************
+ * Fair scheduling class load-balancing methods:
+ */
+
+/*
+ * pull_task - move a task from a remote runqueue to the local runqueue.
+ * Both runqueues must be locked.
+ */
+static void pull_task(struct rq *src_rq, struct task_struct *p,
+                     struct rq *this_rq, int this_cpu)
+{
+       deactivate_task(src_rq, p, 0);
+       set_task_cpu(p, this_cpu);
+       activate_task(this_rq, p, 0);
+       check_preempt_curr(this_rq, p, 0);
+}
+
+/*
+ * Is this task likely cache-hot:
+ */
+static int
+task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
+{
+       s64 delta;
+
+       if (p->sched_class != &fair_sched_class)
+               return 0;
+
+       if (unlikely(p->policy == SCHED_IDLE))
+               return 0;
+
+       /*
+        * Buddy candidates are cache hot:
+        */
+       if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
+                       (&p->se == cfs_rq_of(&p->se)->next ||
+                        &p->se == cfs_rq_of(&p->se)->last))
+               return 1;
+
+       if (sysctl_sched_migration_cost == -1)
+               return 1;
+       if (sysctl_sched_migration_cost == 0)
+               return 0;
+
+       delta = now - p->se.exec_start;
+
+       return delta < (s64)sysctl_sched_migration_cost;
+}
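+
+/*
+ * Editorial note: assuming the usual 0.5ms default for
+ * sysctl_sched_migration_cost, a fair task that last executed within the
+ * past half millisecond is reported as cache-hot; next/last buddies are
+ * always hot, a sysctl value of -1 makes every task hot and 0 makes every
+ * task cold.
+ */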
+
+#define LBF_ALL_PINNED 0x01
+#define LBF_NEED_BREAK 0x02
+#define LBF_ABORT      0x04
+
+/*
+ * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
+ */
+static
+int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
+                    struct sched_domain *sd, enum cpu_idle_type idle,
+                    int *lb_flags)
+{
+       int tsk_cache_hot = 0;
+       /*
+        * We do not migrate tasks that are:
+        * 1) running (obviously), or
+        * 2) cannot be migrated to this CPU due to cpus_allowed, or
+        * 3) are cache-hot on their current CPU.
+        */
+       if (!cpumask_test_cpu(this_cpu, tsk_cpus_allowed(p))) {
+               schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
+               return 0;
+       }
+       *lb_flags &= ~LBF_ALL_PINNED;
+
+       if (task_running(rq, p)) {
+               schedstat_inc(p, se.statistics.nr_failed_migrations_running);
+               return 0;
+       }
+
+       /*
+        * Aggressive migration if:
+        * 1) task is cache cold, or
+        * 2) too many balance attempts have failed.
+        */
+
+       tsk_cache_hot = task_hot(p, rq->clock_task, sd);
+       if (!tsk_cache_hot ||
+               sd->nr_balance_failed > sd->cache_nice_tries) {
+#ifdef CONFIG_SCHEDSTATS
+               if (tsk_cache_hot) {
+                       schedstat_inc(sd, lb_hot_gained[idle]);
+                       schedstat_inc(p, se.statistics.nr_forced_migrations);
+               }
+#endif
+               return 1;
+       }
+
+       if (tsk_cache_hot) {
+               schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
+               return 0;
+       }
+       return 1;
+}
+
+/*
+ * move_one_task tries to move exactly one task from busiest to this_rq, as
+ * part of active balancing operations within "domain".
+ * Returns 1 if successful and 0 otherwise.
+ *
+ * Called with both runqueues locked.
+ */
+static int
+move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
+             struct sched_domain *sd, enum cpu_idle_type idle)
+{
+       struct task_struct *p, *n;
+       struct cfs_rq *cfs_rq;
+       int pinned = 0;
+
+       for_each_leaf_cfs_rq(busiest, cfs_rq) {
+               list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
+                       if (throttled_lb_pair(task_group(p),
+                                             busiest->cpu, this_cpu))
+                               break;
+
+                       if (!can_migrate_task(p, busiest, this_cpu,
+                                               sd, idle, &pinned))
+                               continue;
+
+                       pull_task(busiest, p, this_rq, this_cpu);
+                       /*
+                        * Right now, this is only the second place pull_task()
+                        * is called, so we can safely collect pull_task()
+                        * stats here rather than inside pull_task().
+                        */
+                       schedstat_inc(sd, lb_gained[idle]);
+                       return 1;
+               }
+       }
+
+       return 0;
+}
+
+static unsigned long
+balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
+             unsigned long max_load_move, struct sched_domain *sd,
+             enum cpu_idle_type idle, int *lb_flags,
+             struct cfs_rq *busiest_cfs_rq)
+{
+       int loops = 0, pulled = 0;
+       long rem_load_move = max_load_move;
+       struct task_struct *p, *n;
+
+       if (max_load_move == 0)
+               goto out;
+
+       list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
+               if (loops++ > sysctl_sched_nr_migrate) {
+                       *lb_flags |= LBF_NEED_BREAK;
+                       break;
+               }
+
+               if ((p->se.load.weight >> 1) > rem_load_move ||
+                   !can_migrate_task(p, busiest, this_cpu, sd, idle,
+                                     lb_flags))
+                       continue;
+
+               pull_task(busiest, p, this_rq, this_cpu);
+               pulled++;
+               rem_load_move -= p->se.load.weight;
+
+#ifdef CONFIG_PREEMPT
+               /*
+                * NEWIDLE balancing is a source of latency, so preemptible
+                * kernels will stop after the first task is pulled to minimize
+                * the critical section.
+                */
+               if (idle == CPU_NEWLY_IDLE) {
+                       *lb_flags |= LBF_ABORT;
+                       break;
+               }
+#endif
+
+               /*
+                * We only want to steal up to the prescribed amount of
+                * weighted load.
+                */
+               if (rem_load_move <= 0)
+                       break;
+       }
+out:
+       /*
+        * Right now, this is one of only two places pull_task() is called,
+        * so we can safely collect pull_task() stats here rather than
+        * inside pull_task().
+        */
+       schedstat_add(sd, lb_gained[idle], pulled);
+
+       return max_load_move - rem_load_move;
+}
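+
+/*
+ * Editorial note: sysctl_sched_nr_migrate (32 by default, exposed as
+ * /proc/sys/kernel/sched_nr_migrate) bounds how many tasks one call scans;
+ * once it is exceeded LBF_NEED_BREAK is set so the caller can back off
+ * rather than keep both runqueue locks held for a long scan.
+ */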
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * update tg->load_weight by folding this cpu's load_avg
+ */
+static int update_shares_cpu(struct task_group *tg, int cpu)
+{
+       struct cfs_rq *cfs_rq;
+       unsigned long flags;
+       struct rq *rq;
+
+       if (!tg->se[cpu])
+               return 0;
+
+       rq = cpu_rq(cpu);
+       cfs_rq = tg->cfs_rq[cpu];
+
+       raw_spin_lock_irqsave(&rq->lock, flags);
+
+       update_rq_clock(rq);
+       update_cfs_load(cfs_rq, 1);
+
+       /*
+        * We need to update shares after updating tg->load_weight in
+        * order to adjust the weight of groups with long running tasks.
+        */
+       update_cfs_shares(cfs_rq);
+
+       raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+       return 0;
+}
+
+static void update_shares(int cpu)
+{
+       struct cfs_rq *cfs_rq;
+       struct rq *rq = cpu_rq(cpu);
+
+       rcu_read_lock();
+       /*
+        * Iterates the task_group tree in a bottom up fashion, see
+        * list_add_leaf_cfs_rq() for details.
+        */
+       for_each_leaf_cfs_rq(rq, cfs_rq) {
+               /* throttled entities do not contribute to load */
+               if (throttled_hierarchy(cfs_rq))
+                       continue;
+
+               update_shares_cpu(cfs_rq->tg, cpu);
+       }
+       rcu_read_unlock();
+}
+
+/*
+ * Compute the cpu's hierarchical load factor for each task group.
+ * This needs to be done in a top-down fashion because the load of a child
+ * group is a fraction of its parent's load.
+ */
+static int tg_load_down(struct task_group *tg, void *data)
+{
+       unsigned long load;
+       long cpu = (long)data;
+
+       if (!tg->parent) {
+               load = cpu_rq(cpu)->load.weight;
+       } else {
+               load = tg->parent->cfs_rq[cpu]->h_load;
+               load *= tg->se[cpu]->load.weight;
+               load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
+       }
+
+       tg->cfs_rq[cpu]->h_load = load;
+
+       return 0;
+}
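+
+/*
+ * Worked example (editorial illustration): if the parent cfs_rq on this
+ * cpu has h_load = 2048 and this group's entity weighs 512 out of a parent
+ * queue weight of 1024, the child's hierarchical load becomes
+ *
+ *   h_load = 2048 * 512 / (1024 + 1) ~= 1023
+ *
+ * i.e. roughly half of the parent's contribution, mirroring the entity's
+ * share of the parent queue.
+ */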
+
+static void update_h_load(long cpu)
+{
+       walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
+}
+
+static unsigned long
+load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+                 unsigned long max_load_move,
+                 struct sched_domain *sd, enum cpu_idle_type idle,
+                 int *lb_flags)
+{
+       long rem_load_move = max_load_move;
+       struct cfs_rq *busiest_cfs_rq;
+
+       rcu_read_lock();
+       update_h_load(cpu_of(busiest));
+
+       for_each_leaf_cfs_rq(busiest, busiest_cfs_rq) {
+               unsigned long busiest_h_load = busiest_cfs_rq->h_load;
+               unsigned long busiest_weight = busiest_cfs_rq->load.weight;
+               u64 rem_load, moved_load;
+
+               if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT))
+                       break;
+
+               /*
+                * empty group or part of a throttled hierarchy
+                */
+               if (!busiest_cfs_rq->task_weight ||
+                   throttled_lb_pair(busiest_cfs_rq->tg, cpu_of(busiest), this_cpu))
+                       continue;
+
+               rem_load = (u64)rem_load_move * busiest_weight;
+               rem_load = div_u64(rem_load, busiest_h_load + 1);
+
+               moved_load = balance_tasks(this_rq, this_cpu, busiest,
+                               rem_load, sd, idle, lb_flags,
+                               busiest_cfs_rq);
+
+               if (!moved_load)
+                       continue;
+
+               moved_load *= busiest_h_load;
+               moved_load = div_u64(moved_load, busiest_weight + 1);
+
+               rem_load_move -= moved_load;
+               if (rem_load_move < 0)
+                       break;
+       }
+       rcu_read_unlock();
+
+       return max_load_move - rem_load_move;
+}
+#else
+static inline void update_shares(int cpu)
+{
+}
+
+static unsigned long
+load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+                 unsigned long max_load_move,
+                 struct sched_domain *sd, enum cpu_idle_type idle,
+                 int *lb_flags)
+{
+       return balance_tasks(this_rq, this_cpu, busiest,
+                       max_load_move, sd, idle, lb_flags,
+                       &busiest->cfs);
+}
+#endif
+
+/*
+ * move_tasks tries to move up to max_load_move weighted load from busiest to
+ * this_rq, as part of a balancing operation within domain "sd".
+ * Returns 1 if successful and 0 otherwise.
+ *
+ * Called with both runqueues locked.
+ */
+static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
+                     unsigned long max_load_move,
+                     struct sched_domain *sd, enum cpu_idle_type idle,
+                     int *lb_flags)
+{
+       unsigned long total_load_moved = 0, load_moved;
+
+       do {
+               load_moved = load_balance_fair(this_rq, this_cpu, busiest,
+                               max_load_move - total_load_moved,
+                               sd, idle, lb_flags);
+
+               total_load_moved += load_moved;
+
+               if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT))
+                       break;
+
+#ifdef CONFIG_PREEMPT
+               /*
+                * NEWIDLE balancing is a source of latency, so preemptible
+                * kernels will stop after the first task is pulled to minimize
+                * the critical section.
+                */
+               if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) {
+                       *lb_flags |= LBF_ABORT;
+                       break;
+               }
+#endif
+       } while (load_moved && max_load_move > total_load_moved);
+
+       return total_load_moved > 0;
+}
+
+/********** Helpers for find_busiest_group ************************/
+/*
+ * sd_lb_stats - Structure to store the statistics of a sched_domain
+ *             during load balancing.
+ */
+struct sd_lb_stats {
+       struct sched_group *busiest; /* Busiest group in this sd */
+       struct sched_group *this;  /* Local group in this sd */
+       unsigned long total_load;  /* Total load of all groups in sd */
+       unsigned long total_pwr;   /*   Total power of all groups in sd */
+       unsigned long avg_load;    /* Average load across all groups in sd */
+
+       /** Statistics of this group */
+       unsigned long this_load;
+       unsigned long this_load_per_task;
+       unsigned long this_nr_running;
+       unsigned long this_has_capacity;
+       unsigned int  this_idle_cpus;
+
+       /* Statistics of the busiest group */
+       unsigned int  busiest_idle_cpus;
+       unsigned long max_load;
+       unsigned long busiest_load_per_task;
+       unsigned long busiest_nr_running;
+       unsigned long busiest_group_capacity;
+       unsigned long busiest_has_capacity;
+       unsigned int  busiest_group_weight;
+
+       int group_imb; /* Is there an imbalance in this sd? */
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+       int power_savings_balance; /* Is powersave balance needed for this sd */
+       struct sched_group *group_min; /* Least loaded group in sd */
+       struct sched_group *group_leader; /* Group which relieves group_min */
+       unsigned long min_load_per_task; /* load_per_task in group_min */
+       unsigned long leader_nr_running; /* Nr running of group_leader */
+       unsigned long min_nr_running; /* Nr running of group_min */
+#endif
+};
+
+/*
+ * sg_lb_stats - stats of a sched_group required for load_balancing
+ */
+struct sg_lb_stats {
+       unsigned long avg_load; /* Avg load across the CPUs of the group */
+       unsigned long group_load; /* Total load over the CPUs of the group */
+       unsigned long sum_nr_running; /* Nr tasks running in the group */
+       unsigned long sum_weighted_load; /* Weighted load of group's tasks */
+       unsigned long group_capacity;
+       unsigned long idle_cpus;
+       unsigned long group_weight;
+       int group_imb; /* Is there an imbalance in the group? */
+       int group_has_capacity; /* Is there extra capacity in the group? */
+};
+
+/**
+ * get_sd_load_idx - Obtain the load index for a given sched domain.
+ * @sd: The sched_domain whose load_idx is to be obtained.
+ * @idle: The idle status of the CPU whose sd's load_idx is obtained.
+ */
+static inline int get_sd_load_idx(struct sched_domain *sd,
+                                       enum cpu_idle_type idle)
+{
+       int load_idx;
+
+       switch (idle) {
+       case CPU_NOT_IDLE:
+               load_idx = sd->busy_idx;
+               break;
+
+       case CPU_NEWLY_IDLE:
+               load_idx = sd->newidle_idx;
+               break;
+       default:
+               load_idx = sd->idle_idx;
+               break;
+       }
+
+       return load_idx;
+}
+
+
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+/**
+ * init_sd_power_savings_stats - Initialize power savings statistics for
+ * the given sched_domain, during load balancing.
+ *
+ * @sd: Sched domain whose power-savings statistics are to be initialized.
+ * @sds: Variable containing the statistics for sd.
+ * @idle: Idle status of the CPU at which we're performing load-balancing.
+ */
+static inline void init_sd_power_savings_stats(struct sched_domain *sd,
+       struct sd_lb_stats *sds, enum cpu_idle_type idle)
+{
+       /*
+        * Busy processors will not participate in power savings
+        * balance.
+        */
+       if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
+               sds->power_savings_balance = 0;
+       else {
+               sds->power_savings_balance = 1;
+               sds->min_nr_running = ULONG_MAX;
+               sds->leader_nr_running = 0;
+       }
+}
+
+/**
+ * update_sd_power_savings_stats - Update the power saving stats for a
+ * sched_domain while performing load balancing.
+ *
+ * @group: sched_group belonging to the sched_domain under consideration.
+ * @sds: Variable containing the statistics of the sched_domain
+ * @local_group: Does group contain the CPU for which we're performing
+ *             load balancing?
+ * @sgs: Variable containing the statistics of the group.
+ */
+static inline void update_sd_power_savings_stats(struct sched_group *group,
+       struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
+{
+
+       if (!sds->power_savings_balance)
+               return;
+
+       /*
+        * If the local group is idle or completely loaded, there is
+        * no need to do power savings balance at this domain.
+        */
+       if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
+                               !sds->this_nr_running))
+               sds->power_savings_balance = 0;
+
+       /*
+        * If a group is already running at full capacity or idle,
+        * don't include that group in power savings calculations
+        */
+       if (!sds->power_savings_balance ||
+               sgs->sum_nr_running >= sgs->group_capacity ||
+               !sgs->sum_nr_running)
+               return;
+
+       /*
+        * Calculate the group which has the least non-idle load.
+        * This is the group from which we need to pick up load
+        * in order to save power.
+        */
+       if ((sgs->sum_nr_running < sds->min_nr_running) ||
+           (sgs->sum_nr_running == sds->min_nr_running &&
+            group_first_cpu(group) > group_first_cpu(sds->group_min))) {
+               sds->group_min = group;
+               sds->min_nr_running = sgs->sum_nr_running;
+               sds->min_load_per_task = sgs->sum_weighted_load /
+                                               sgs->sum_nr_running;
+       }
+
+       /*
+        * Calculate the group which is nearly at its
+        * capacity but still has some space to pick up load
+        * from another group and save more power.
+        */
+       if (sgs->sum_nr_running + 1 > sgs->group_capacity)
+               return;
+
+       if (sgs->sum_nr_running > sds->leader_nr_running ||
+           (sgs->sum_nr_running == sds->leader_nr_running &&
+            group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
+               sds->group_leader = group;
+               sds->leader_nr_running = sgs->sum_nr_running;
+       }
+}
+
+/**
+ * check_power_save_busiest_group - see if there is potential for some power-savings balance
+ * @sds: Variable containing the statistics of the sched_domain
+ *     under consideration.
+ * @this_cpu: Cpu at which we're currently performing load-balancing.
+ * @imbalance: Variable to store the imbalance.
+ *
+ * Description:
+ * Check if we have potential to perform some power-savings balance.
+ * If yes, set the busiest group to be the least loaded group in the
+ * sched_domain, so that its CPUs can be put to idle.
+ *
+ * Returns 1 if there is potential to perform power-savings balance.
+ * Else returns 0.
+ */
+static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
+                                       int this_cpu, unsigned long *imbalance)
+{
+       if (!sds->power_savings_balance)
+               return 0;
+
+       if (sds->this != sds->group_leader ||
+                       sds->group_leader == sds->group_min)
+               return 0;
+
+       *imbalance = sds->min_load_per_task;
+       sds->busiest = sds->group_min;
+
+       return 1;
+
+}
+#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
+static inline void init_sd_power_savings_stats(struct sched_domain *sd,
+       struct sd_lb_stats *sds, enum cpu_idle_type idle)
+{
+       return;
+}
+
+static inline void update_sd_power_savings_stats(struct sched_group *group,
+       struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
+{
+       return;
+}
+
+static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
+                                       int this_cpu, unsigned long *imbalance)
+{
+       return 0;
+}
+#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
+
+
+unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
+{
+       return SCHED_POWER_SCALE;
+}
+
+unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
+{
+       return default_scale_freq_power(sd, cpu);
+}
+
+unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
+{
+       unsigned long weight = sd->span_weight;
+       unsigned long smt_gain = sd->smt_gain;
+
+       smt_gain /= weight;
+
+       return smt_gain;
+}
+
+unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
+{
+       return default_scale_smt_power(sd, cpu);
+}
+
+unsigned long scale_rt_power(int cpu)
+{
+       struct rq *rq = cpu_rq(cpu);
+       u64 total, available;
+
+       total = sched_avg_period() + (rq->clock - rq->age_stamp);
+
+       if (unlikely(total < rq->rt_avg)) {
+               /* Ensures that power won't end up being negative */
+               available = 0;
+       } else {
+               available = total - rq->rt_avg;
+       }
+
+       if (unlikely((s64)total < SCHED_POWER_SCALE))
+               total = SCHED_POWER_SCALE;
+
+       total >>= SCHED_POWER_SHIFT;
+
+       return div_u64(available, total);
+}
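+
+/*
+ * Example (editorial): if the rt/irq time tracked in rq->rt_avg amounts to
+ * a quarter of the averaging period, available/total is 3/4 and the
+ * function returns roughly 3/4 * SCHED_POWER_SCALE = 768, which
+ * update_cpu_power() below folds into the CPU's effective power.
+ */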
+
+static void update_cpu_power(struct sched_domain *sd, int cpu)
+{
+       unsigned long weight = sd->span_weight;
+       unsigned long power = SCHED_POWER_SCALE;
+       struct sched_group *sdg = sd->groups;
+
+       if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
+               if (sched_feat(ARCH_POWER))
+                       power *= arch_scale_smt_power(sd, cpu);
+               else
+                       power *= default_scale_smt_power(sd, cpu);
+
+               power >>= SCHED_POWER_SHIFT;
+       }
+
+       sdg->sgp->power_orig = power;
+
+       if (sched_feat(ARCH_POWER))
+               power *= arch_scale_freq_power(sd, cpu);
+       else
+               power *= default_scale_freq_power(sd, cpu);
+
+       power >>= SCHED_POWER_SHIFT;
+
+       power *= scale_rt_power(cpu);
+       power >>= SCHED_POWER_SHIFT;
+
+       if (!power)
+               power = 1;
+
+       cpu_rq(cpu)->cpu_power = power;
+       sdg->sgp->power = power;
+}
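+
+/*
+ * Putting the steps together (editorial illustration, default-ish numbers
+ * assumed): for one thread of an SMT-2 sibling domain with an smt_gain of
+ * about 1.15 * SCHED_POWER_SCALE, the SMT step yields ~589; if
+ * scale_rt_power() then reports 3/4 availability, the final cpu_power
+ * lands around 589 * 768 / 1024 ~= 441, well below the nominal 1024 of a
+ * fully available non-SMT CPU.
+ */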
+
+void update_group_power(struct sched_domain *sd, int cpu)
+{
+       struct sched_domain *child = sd->child;
+       struct sched_group *group, *sdg = sd->groups;
+       unsigned long power;
+
+       if (!child) {
+               update_cpu_power(sd, cpu);
+               return;
+       }
+
+       power = 0;
+
+       group = child->groups;
+       do {
+               power += group->sgp->power;
+               group = group->next;
+       } while (group != child->groups);
+
+       sdg->sgp->power = power;
+}
+
+/*
+ * Try and fix up capacity for tiny siblings, this is needed when
+ * things like SD_ASYM_PACKING need f_b_g to select another sibling
+ * which on its own isn't powerful enough.
+ *
+ * See update_sd_pick_busiest() and check_asym_packing().
+ */
+static inline int
+fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
+{
+       /*
+        * Only siblings can have significantly less than SCHED_POWER_SCALE
+        */
+       if (!(sd->flags & SD_SHARE_CPUPOWER))
+               return 0;
+
+       /*
+        * If ~90% of the cpu_power is still there, we're good.
+        */
+       if (group->sgp->power * 32 > group->sgp->power_orig * 29)
+               return 1;
+
+       return 0;
+}
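+
+/*
+ * In other words (editorial note): the 32/29 comparison puts the threshold
+ * at 29/32 ~= 90.6% of power_orig.  For an SMT-4 sibling whose per-thread
+ * power_orig is roughly 1178 / 4 ~= 294, the capacity above rounds to 0;
+ * as long as rt/irq pressure has not eaten more than ~10% of that power
+ * the capacity is fixed up to 1, otherwise the 0 stands.
+ */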
+
+/**
+ * update_sg_lb_stats - Update sched_group's statistics for load balancing.
+ * @sd: The sched_domain whose statistics are to be updated.
+ * @group: sched_group whose statistics are to be updated.
+ * @this_cpu: Cpu for which load balance is currently performed.
+ * @idle: Idle status of this_cpu
+ * @load_idx: Load index of sched_domain of this_cpu for load calc.
+ * @local_group: Does group contain this_cpu.
+ * @cpus: Set of cpus considered for load balancing.
+ * @balance: Should we balance.
+ * @sgs: variable to hold the statistics for this group.
+ */
+static inline void update_sg_lb_stats(struct sched_domain *sd,
+                       struct sched_group *group, int this_cpu,
+                       enum cpu_idle_type idle, int load_idx,
+                       int local_group, const struct cpumask *cpus,
+                       int *balance, struct sg_lb_stats *sgs)
+{
+       unsigned long load, max_cpu_load, min_cpu_load, max_nr_running;
+       int i;
+       unsigned int balance_cpu = -1, first_idle_cpu = 0;
+       unsigned long avg_load_per_task = 0;
+
+       if (local_group)
+               balance_cpu = group_first_cpu(group);
+
+       /* Tally up the load of all CPUs in the group */
+       max_cpu_load = 0;
+       min_cpu_load = ~0UL;
+       max_nr_running = 0;
+
+       for_each_cpu_and(i, sched_group_cpus(group), cpus) {
+               struct rq *rq = cpu_rq(i);
+
+               /* Bias balancing toward cpus of our domain */
+               if (local_group) {
+                       if (idle_cpu(i) && !first_idle_cpu) {
+                               first_idle_cpu = 1;
+                               balance_cpu = i;
+                       }
+
+                       load = target_load(i, load_idx);
+               } else {
+                       load = source_load(i, load_idx);
+                       if (load > max_cpu_load) {
+                               max_cpu_load = load;
+                               max_nr_running = rq->nr_running;
+                       }
+                       if (min_cpu_load > load)
+                               min_cpu_load = load;
+               }
+
+               sgs->group_load += load;
+               sgs->sum_nr_running += rq->nr_running;
+               sgs->sum_weighted_load += weighted_cpuload(i);
+               if (idle_cpu(i))
+                       sgs->idle_cpus++;
+       }
+
+       /*
+        * The first idle cpu or the first cpu (busiest) in this sched group
+        * is eligible for doing load balancing at this and above
+        * domains. In the newly idle case, we will allow all the cpus
+        * to do the newly idle load balance.
+        */
+       if (idle != CPU_NEWLY_IDLE && local_group) {
+               if (balance_cpu != this_cpu) {
+                       *balance = 0;
+                       return;
+               }
+               update_group_power(sd, this_cpu);
+       }
+
+       /* Adjust by relative CPU power of the group */
+       sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power;
+
+       /*
+        * Consider the group unbalanced when the imbalance is larger
+        * than the average weight of a task.
+        *
+        * APZ: with cgroup the avg task weight can vary wildly and
+        *      might not be a suitable number - should we keep a
+        *      normalized nr_running number somewhere that negates
+        *      the hierarchy?
+        */
+       if (sgs->sum_nr_running)
+               avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
+
+       if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1)
+               sgs->group_imb = 1;
+
+       sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
+                                               SCHED_POWER_SCALE);
+       if (!sgs->group_capacity)
+               sgs->group_capacity = fix_small_capacity(sd, group);
+       sgs->group_weight = group->group_weight;
+
+       if (sgs->group_capacity > sgs->sum_nr_running)
+               sgs->group_has_capacity = 1;
+}
+
+/**
+ * update_sd_pick_busiest - return 1 on busiest group
+ * @sd: sched_domain whose statistics are to be checked
+ * @sds: sched_domain statistics
+ * @sg: sched_group candidate to be checked for being the busiest
+ * @sgs: sched_group statistics
+ * @this_cpu: the current cpu
+ *
+ * Determine if @sg is a busier group than the previously selected
+ * busiest group.
+ */
+static bool update_sd_pick_busiest(struct sched_domain *sd,
+                                  struct sd_lb_stats *sds,
+                                  struct sched_group *sg,
+                                  struct sg_lb_stats *sgs,
+                                  int this_cpu)
+{
+       if (sgs->avg_load <= sds->max_load)
+               return false;
+
+       if (sgs->sum_nr_running > sgs->group_capacity)
+               return true;
+
+       if (sgs->group_imb)
+               return true;
+
+       /*
+        * ASYM_PACKING needs to move all the work to the lowest
+        * numbered CPUs in the group, therefore mark all groups
+        * higher than ourself as busy.
+        */
+       if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
+           this_cpu < group_first_cpu(sg)) {
+               if (!sds->busiest)
+                       return true;
+
+               if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
+                       return true;
+       }
+
+       return false;
+}
+
+/**
+ * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
+ * @sd: sched_domain whose statistics are to be updated.
+ * @this_cpu: Cpu for which load balance is currently performed.
+ * @idle: Idle status of this_cpu
+ * @cpus: Set of cpus considered for load balancing.
+ * @balance: Should we balance.
+ * @sds: variable to hold the statistics for this sched_domain.
+ */
+static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
+                       enum cpu_idle_type idle, const struct cpumask *cpus,
+                       int *balance, struct sd_lb_stats *sds)
+{
+       struct sched_domain *child = sd->child;
+       struct sched_group *sg = sd->groups;
+       struct sg_lb_stats sgs;
+       int load_idx, prefer_sibling = 0;
+
+       if (child && child->flags & SD_PREFER_SIBLING)
+               prefer_sibling = 1;
+
+       init_sd_power_savings_stats(sd, sds, idle);
+       load_idx = get_sd_load_idx(sd, idle);
+
+       do {
+               int local_group;
+
+               local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
+               memset(&sgs, 0, sizeof(sgs));
+               update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx,
+                               local_group, cpus, balance, &sgs);
+
+               if (local_group && !(*balance))
+                       return;
+
+               sds->total_load += sgs.group_load;
+               sds->total_pwr += sg->sgp->power;
+
+               /*
+                * In case the child domain prefers tasks to go to siblings
+                * first, lower the sg capacity to one so that we'll try
+                * and move all the excess tasks away. We lower the capacity
+                * of a group only if the local group has the capacity to fit
+                * these excess tasks, i.e. nr_running < group_capacity. The
+                * extra check prevents the case where you always pull from the
+                * heaviest group when it is already under-utilized (possible
+                * when a large-weight task outweighs the rest of the tasks on
+                * the system).
+                */
+               if (prefer_sibling && !local_group && sds->this_has_capacity)
+                       sgs.group_capacity = min(sgs.group_capacity, 1UL);
+
+               if (local_group) {
+                       sds->this_load = sgs.avg_load;
+                       sds->this = sg;
+                       sds->this_nr_running = sgs.sum_nr_running;
+                       sds->this_load_per_task = sgs.sum_weighted_load;
+                       sds->this_has_capacity = sgs.group_has_capacity;
+                       sds->this_idle_cpus = sgs.idle_cpus;
+               } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
+                       sds->max_load = sgs.avg_load;
+                       sds->busiest = sg;
+                       sds->busiest_nr_running = sgs.sum_nr_running;
+                       sds->busiest_idle_cpus = sgs.idle_cpus;
+                       sds->busiest_group_capacity = sgs.group_capacity;
+                       sds->busiest_load_per_task = sgs.sum_weighted_load;
+                       sds->busiest_has_capacity = sgs.group_has_capacity;
+                       sds->busiest_group_weight = sgs.group_weight;
+                       sds->group_imb = sgs.group_imb;
+               }
+
+               update_sd_power_savings_stats(sg, sds, local_group, &sgs);
+               sg = sg->next;
+       } while (sg != sd->groups);
+}
+
+/**
+ * check_asym_packing - Check to see if the group is packed into the
+ *                     sched domain.
+ *
+ * This is primarily intended to be used at the sibling level.  Some
+ * cores like POWER7 prefer to use lower numbered SMT threads.  In the
+ * case of POWER7, it can move to lower SMT modes only when higher
+ * threads are idle.  When in lower SMT modes, the threads will
+ * perform better since they share fewer core resources.  Hence when we
+ * have idle threads, we want them to be the higher ones.
+ *
+ * This packing function is run on idle threads.  It checks to see if
+ * the busiest CPU in this domain (core in the P7 case) has a higher
+ * CPU number than the packing function is being run on.  Here we are
+ * assuming that a lower CPU number is equivalent to a lower SMT thread
+ * number.
+ *
+ * Returns 1 when packing is required and a task should be moved to
+ * this CPU.  The amount of the imbalance is returned in *imbalance.
+ *
+ * @sd: The sched_domain whose packing is to be checked.
+ * @sds: Statistics of the sched_domain which is to be packed
+ * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
+ * @imbalance: returns amount of imbalance due to packing.
+ */
+static int check_asym_packing(struct sched_domain *sd,
+                             struct sd_lb_stats *sds,
+                             int this_cpu, unsigned long *imbalance)
+{
+       int busiest_cpu;
+
+       if (!(sd->flags & SD_ASYM_PACKING))
+               return 0;
+
+       if (!sds->busiest)
+               return 0;
+
+       busiest_cpu = group_first_cpu(sds->busiest);
+       if (this_cpu > busiest_cpu)
+               return 0;
+
+       *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power,
+                                      SCHED_POWER_SCALE);
+       return 1;
+}
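For intuition, here is a minimal standalone sketch of the imbalance computation just above: plain user-space C with made-up load and power figures (only SCHED_POWER_SCALE = 1024 reflects the usual kernel default; everything else is hypothetical and not part of this diff). It shows how DIV_ROUND_CLOSEST() converts the busiest group's power-scaled load back into plain load units.

/* Illustrative sketch only -- hypothetical values, not kernel code.
 * DIV_ROUND_CLOSEST is re-defined here in a simplified, unsigned-only form.
 */
#include <stdio.h>

#define DIV_ROUND_CLOSEST(x, d) (((x) + ((d) / 2)) / (d))
#define SCHED_POWER_SCALE       1024UL

int main(void)
{
        unsigned long max_load = 1536;  /* power-scaled load of the busiest group */
        unsigned long power    = 2048;  /* e.g. a 2-thread SMT group at full power */
        unsigned long imbalance;

        imbalance = DIV_ROUND_CLOSEST(max_load * power, SCHED_POWER_SCALE);
        printf("imbalance = %lu\n", imbalance);  /* prints 3072 */
        return 0;
}

Rounding to the closest value rather than truncating avoids systematically under-reporting the amount of load to move.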
+
+/**
+ * fix_small_imbalance - Calculate the minor imbalance that exists
+ *                     amongst the groups of a sched_domain, during
+ *                     load balancing.
+ * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
+ * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
+ * @imbalance: Variable to store the imbalance.
+ */
+static inline void fix_small_imbalance(struct sd_lb_stats *sds,
+                               int this_cpu, unsigned long *imbalance)
+{
+       unsigned long tmp, pwr_now = 0, pwr_move = 0;
+       unsigned int imbn = 2;
+       unsigned long scaled_busy_load_per_task;
+
+       if (sds->this_nr_running) {
+               sds->this_load_per_task /= sds->this_nr_running;
+               if (sds->busiest_load_per_task >
+                               sds->this_load_per_task)
+                       imbn = 1;
+       } else
+               sds->this_load_per_task =
+                       cpu_avg_load_per_task(this_cpu);
+
+       scaled_busy_load_per_task = sds->busiest_load_per_task
+                                        * SCHED_POWER_SCALE;
+       scaled_busy_load_per_task /= sds->busiest->sgp->power;
+
+       if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
+                       (scaled_busy_load_per_task * imbn)) {
+               *imbalance = sds->busiest_load_per_task;
+               return;
+       }
+
+       /*
+        * OK, we don't have enough imbalance to justify moving tasks,
+        * however we may be able to increase total CPU power used by
+        * moving them.
+        */
+
+       pwr_now += sds->busiest->sgp->power *
+                       min(sds->busiest_load_per_task, sds->max_load);
+       pwr_now += sds->this->sgp->power *
+                       min(sds->this_load_per_task, sds->this_load);
+       pwr_now /= SCHED_POWER_SCALE;
+
+       /* Amount of load we'd subtract */
+       tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
+               sds->busiest->sgp->power;
+       if (sds->max_load > tmp)
+               pwr_move += sds->busiest->sgp->power *
+                       min(sds->busiest_load_per_task, sds->max_load - tmp);
+
+       /* Amount of load we'd add */
+       if (sds->max_load * sds->busiest->sgp->power <
+               sds->busiest_load_per_task * SCHED_POWER_SCALE)
+               tmp = (sds->max_load * sds->busiest->sgp->power) /
+                       sds->this->sgp->power;
+       else
+               tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
+                       sds->this->sgp->power;
+       pwr_move += sds->this->sgp->power *
+                       min(sds->this_load_per_task, sds->this_load + tmp);
+       pwr_move /= SCHED_POWER_SCALE;
+
+       /* Move if we gain throughput */
+       if (pwr_move > pwr_now)
+               *imbalance = sds->busiest_load_per_task;
+}
+
+/**
+ * calculate_imbalance - Calculate the amount of imbalance present within the
+ *                      groups of a given sched_domain during load balance.
+ * @sds: statistics of the sched_domain whose imbalance is to be calculated.
+ * @this_cpu: Cpu for which currently load balance is being performed.
+ * @imbalance: The variable to store the imbalance.
+ */
+static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
+               unsigned long *imbalance)
+{
+       unsigned long max_pull, load_above_capacity = ~0UL;
+
+       sds->busiest_load_per_task /= sds->busiest_nr_running;
+       if (sds->group_imb) {
+               sds->busiest_load_per_task =
+                       min(sds->busiest_load_per_task, sds->avg_load);
+       }
+
+       /*
+        * In the presence of smp nice balancing, certain scenarios can have
+        * max load less than avg load (as we skip the groups at or below
+        * their cpu_power while calculating max_load).
+        */
+       if (sds->max_load < sds->avg_load) {
+               *imbalance = 0;
+               return fix_small_imbalance(sds, this_cpu, imbalance);
+       }
+
+       if (!sds->group_imb) {
+               /*
+                * Don't want to pull so many tasks that a group would go idle.
+                */
+               load_above_capacity = (sds->busiest_nr_running -
+                                               sds->busiest_group_capacity);
+
+               load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
+
+               load_above_capacity /= sds->busiest->sgp->power;
+       }
+
+       /*
+        * We're trying to get all the cpus to the average_load, so we don't
+        * want to push ourselves above the average load, nor do we wish to
+        * reduce the max loaded cpu below the average load. At the same time,
+        * we also don't want to reduce the group load below the group capacity
+        * (so that we can implement power-savings policies etc). Thus we look
+        * for the minimum possible imbalance.
+        * Be careful of negative numbers as they'll appear as very large values
+        * with unsigned longs.
+        */
+       max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
+
+       /* How much load to actually move to equalise the imbalance */
+       *imbalance = min(max_pull * sds->busiest->sgp->power,
+               (sds->avg_load - sds->this_load) * sds->this->sgp->power)
+                       / SCHED_POWER_SCALE;
+
+       /*
+        * If *imbalance is less than the average load per runnable task,
+        * there is no guarantee that any tasks will be moved, so fall back
+        * to fix_small_imbalance(), which may bump the value enough to force
+        * at least one task to be moved.
+        */
+       if (*imbalance < sds->busiest_load_per_task)
+               return fix_small_imbalance(sds, this_cpu, imbalance);
+
+}
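As a worked example of the "minimum possible imbalance" logic above, this reduces the final computation in calculate_imbalance() to plain arithmetic (user-space C, hypothetical numbers, not part of this diff):

#include <stdio.h>

#define SCHED_POWER_SCALE 1024UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

int main(void)
{
        /* All values hypothetical, in the same scaled-load units the kernel uses. */
        unsigned long max_load = 2000, avg_load = 1200, this_load = 800;
        unsigned long load_above_capacity = 1500;
        unsigned long busiest_power = 1024, this_power = 1024;

        unsigned long max_pull = min_ul(max_load - avg_load, load_above_capacity);
        unsigned long imbalance = min_ul(max_pull * busiest_power,
                                         (avg_load - this_load) * this_power)
                                  / SCHED_POWER_SCALE;

        printf("max_pull=%lu imbalance=%lu\n", max_pull, imbalance); /* 800 400 */
        return 0;
}

Here the local group's headroom (avg_load - this_load) is the binding term, so only 400 units are requested even though the busiest group sits 800 above the average, which is exactly the "don't push ourselves above the average" behaviour described in the comment.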
+
+/******* find_busiest_group() helpers end here *********************/
+
+/**
+ * find_busiest_group - Returns the busiest group within the sched_domain
+ * if there is an imbalance. If there isn't an imbalance, and
+ * the user has opted for power-savings, it returns a group whose
+ * CPUs can be put to idle by rebalancing those tasks elsewhere, if
+ * such a group exists.
+ *
+ * Also calculates the amount of weighted load which should be moved
+ * to restore balance.
+ *
+ * @sd: The sched_domain whose busiest group is to be returned.
+ * @this_cpu: The cpu for which load balancing is currently being performed.
+ * @imbalance: Variable which stores amount of weighted load which should
+ *             be moved to restore balance/put a group to idle.
+ * @idle: The idle status of this_cpu.
+ * @cpus: The set of CPUs under consideration for load-balancing.
+ * @balance: Pointer to a variable indicating if this_cpu
+ *     is the appropriate cpu to perform load balancing at this_level.
+ *
+ * Returns:    - the busiest group if imbalance exists.
+ *             - If no imbalance and user has opted for power-savings balance,
+ *                return the least loaded group whose CPUs can be
+ *                put to idle by rebalancing its tasks onto our group.
+ */
+static struct sched_group *
+find_busiest_group(struct sched_domain *sd, int this_cpu,
+                  unsigned long *imbalance, enum cpu_idle_type idle,
+                  const struct cpumask *cpus, int *balance)
+{
+       struct sd_lb_stats sds;
+
+       memset(&sds, 0, sizeof(sds));
+
+       /*
+        * Compute the various statistics relevant for load balancing at
+        * this level.
+        */
+       update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds);
+
+       /*
+        * this_cpu is not the appropriate cpu to perform load balancing at
+        * this level.
+        */
+       if (!(*balance))
+               goto ret;
+
+       if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) &&
+           check_asym_packing(sd, &sds, this_cpu, imbalance))
+               return sds.busiest;
+
+       /* There is no busy sibling group to pull tasks from */
+       if (!sds.busiest || sds.busiest_nr_running == 0)
+               goto out_balanced;
+
+       sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
+
+       /*
+        * If the busiest group is imbalanced the below checks don't
+        * work because they assume all things are equal, which typically
+        * isn't true due to cpus_allowed constraints and the like.
+        */
+       if (sds.group_imb)
+               goto force_balance;
+
+       /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
+       if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
+                       !sds.busiest_has_capacity)
+               goto force_balance;
+
+       /*
+        * If the local group is more busy than the selected busiest group
+        * don't try and pull any tasks.
+        */
+       if (sds.this_load >= sds.max_load)
+               goto out_balanced;
+
+       /*
+        * Don't pull any tasks if this group is already above the domain
+        * average load.
+        */
+       if (sds.this_load >= sds.avg_load)
+               goto out_balanced;
+
+       if (idle == CPU_IDLE) {
+               /*
+                * This cpu is idle. If the busiest group doesn't have
+                * more tasks than the number of available cpus and there
+                * is no imbalance between this and the busiest group wrt
+                * idle cpus, it is balanced.
+                */
+               if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
+                   sds.busiest_nr_running <= sds.busiest_group_weight)
+                       goto out_balanced;
+       } else {
+               /*
+                * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
+                * imbalance_pct to be conservative.
+                */
+               if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
+                       goto out_balanced;
+       }
+
+force_balance:
+       /* Looks like there is an imbalance. Compute it */
+       calculate_imbalance(&sds, this_cpu, imbalance);
+       return sds.busiest;
+
+out_balanced:
+       /*
+        * There is no obvious imbalance. But check if we can do some balancing
+        * to save power.
+        */
+       if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
+               return sds.busiest;
+ret:
+       *imbalance = 0;
+       return NULL;
+}
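The imbalance_pct check in the non-idle branch above reads as: the busiest group must be more than (imbalance_pct - 100) percent busier than the local group before balancing is attempted. A standalone sketch with hypothetical loads (125 is a common per-domain default for imbalance_pct, but treat all numbers here as illustrative, not taken from this diff):

#include <stdio.h>

/* Mirrors the "100 * max_load <= imbalance_pct * this_load" test. */
static int balanced(unsigned long max_load, unsigned long this_load,
                    unsigned int imbalance_pct)
{
        return 100 * max_load <= imbalance_pct * this_load;
}

int main(void)
{
        printf("%d\n", balanced(1200, 1000, 125)); /* 1: within 25%, leave it alone */
        printf("%d\n", balanced(1300, 1000, 125)); /* 0: imbalanced, go balance */
        return 0;
}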
+
+/*
+ * find_busiest_queue - find the busiest runqueue among the cpus in group.
+ */
+static struct rq *
+find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
+                  enum cpu_idle_type idle, unsigned long imbalance,
+                  const struct cpumask *cpus)
+{
+       struct rq *busiest = NULL, *rq;
+       unsigned long max_load = 0;
+       int i;
+
+       for_each_cpu(i, sched_group_cpus(group)) {
+               unsigned long power = power_of(i);
+               unsigned long capacity = DIV_ROUND_CLOSEST(power,
+                                                          SCHED_POWER_SCALE);
+               unsigned long wl;
+
+               if (!capacity)
+                       capacity = fix_small_capacity(sd, group);
+
+               if (!cpumask_test_cpu(i, cpus))
+                       continue;
+
+               rq = cpu_rq(i);
+               wl = weighted_cpuload(i);
+
+               /*
+                * When comparing with imbalance, use weighted_cpuload()
+                * which is not scaled with the cpu power.
+                */
+               if (capacity && rq->nr_running == 1 && wl > imbalance)
+                       continue;
+
+               /*
+                * For the load comparisons with the other cpu's, consider
+                * the weighted_cpuload() scaled with the cpu power, so that
+                * the load can be moved away from the cpu that is potentially
+                * running at a lower capacity.
+                */
+               wl = (wl * SCHED_POWER_SCALE) / power;
+
+               if (wl > max_load) {
+                       max_load = wl;
+                       busiest = rq;
+               }
+       }
+
+       return busiest;
+}
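The comment above explains that, for comparisons between CPUs, the weighted load is scaled by cpu power. The effect is easy to see with toy numbers (user-space C sketch, hypothetical values, not part of this diff): the same raw load counts double on a half-power CPU, so such a CPU is preferred as the source of migration.

#include <stdio.h>

#define SCHED_POWER_SCALE 1024UL

int main(void)
{
        unsigned long wl = 1024;          /* identical raw weighted load on both CPUs */
        unsigned long full_power = 1024;  /* a full-capacity CPU */
        unsigned long half_power = 512;   /* e.g. an SMT sibling or capped CPU */

        printf("full-power cpu: %lu\n", wl * SCHED_POWER_SCALE / full_power); /* 1024 */
        printf("half-power cpu: %lu\n", wl * SCHED_POWER_SCALE / half_power); /* 2048 */
        return 0;
}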
+
+/*
+ * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
+ * fine so long as it is large enough.
+ */
+#define MAX_PINNED_INTERVAL    512
+
+/* Working cpumask for load_balance and load_balance_newidle. */
+DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
+
+static int need_active_balance(struct sched_domain *sd, int idle,
+                              int busiest_cpu, int this_cpu)
+{
+       if (idle == CPU_NEWLY_IDLE) {
+
+               /*
+                * ASYM_PACKING needs to force migrate tasks from busy but
+                * higher numbered CPUs in order to pack all tasks in the
+                * lowest numbered CPUs.
+                */
+               if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu)
+                       return 1;
+
+               /*
+                * The only task running on a non-idle cpu can be moved to this
+                * cpu in an attempt to completely free up the other CPU
+                * package.
+                *
+                * The package power saving logic comes from
+                * find_busiest_group(). If there is no imbalance, then
+                * f_b_g() will return NULL. However when sched_mc={1,2} then
+                * f_b_g() will select a group from which a running task may be
+                * pulled to this cpu in order to make the other package idle.
+                * If there is no opportunity to make a package idle and if
+                * there is no imbalance, then f_b_g() will return NULL and no
+                * action will be taken in load_balance_newidle().
+                *
+                * Under normal task pull operation due to imbalance, there
+                * will be more than one task in the source run queue and
+                * move_tasks() will succeed.  ld_moved will be true and this
+                * active balance code will not be triggered.
+                */
+               if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
+                       return 0;
+       }
+
+       return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
+}
+
+static int active_load_balance_cpu_stop(void *data);
+
+/*
+ * Check this_cpu to ensure it is balanced within domain. Attempt to move
+ * tasks if there is an imbalance.
+ */
+static int load_balance(int this_cpu, struct rq *this_rq,
+                       struct sched_domain *sd, enum cpu_idle_type idle,
+                       int *balance)
+{
+       int ld_moved, lb_flags = 0, active_balance = 0;
+       struct sched_group *group;
+       unsigned long imbalance;
+       struct rq *busiest;
+       unsigned long flags;
+       struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
+
+       cpumask_copy(cpus, cpu_active_mask);
+
+       schedstat_inc(sd, lb_count[idle]);
+
+redo:
+       group = find_busiest_group(sd, this_cpu, &imbalance, idle,
+                                  cpus, balance);
+
+       if (*balance == 0)
+               goto out_balanced;
+
+       if (!group) {
+               schedstat_inc(sd, lb_nobusyg[idle]);
+               goto out_balanced;
+       }
+
+       busiest = find_busiest_queue(sd, group, idle, imbalance, cpus);
+       if (!busiest) {
+               schedstat_inc(sd, lb_nobusyq[idle]);
+               goto out_balanced;
+       }
+
+       BUG_ON(busiest == this_rq);
+
+       schedstat_add(sd, lb_imbalance[idle], imbalance);
+
+       ld_moved = 0;
+       if (busiest->nr_running > 1) {
+               /*
+                * Attempt to move tasks. If find_busiest_group has found
+                * an imbalance but busiest->nr_running <= 1, the group is
+                * still unbalanced. ld_moved simply stays zero, so it is
+                * correctly treated as an imbalance.
+                */
+               lb_flags |= LBF_ALL_PINNED;
+               local_irq_save(flags);
+               double_rq_lock(this_rq, busiest);
+               ld_moved = move_tasks(this_rq, this_cpu, busiest,
+                                     imbalance, sd, idle, &lb_flags);
+               double_rq_unlock(this_rq, busiest);
+               local_irq_restore(flags);
+
+               /*
+                * some other cpu did the load balance for us.
+                */
+               if (ld_moved && this_cpu != smp_processor_id())
+                       resched_cpu(this_cpu);
+
+               if (lb_flags & LBF_ABORT)
+                       goto out_balanced;
+
+               if (lb_flags & LBF_NEED_BREAK) {
+                       lb_flags &= ~LBF_NEED_BREAK;
+                       goto redo;
+               }
+
+               /* All tasks on this runqueue were pinned by CPU affinity */
+               if (unlikely(lb_flags & LBF_ALL_PINNED)) {
+                       cpumask_clear_cpu(cpu_of(busiest), cpus);
+                       if (!cpumask_empty(cpus))
+                               goto redo;
+                       goto out_balanced;
+               }
+       }
+
+       if (!ld_moved) {
+               schedstat_inc(sd, lb_failed[idle]);
+               /*
+                * Increment the failure counter only on periodic balance.
+                * We do not want newidle balance, which can be very
+                * frequent, pollute the failure counter causing
+                * excessive cache_hot migrations and active balances.
+                */
+               if (idle != CPU_NEWLY_IDLE)
+                       sd->nr_balance_failed++;
+
+               if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) {
+                       raw_spin_lock_irqsave(&busiest->lock, flags);
+
+                       /* don't kick the active_load_balance_cpu_stop,
+                        * if the curr task on busiest cpu can't be
+                        * moved to this_cpu
+                        */
+                       if (!cpumask_test_cpu(this_cpu,
+                                       tsk_cpus_allowed(busiest->curr))) {
+                               raw_spin_unlock_irqrestore(&busiest->lock,
+                                                           flags);
+                               lb_flags |= LBF_ALL_PINNED;
+                               goto out_one_pinned;
+                       }
+
+                       /*
+                        * ->active_balance synchronizes accesses to
+                        * ->active_balance_work.  Once set, it's cleared
+                        * only after active load balance is finished.
+                        */
+                       if (!busiest->active_balance) {
+                               busiest->active_balance = 1;
+                               busiest->push_cpu = this_cpu;
+                               active_balance = 1;
+                       }
+                       raw_spin_unlock_irqrestore(&busiest->lock, flags);
+
+                       if (active_balance)
+                               stop_one_cpu_nowait(cpu_of(busiest),
+                                       active_load_balance_cpu_stop, busiest,
+                                       &busiest->active_balance_work);
+
+                       /*
+                        * We've kicked active balancing, reset the failure
+                        * counter.
+                        */
+                       sd->nr_balance_failed = sd->cache_nice_tries+1;
+               }
+       } else
+               sd->nr_balance_failed = 0;
+
+       if (likely(!active_balance)) {
+               /* We were unbalanced, so reset the balancing interval */
+               sd->balance_interval = sd->min_interval;
+       } else {
+               /*
+                * If we've begun active balancing, start to back off. This
+                * case may not be covered by the all_pinned logic if there
+                * is only 1 task on the busy runqueue (because we don't call
+                * move_tasks).
+                */
+               if (sd->balance_interval < sd->max_interval)
+                       sd->balance_interval *= 2;
+       }
+
+       goto out;
+
+out_balanced:
+       schedstat_inc(sd, lb_balanced[idle]);
+
+       sd->nr_balance_failed = 0;
+
+out_one_pinned:
+       /* tune up the balancing interval */
+       if (((lb_flags & LBF_ALL_PINNED) &&
+                       sd->balance_interval < MAX_PINNED_INTERVAL) ||
+                       (sd->balance_interval < sd->max_interval))
+               sd->balance_interval *= 2;
+
+       ld_moved = 0;
+out:
+       return ld_moved;
+}
+
+/*
+ * idle_balance is called by schedule() if this_cpu is about to become
+ * idle. Attempts to pull tasks from other CPUs.
+ */
+void idle_balance(int this_cpu, struct rq *this_rq)
+{
+       struct sched_domain *sd;
+       int pulled_task = 0;
+       unsigned long next_balance = jiffies + HZ;
+
+       this_rq->idle_stamp = this_rq->clock;
+
+       if (this_rq->avg_idle < sysctl_sched_migration_cost)
+               return;
+
+       /*
+        * Drop the rq->lock, but keep IRQ/preempt disabled.
+        */
+       raw_spin_unlock(&this_rq->lock);
+
+       update_shares(this_cpu);
+       rcu_read_lock();
+       for_each_domain(this_cpu, sd) {
+               unsigned long interval;
+               int balance = 1;
+
+               if (!(sd->flags & SD_LOAD_BALANCE))
+                       continue;
+
+               if (sd->flags & SD_BALANCE_NEWIDLE) {
+                       /* If we've pulled tasks over stop searching: */
+                       pulled_task = load_balance(this_cpu, this_rq,
+                                                  sd, CPU_NEWLY_IDLE, &balance);
+               }
+
+               interval = msecs_to_jiffies(sd->balance_interval);
+               if (time_after(next_balance, sd->last_balance + interval))
+                       next_balance = sd->last_balance + interval;
+               if (pulled_task) {
+                       this_rq->idle_stamp = 0;
+                       break;
+               }
+       }
+       rcu_read_unlock();
+
+       raw_spin_lock(&this_rq->lock);
+
+       if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
+               /*
+                * We are going idle. next_balance may be set based on
+                * a busy processor. So reset next_balance.
+                */
+               this_rq->next_balance = next_balance;
+       }
+}
+
+/*
+ * active_load_balance_cpu_stop is run by cpu stopper. It pushes
+ * running tasks off the busiest CPU onto idle CPUs. It requires at
+ * least 1 task to be running on each physical CPU where possible, and
+ * avoids physical / logical imbalances.
+ */
+static int active_load_balance_cpu_stop(void *data)
+{
+       struct rq *busiest_rq = data;
+       int busiest_cpu = cpu_of(busiest_rq);
+       int target_cpu = busiest_rq->push_cpu;
+       struct rq *target_rq = cpu_rq(target_cpu);
+       struct sched_domain *sd;
+
+       raw_spin_lock_irq(&busiest_rq->lock);
+
+       /* make sure the requested cpu hasn't gone down in the meantime */
+       if (unlikely(busiest_cpu != smp_processor_id() ||
+                    !busiest_rq->active_balance))
+               goto out_unlock;
+
+       /* Is there any task to move? */
+       if (busiest_rq->nr_running <= 1)
+               goto out_unlock;
+
+       /*
+        * This condition is "impossible"; if it occurs
+        * we need to fix it. Originally reported by
+        * Bjorn Helgaas on a 128-cpu setup.
+        */
+       BUG_ON(busiest_rq == target_rq);
+
+       /* move a task from busiest_rq to target_rq */
+       double_lock_balance(busiest_rq, target_rq);
+
+       /* Search for an sd spanning us and the target CPU. */
+       rcu_read_lock();
+       for_each_domain(target_cpu, sd) {
+               if ((sd->flags & SD_LOAD_BALANCE) &&
+                   cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
+                               break;
+       }
+
+       if (likely(sd)) {
+               schedstat_inc(sd, alb_count);
+
+               if (move_one_task(target_rq, target_cpu, busiest_rq,
+                                 sd, CPU_IDLE))
+                       schedstat_inc(sd, alb_pushed);
+               else
+                       schedstat_inc(sd, alb_failed);
+       }
+       rcu_read_unlock();
+       double_unlock_balance(busiest_rq, target_rq);
+out_unlock:
+       busiest_rq->active_balance = 0;
+       raw_spin_unlock_irq(&busiest_rq->lock);
+       return 0;
+}
+
+#ifdef CONFIG_NO_HZ
+/*
+ * idle load balancing details
+ * - When one of the busy CPUs notices that idle rebalancing may be
+ *   needed, it will kick the idle load balancer, which then does idle
+ *   load balancing for all the idle CPUs.
+ */
+static struct {
+       cpumask_var_t idle_cpus_mask;
+       atomic_t nr_cpus;
+       unsigned long next_balance;     /* in jiffy units */
+} nohz ____cacheline_aligned;
+
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+/**
+ * lowest_flag_domain - Return lowest sched_domain containing flag.
+ * @cpu:       The cpu whose lowest level of sched domain is to
+ *             be returned.
+ * @flag:      The flag to check for the lowest sched_domain
+ *             for the given cpu.
+ *
+ * Returns the lowest sched_domain of a cpu which contains the given flag.
+ */
+static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
+{
+       struct sched_domain *sd;
+
+       for_each_domain(cpu, sd)
+               if (sd->flags & flag)
+                       break;
+
+       return sd;
+}
+
+/**
+ * for_each_flag_domain - Iterates over sched_domains containing the flag.
+ * @cpu:       The cpu whose domains we're iterating over.
+ * @sd:                variable holding the value of the power_savings_sd
+ *             for cpu.
+ * @flag:      The flag to filter the sched_domains to be iterated.
+ *
+ * Iterates over all the scheduler domains for a given cpu that has the 'flag'
+ * set, starting from the lowest sched_domain to the highest.
+ */
+#define for_each_flag_domain(cpu, sd, flag) \
+       for (sd = lowest_flag_domain(cpu, flag); \
+               (sd && (sd->flags & flag)); sd = sd->parent)
+
+/**
+ * find_new_ilb - Finds the optimum idle load balancer for nomination.
+ * @cpu:       The cpu which is nominating a new idle_load_balancer.
+ *
+ * Returns:    The id of the idle load balancer if it exists,
+ *             else a value >= nr_cpu_ids.
+ *
+ * This algorithm picks the idle load balancer such that it belongs to a
+ * semi-idle powersavings sched_domain. The idea is to try and avoid
+ * completely idle packages/cores just for the purpose of idle load balancing
+ * when there are other idle CPUs which are better suited for that job.
+ */
+static int find_new_ilb(int cpu)
+{
+       int ilb = cpumask_first(nohz.idle_cpus_mask);
+       struct sched_group *ilbg;
+       struct sched_domain *sd;
+
+       /*
+        * Have idle load balancer selection from semi-idle packages only
+        * when power-aware load balancing is enabled
+        */
+       if (!(sched_smt_power_savings || sched_mc_power_savings))
+               goto out_done;
+
+       /*
+        * Optimize for the case when we have no idle CPUs or only one
+        * idle CPU. Don't walk the sched_domain hierarchy in such cases.
+        */
+       if (cpumask_weight(nohz.idle_cpus_mask) < 2)
+               goto out_done;
+
+       rcu_read_lock();
+       for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
+               ilbg = sd->groups;
+
+               do {
+                       if (ilbg->group_weight !=
+                               atomic_read(&ilbg->sgp->nr_busy_cpus)) {
+                               ilb = cpumask_first_and(nohz.idle_cpus_mask,
+                                                       sched_group_cpus(ilbg));
+                               goto unlock;
+                       }
+
+                       ilbg = ilbg->next;
+
+               } while (ilbg != sd->groups);
+       }
+unlock:
+       rcu_read_unlock();
+
+out_done:
+       if (ilb < nr_cpu_ids && idle_cpu(ilb))
+               return ilb;
+
+       return nr_cpu_ids;
+}
+#else /*  (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
+static inline int find_new_ilb(int call_cpu)
+{
+       return nr_cpu_ids;
+}
+#endif
+
+/*
+ * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
+ * nohz_load_balancer CPU (if there is one), otherwise fall back to any idle
+ * CPU (if there is one).
+ */
+static void nohz_balancer_kick(int cpu)
+{
+       int ilb_cpu;
+
+       nohz.next_balance++;
+
+       ilb_cpu = find_new_ilb(cpu);
+
+       if (ilb_cpu >= nr_cpu_ids)
+               return;
+
+       if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
+               return;
+       /*
+        * Use smp_send_reschedule() instead of resched_cpu().
+        * This way we generate a sched IPI on the target cpu, which
+        * is idle, and the softirq performing nohz idle load balancing
+        * will be run before returning from the IPI.
+        */
+       smp_send_reschedule(ilb_cpu);
+       return;
+}
+
+static inline void set_cpu_sd_state_busy(void)
+{
+       struct sched_domain *sd;
+       int cpu = smp_processor_id();
+
+       if (!test_bit(NOHZ_IDLE, nohz_flags(cpu)))
+               return;
+       clear_bit(NOHZ_IDLE, nohz_flags(cpu));
+
+       rcu_read_lock();
+       for_each_domain(cpu, sd)
+               atomic_inc(&sd->groups->sgp->nr_busy_cpus);
+       rcu_read_unlock();
+}
+
+void set_cpu_sd_state_idle(void)
+{
+       struct sched_domain *sd;
+       int cpu = smp_processor_id();
+
+       if (test_bit(NOHZ_IDLE, nohz_flags(cpu)))
+               return;
+       set_bit(NOHZ_IDLE, nohz_flags(cpu));
+
+       rcu_read_lock();
+       for_each_domain(cpu, sd)
+               atomic_dec(&sd->groups->sgp->nr_busy_cpus);
+       rcu_read_unlock();
+}
+
+/*
+ * This routine will record that this cpu is going idle with tick stopped.
+ * This info will be used in performing idle load balancing in the future.
+ */
+void select_nohz_load_balancer(int stop_tick)
+{
+       int cpu = smp_processor_id();
+
+       if (stop_tick) {
+               if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
+                       return;
+
+               cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
+               atomic_inc(&nohz.nr_cpus);
+               set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
+       }
+       return;
+}
+#endif
+
+static DEFINE_SPINLOCK(balancing);
+
+static unsigned long __read_mostly max_load_balance_interval = HZ/10;
+
+/*
+ * Scale the max load_balance interval with the number of CPUs in the system.
+ * This trades load-balance latency on larger machines for less cross talk.
+ */
+void update_max_interval(void)
+{
+       max_load_balance_interval = HZ*num_online_cpus()/10;
+}
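A quick numeric illustration of the scaling above, assuming HZ = 1000 purely as an example tick rate (user-space C, not part of this diff):

#include <stdio.h>

int main(void)
{
        unsigned long hz = 1000;               /* example CONFIG_HZ */
        int cpus[] = { 1, 4, 16, 64 };

        for (int i = 0; i < 4; i++)            /* HZ * num_online_cpus() / 10 */
                printf("%3d cpus -> max interval %lu jiffies\n",
                       cpus[i], hz * cpus[i] / 10);
        return 0;
}

With these numbers the cap grows from 100 jiffies on a single CPU to 6400 jiffies on a 64-CPU machine, trading balance latency for less cross-CPU traffic as the comment describes.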
+
+/*
+ * It checks each scheduling domain to see if it is due to be balanced,
+ * and initiates a balancing operation if so.
+ *
+ * Balancing parameters are set up in arch_init_sched_domains.
+ */
+static void rebalance_domains(int cpu, enum cpu_idle_type idle)
+{
+       int balance = 1;
+       struct rq *rq = cpu_rq(cpu);
+       unsigned long interval;
+       struct sched_domain *sd;
+       /* Earliest time when we have to do rebalance again */
+       unsigned long next_balance = jiffies + 60*HZ;
+       int update_next_balance = 0;
+       int need_serialize;
+
+       update_shares(cpu);
+
+       rcu_read_lock();
+       for_each_domain(cpu, sd) {
+               if (!(sd->flags & SD_LOAD_BALANCE))
+                       continue;
+
+               interval = sd->balance_interval;
+               if (idle != CPU_IDLE)
+                       interval *= sd->busy_factor;
+
+               /* scale ms to jiffies */
+               interval = msecs_to_jiffies(interval);
+               interval = clamp(interval, 1UL, max_load_balance_interval);
+
+               need_serialize = sd->flags & SD_SERIALIZE;
+
+               if (need_serialize) {
+                       if (!spin_trylock(&balancing))
+                               goto out;
+               }
+
+               if (time_after_eq(jiffies, sd->last_balance + interval)) {
+                       if (load_balance(cpu, rq, sd, idle, &balance)) {
+                               /*
+                                * We've pulled tasks over, so we're no
+                                * longer idle.
+                                */
+                               idle = CPU_NOT_IDLE;
+                       }
+                       sd->last_balance = jiffies;
+               }
+               if (need_serialize)
+                       spin_unlock(&balancing);
+out:
+               if (time_after(next_balance, sd->last_balance + interval)) {
+                       next_balance = sd->last_balance + interval;
+                       update_next_balance = 1;
+               }
+
+               /*
+                * Stop the load balance at this level. There is another
+                * CPU in our sched group which is doing load balancing more
+                * actively.
+                */
+               if (!balance)
+                       break;
+       }
+       rcu_read_unlock();
+
+       /*
+        * next_balance will be updated only when there is a need.
+        * When the cpu is attached to a null domain, for example, it will not be
+        * updated.
+        */
+       if (likely(update_next_balance))
+               rq->next_balance = next_balance;
+}
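The interval handling above stretches a domain's base interval by busy_factor when the CPU is not idle and then clamps it to max_load_balance_interval. A standalone sketch with hypothetical per-domain values; HZ = 1000 is assumed so milliseconds and jiffies coincide (nothing here is taken from this diff):

#include <stdio.h>

static unsigned long clamp_ul(unsigned long v, unsigned long lo, unsigned long hi)
{
        return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
        unsigned long balance_interval = 8; /* hypothetical base interval, in ms */
        unsigned long busy_factor = 32;     /* hypothetical per-domain tunable */
        unsigned long max_interval = 400;   /* e.g. HZ * 4 cpus / 10 with HZ=1000 */

        unsigned long idle_j = clamp_ul(balance_interval, 1, max_interval);
        unsigned long busy_j = clamp_ul(balance_interval * busy_factor, 1, max_interval);

        printf("idle: %lu jiffies, busy: %lu jiffies\n", idle_j, busy_j); /* 8, 256 */
        return 0;
}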
+
+#ifdef CONFIG_NO_HZ
+/*
+ * In the CONFIG_NO_HZ case, the kicked idle CPU (the "kickee") does the
+ * rebalancing for all the cpus whose scheduler ticks are stopped.
+ */
+static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
+{
+       struct rq *this_rq = cpu_rq(this_cpu);
+       struct rq *rq;
+       int balance_cpu;
+
+       if (idle != CPU_IDLE ||
+           !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
+               goto end;
+
+       for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
+               if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
+                       continue;
+
+               /*
+                * If this cpu gets work to do, stop the load balancing
+                * work being done for other cpus. Next load
+                * balancing owner will pick it up.
+                */
+               if (need_resched())
+                       break;
+
+               raw_spin_lock_irq(&this_rq->lock);
+               update_rq_clock(this_rq);
+               update_cpu_load(this_rq);
+               raw_spin_unlock_irq(&this_rq->lock);
+
+               rebalance_domains(balance_cpu, CPU_IDLE);
+
+               rq = cpu_rq(balance_cpu);
+               if (time_after(this_rq->next_balance, rq->next_balance))
+                       this_rq->next_balance = rq->next_balance;
+       }
+       nohz.next_balance = this_rq->next_balance;
+end:
+       clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
+}
+
+/*
+ * Current heuristic for kicking the idle load balancer in the presence
+ * of an idle cpu in the system:
+ *   - This rq has more than one task.
+ *   - At any scheduler domain level, this cpu's scheduler group has multiple
+ *     busy cpu's exceeding the group's power.
+ *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
+ *     domain span are idle.
+ */
+static inline int nohz_kick_needed(struct rq *rq, int cpu)
+{
+       unsigned long now = jiffies;
+       struct sched_domain *sd;
+
+       if (unlikely(idle_cpu(cpu)))
+               return 0;
+
+       /*
+        * We may have recently been in ticked or tickless idle mode. At the
+        * first busy tick after returning from idle, we will update the
+        * busy stats.
+        */
+       set_cpu_sd_state_busy();
+       if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
+               clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
+               cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
+               atomic_dec(&nohz.nr_cpus);
+       }
+
+       /*
+        * None are in tickless mode and hence no need for NOHZ idle load
+        * balancing.
+        */
+       if (likely(!atomic_read(&nohz.nr_cpus)))
+               return 0;
+
+       if (time_before(now, nohz.next_balance))
+               return 0;
+
+       if (rq->nr_running >= 2)
+               goto need_kick;
+
+       rcu_read_lock();
+       for_each_domain(cpu, sd) {
+               struct sched_group *sg = sd->groups;
+               struct sched_group_power *sgp = sg->sgp;
+               int nr_busy = atomic_read(&sgp->nr_busy_cpus);
+
+               if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
+                       goto need_kick_unlock;
+
+               if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
+                   && (cpumask_first_and(nohz.idle_cpus_mask,
+                                         sched_domain_span(sd)) < cpu))
+                       goto need_kick_unlock;
+
+               if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
+                       break;
+       }
+       rcu_read_unlock();
+       return 0;
+
+need_kick_unlock:
+       rcu_read_unlock();
+need_kick:
+       return 1;
+}
+#else
+static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
+#endif
+
+/*
+ * run_rebalance_domains is triggered when needed from the scheduler tick.
+ * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
+ */
+static void run_rebalance_domains(struct softirq_action *h)
+{
+       int this_cpu = smp_processor_id();
+       struct rq *this_rq = cpu_rq(this_cpu);
+       enum cpu_idle_type idle = this_rq->idle_balance ?
+                                               CPU_IDLE : CPU_NOT_IDLE;
+
+       rebalance_domains(this_cpu, idle);
+
+       /*
+        * If this cpu has a pending nohz_balance_kick, then do the
+        * balancing on behalf of the other idle cpus whose ticks are
+        * stopped.
+        */
+       nohz_idle_balance(this_cpu, idle);
+}
+
+static inline int on_null_domain(int cpu)
+{
+       return !rcu_dereference_sched(cpu_rq(cpu)->sd);
+}
+
+/*
+ * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
+ */
+void trigger_load_balance(struct rq *rq, int cpu)
+{
+       /* Don't need to rebalance while attached to NULL domain */
+       if (time_after_eq(jiffies, rq->next_balance) &&
+           likely(!on_null_domain(cpu)))
+               raise_softirq(SCHED_SOFTIRQ);
+#ifdef CONFIG_NO_HZ
+       if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
+               nohz_balancer_kick(cpu);
+#endif
+}
+
+static void rq_online_fair(struct rq *rq)
+{
+       update_sysctl();
+}
+
+static void rq_offline_fair(struct rq *rq)
+{
+       update_sysctl();
+}
+
+#endif /* CONFIG_SMP */
+
+/*
+ * scheduler tick hitting a task of our scheduling class:
+ */
+static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
+{
+       struct cfs_rq *cfs_rq;
+       struct sched_entity *se = &curr->se;
+
+       for_each_sched_entity(se) {
+               cfs_rq = cfs_rq_of(se);
+               entity_tick(cfs_rq, se, queued);
+       }
+}
+
+/*
+ * called on fork with the child task as argument from the parent's context
+ *  - child not yet on the tasklist
+ *  - preemption disabled
+ */
+static void task_fork_fair(struct task_struct *p)
+{
+       struct cfs_rq *cfs_rq;
+       struct sched_entity *se = &p->se, *curr;
+       int this_cpu = smp_processor_id();
+       struct rq *rq = this_rq();
+       unsigned long flags;
+
+       raw_spin_lock_irqsave(&rq->lock, flags);
+
+       update_rq_clock(rq);
+
+       cfs_rq = task_cfs_rq(current);
+       curr = cfs_rq->curr;
+
+       if (unlikely(task_cpu(p) != this_cpu)) {
+               rcu_read_lock();
+               __set_task_cpu(p, this_cpu);
+               rcu_read_unlock();
+       }
+
+       update_curr(cfs_rq);
+
+       if (curr)
+               se->vruntime = curr->vruntime;
+       place_entity(cfs_rq, se, 1);
+
+       if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
+               /*
+                * Upon rescheduling, sched_class::put_prev_task() will place
+                * 'current' within the tree based on its new key value.
+                */
+               swap(curr->vruntime, se->vruntime);
+               resched_task(rq->curr);
+       }
+
+       se->vruntime -= cfs_rq->min_vruntime;
+
+       raw_spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+/*
+ * Priority of the task has changed. Check to see if we preempt
+ * the current task.
+ */
+static void
+prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
+{
+       if (!p->se.on_rq)
+               return;
+
+       /*
+        * Reschedule if we are currently running on this runqueue and
+        * our priority decreased, or if we are not currently running on
+        * this runqueue and our priority is higher than the current's
+        */
+       if (rq->curr == p) {
+               if (p->prio > oldprio)
+                       resched_task(rq->curr);
+       } else
+               check_preempt_curr(rq, p, 0);
+}
+
+static void switched_from_fair(struct rq *rq, struct task_struct *p)
+{
+       struct sched_entity *se = &p->se;
+       struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+       /*
+        * Ensure the task's vruntime is normalized, so that when it's
+        * switched back to the fair class the enqueue_entity(.flags=0) will
+        * do the right thing.
+        *
+        * If it was on_rq, then the dequeue_entity(.flags=0) will already
+        * have normalized the vruntime, if it was !on_rq, then only when
+        * the task is sleeping will it still have non-normalized vruntime.
+        */
+       if (!se->on_rq && p->state != TASK_RUNNING) {
+               /*
+                * Fix up our vruntime so that the current sleep doesn't
+                * cause 'unlimited' sleep bonus.
+                */
+               place_entity(cfs_rq, se, 0);
+               se->vruntime -= cfs_rq->min_vruntime;
+       }
+}
+
+/*
+ * We switched to the sched_fair class.
+ */
+static void switched_to_fair(struct rq *rq, struct task_struct *p)
+{
+       if (!p->se.on_rq)
+               return;
+
+       /*
+        * We were most likely switched from sched_rt, so
+        * kick off the schedule if running, otherwise just see
+        * if we can still preempt the current task.
+        */
+       if (rq->curr == p)
+               resched_task(rq->curr);
+       else
+               check_preempt_curr(rq, p, 0);
+}
+
+/* Account for a task changing its policy or group.
+ *
+ * This routine is mostly called to set cfs_rq->curr field when a task
+ * migrates between groups/classes.
+ */
+static void set_curr_task_fair(struct rq *rq)
+{
+       struct sched_entity *se = &rq->curr->se;
+
+       for_each_sched_entity(se) {
+               struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+               set_next_entity(cfs_rq, se);
+               /* ensure bandwidth has been allocated on our new cfs_rq */
+               account_cfs_rq_runtime(cfs_rq, 0);
+       }
+}
+
+void init_cfs_rq(struct cfs_rq *cfs_rq)
+{
+       cfs_rq->tasks_timeline = RB_ROOT;
+       INIT_LIST_HEAD(&cfs_rq->tasks);
+       cfs_rq->min_vruntime = (u64)(-(1LL << 20));
+#ifndef CONFIG_64BIT
+       cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
+#endif
+}
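The initial min_vruntime of (u64)(-(1LL << 20)) starts the virtual clock just below the 64-bit wrap point; the commonly cited rationale is that a wraparound bug then shows up minutes after boot rather than months later. The sketch below (user-space C, not part of this diff) shows why ordinary ordering keeps working across the wrap: comparisons are done on the signed difference, the same idiom the fair class uses for vruntime.

#include <stdio.h>
#include <stdint.h>

/* Ordering on the signed difference is well defined across u64 wraparound. */
static int before(uint64_t a, uint64_t b)
{
        return (int64_t)(a - b) < 0;
}

int main(void)
{
        uint64_t start = (uint64_t)(-(1LL << 20));  /* just below the wrap point */
        uint64_t later = start + (2 << 20);         /* has wrapped past zero */

        printf("start=%llu later=%llu before(start, later)=%d\n",
               (unsigned long long)start, (unsigned long long)later,
               before(start, later));               /* before(start, later)=1 */
        return 0;
}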
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static void task_move_group_fair(struct task_struct *p, int on_rq)
+{
+       /*
+        * If the task was not on the rq at the time of this cgroup movement
+        * it must have been asleep; sleeping tasks keep their ->vruntime
+        * absolute on their old rq until wakeup (needed for the fair sleeper
+        * bonus in place_entity()).
+        *
+        * If it was on the rq, we've just 'preempted' it, which does convert
+        * ->vruntime to a relative base.
+        *
+        * Make sure both cases convert their relative position when migrating
+        * to another cgroup's rq. This does somewhat interfere with the
+        * fair sleeper stuff for the first placement, but who cares.
+        */
+       /*
+        * When !on_rq, vruntime of the task has usually NOT been normalized.
+        * But there are some cases where it has already been normalized:
+        *
+        * - Moving a forked child which is waiting to be woken up by
+        *   wake_up_new_task().
+        * - Moving a task which has been woken up by try_to_wake_up() and
+        *   is waiting to actually be woken up by sched_ttwu_pending().
+        *
+        * To prevent boost or penalty in the new cfs_rq caused by delta
+        * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
+        */
+       if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING))
+               on_rq = 1;
+
+       if (!on_rq)
+               p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
+       set_task_rq(p, task_cpu(p));
+       if (!on_rq)
+               p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
+}
+
+void free_fair_sched_group(struct task_group *tg)
+{
+       int i;
+
+       destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
+
+       for_each_possible_cpu(i) {
+               if (tg->cfs_rq)
+                       kfree(tg->cfs_rq[i]);
+               if (tg->se)
+                       kfree(tg->se[i]);
+       }
+
+       kfree(tg->cfs_rq);
+       kfree(tg->se);
+}
+
+int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
+{
+       struct cfs_rq *cfs_rq;
+       struct sched_entity *se;
+       int i;
+
+       tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
+       if (!tg->cfs_rq)
+               goto err;
+       tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
+       if (!tg->se)
+               goto err;
+
+       tg->shares = NICE_0_LOAD;
+
+       init_cfs_bandwidth(tg_cfs_bandwidth(tg));
+
+       for_each_possible_cpu(i) {
+               cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
+                                     GFP_KERNEL, cpu_to_node(i));
+               if (!cfs_rq)
+                       goto err;
+
+               se = kzalloc_node(sizeof(struct sched_entity),
+                                 GFP_KERNEL, cpu_to_node(i));
+               if (!se)
+                       goto err_free_rq;
+
+               init_cfs_rq(cfs_rq);
+               init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
+       }
+
+       return 1;
+
+err_free_rq:
+       kfree(cfs_rq);
+err:
+       return 0;
+}
+
+void unregister_fair_sched_group(struct task_group *tg, int cpu)
+{
+       struct rq *rq = cpu_rq(cpu);
+       unsigned long flags;
+
+       /*
+        * Only empty task groups can be destroyed; so we can speculatively
+        * check on_list without danger of it being re-added.
+        */
+       if (!tg->cfs_rq[cpu]->on_list)
+               return;
+
+       raw_spin_lock_irqsave(&rq->lock, flags);
+       list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
+       raw_spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
+                       struct sched_entity *se, int cpu,
+                       struct sched_entity *parent)
+{
+       struct rq *rq = cpu_rq(cpu);
+
+       cfs_rq->tg = tg;
+       cfs_rq->rq = rq;
+#ifdef CONFIG_SMP
+       /* allow initial update_cfs_load() to truncate */
+       cfs_rq->load_stamp = 1;
+#endif
+       init_cfs_rq_runtime(cfs_rq);
+
+       tg->cfs_rq[cpu] = cfs_rq;
+       tg->se[cpu] = se;
+
+       /* se could be NULL for root_task_group */
+       if (!se)
+               return;
+
+       if (!parent)
+               se->cfs_rq = &rq->cfs;
+       else
+               se->cfs_rq = parent->my_q;
+
+       se->my_q = cfs_rq;
+       update_load_set(&se->load, 0);
+       se->parent = parent;
+}
+
+static DEFINE_MUTEX(shares_mutex);
+
+int sched_group_set_shares(struct task_group *tg, unsigned long shares)
+{
+       int i;
+       unsigned long flags;
+
+       /*
+        * We can't change the weight of the root cgroup.
+        */
+       if (!tg->se[0])
+               return -EINVAL;
+
+       shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
+
+       mutex_lock(&shares_mutex);
+       if (tg->shares == shares)
+               goto done;
+
+       tg->shares = shares;
+       for_each_possible_cpu(i) {
+               struct rq *rq = cpu_rq(i);
+               struct sched_entity *se;
+
+               se = tg->se[i];
+               /* Propagate contribution to hierarchy */
+               raw_spin_lock_irqsave(&rq->lock, flags);
+               for_each_sched_entity(se)
+                       update_cfs_shares(group_cfs_rq(se));
+               raw_spin_unlock_irqrestore(&rq->lock, flags);
+       }
+
+done:
+       mutex_unlock(&shares_mutex);
+       return 0;
+}
+#else /* CONFIG_FAIR_GROUP_SCHED */
+
+void free_fair_sched_group(struct task_group *tg) { }
+
+int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
+{
+       return 1;
+}
+
+void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
+
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
+
+static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
+{
+       struct sched_entity *se = &task->se;
+       unsigned int rr_interval = 0;
+
+       /*
+        * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
+        * idle runqueue:
+        */
+       if (rq->cfs.load.weight)
+               rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
+
+       return rr_interval;
+}
+
+/*
+ * All the scheduling class methods:
+ */
+const struct sched_class fair_sched_class = {
+       .next                   = &idle_sched_class,
+       .enqueue_task           = enqueue_task_fair,
+       .dequeue_task           = dequeue_task_fair,
+       .yield_task             = yield_task_fair,
+       .yield_to_task          = yield_to_task_fair,
+
+       .check_preempt_curr     = check_preempt_wakeup,
+
+       .pick_next_task         = pick_next_task_fair,
+       .put_prev_task          = put_prev_task_fair,
+
+#ifdef CONFIG_SMP
+       .select_task_rq         = select_task_rq_fair,
+
+       .rq_online              = rq_online_fair,
+       .rq_offline             = rq_offline_fair,
+
+       .task_waking            = task_waking_fair,
+#endif
+
+       .set_curr_task          = set_curr_task_fair,
+       .task_tick              = task_tick_fair,
+       .task_fork              = task_fork_fair,
+
+       .prio_changed           = prio_changed_fair,
+       .switched_from          = switched_from_fair,
+       .switched_to            = switched_to_fair,
+
+       .get_rr_interval        = get_rr_interval_fair,
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+       .task_move_group        = task_move_group_fair,
+#endif
+};
+
+#ifdef CONFIG_SCHED_DEBUG
+void print_cfs_stats(struct seq_file *m, int cpu)
+{
+       struct cfs_rq *cfs_rq;
+
+       rcu_read_lock();
+       for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
+               print_cfs_rq(m, cpu, cfs_rq);
+       rcu_read_unlock();
+}
+#endif
+
+__init void init_sched_fair_class(void)
+{
+#ifdef CONFIG_SMP
+       open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
+
+#ifdef CONFIG_NO_HZ
+       zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
+#endif
+#endif /* SMP */
+
+}
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
new file mode 100644 (file)
index 0000000..e61fd73
--- /dev/null
@@ -0,0 +1,70 @@
+/*
+ * Only give sleepers 50% of their service deficit. This allows
+ * them to run sooner, but does not allow tons of sleepers to
+ * rip the spread apart.
+ */
+SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
+
+/*
+ * Place new tasks ahead so that they do not starve already running
+ * tasks
+ */
+SCHED_FEAT(START_DEBIT, true)
+
+/*
+ * Based on load and program behaviour, see if it makes sense to place
+ * a newly woken task on the same cpu as the task that woke it --
+ * improve cache locality. Typically used with SYNC wakeups as
+ * generated by pipes and the like, see also SYNC_WAKEUPS.
+ */
+SCHED_FEAT(AFFINE_WAKEUPS, true)
+
+/*
+ * Prefer to schedule the task we woke last (assuming it failed
+ * wakeup-preemption), since it's likely going to consume data we
+ * touched; increases cache locality.
+ */
+SCHED_FEAT(NEXT_BUDDY, false)
+
+/*
+ * Prefer to schedule the task that ran last (when we did
+ * wake-preempt) as that will likely touch the same data; increases
+ * cache locality.
+ */
+SCHED_FEAT(LAST_BUDDY, true)
+
+/*
+ * Consider buddies to be cache hot; decreases the likelihood of a
+ * cache buddy being migrated away, increases cache locality.
+ */
+SCHED_FEAT(CACHE_HOT_BUDDY, true)
+
+/*
+ * Use arch dependent cpu power functions
+ */
+SCHED_FEAT(ARCH_POWER, false)
+
+SCHED_FEAT(HRTICK, false)
+SCHED_FEAT(DOUBLE_TICK, false)
+SCHED_FEAT(LB_BIAS, true)
+
+/*
+ * Spin-wait on mutex acquisition when the mutex owner is running on
+ * another cpu -- assumes that when the owner is running, it will soon
+ * release the lock. Decreases scheduling overhead.
+ */
+SCHED_FEAT(OWNER_SPIN, true)
+
+/*
+ * Decrement CPU power based on time not spent running tasks
+ */
+SCHED_FEAT(NONTASK_POWER, true)
+
+/*
+ * Queue remote wakeups on the target CPU and process them
+ * using the scheduler IPI. Reduces rq->lock contention/bounces.
+ */
+SCHED_FEAT(TTWU_QUEUE, true)
+
+SCHED_FEAT(FORCE_SD_OVERLAP, false)
+SCHED_FEAT(RT_RUNTIME_SHARE, true)
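Each SCHED_FEAT(name, default) line above is an X-macro entry: the same list gets expanded several times with different definitions of the macro to build an enum, a default table and a name table. The sketch below shows the general pattern in standalone user-space C; it is only an illustration of the technique, not the kernel's actual SCHED_FEAT plumbing (which lives in kernel/sched/sched.h and core.c and differs in its details).

#include <stdio.h>
#include <stdbool.h>

/* A stand-in feature list; the real one is the header above. */
#define MY_FEATURES(X)                 \
        X(GENTLE_FAIR_SLEEPERS, true)  \
        X(START_DEBIT, true)           \
        X(NEXT_BUDDY, false)

/* First expansion: an enum of feature indices. */
#define AS_ENUM(name, enabled) FEAT_##name,
enum { MY_FEATURES(AS_ENUM) FEAT_NR };

/* Second expansion: the default on/off table. */
#define AS_DEFAULT(name, enabled) [FEAT_##name] = enabled,
static const bool feat_default[FEAT_NR] = { MY_FEATURES(AS_DEFAULT) };

/* Third expansion: the name table, handy for a debugfs-style dump. */
#define AS_NAME(name, enabled) [FEAT_##name] = #name,
static const char *feat_name[FEAT_NR] = { MY_FEATURES(AS_NAME) };

int main(void)
{
        for (int i = 0; i < FEAT_NR; i++)
                printf("%-22s %s\n", feat_name[i], feat_default[i] ? "on" : "off");
        return 0;
}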
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
new file mode 100644 (file)
index 0000000..91b4c95
--- /dev/null
@@ -0,0 +1,99 @@
+#include "sched.h"
+
+/*
+ * idle-task scheduling class.
+ *
+ * (NOTE: these are not related to SCHED_IDLE tasks which are
+ *  handled in fair.c)
+ */
+
+#ifdef CONFIG_SMP
+static int
+select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
+{
+       return task_cpu(p); /* IDLE tasks are never migrated */
+}
+#endif /* CONFIG_SMP */
+/*
+ * Idle tasks are unconditionally rescheduled:
+ */
+static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
+{
+       resched_task(rq->idle);
+}
+
+static struct task_struct *pick_next_task_idle(struct rq *rq)
+{
+       schedstat_inc(rq, sched_goidle);
+       calc_load_account_idle(rq);
+       return rq->idle;
+}
+
+/*
+ * It is not legal to sleep in the idle task - print a warning
+ * message if some code attempts to do it:
+ */
+static void
+dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
+{
+       raw_spin_unlock_irq(&rq->lock);
+       printk(KERN_ERR "bad: scheduling from the idle thread!\n");
+       dump_stack();
+       raw_spin_lock_irq(&rq->lock);
+}
+
+static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
+{
+}
+
+static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
+{
+}
+
+static void set_curr_task_idle(struct rq *rq)
+{
+}
+
+static void switched_to_idle(struct rq *rq, struct task_struct *p)
+{
+       BUG();
+}
+
+static void
+prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
+{
+       BUG();
+}
+
+static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
+{
+       return 0;
+}
+
+/*
+ * Simple, special scheduling class for the per-CPU idle tasks:
+ */
+const struct sched_class idle_sched_class = {
+       /* .next is NULL */
+       /* no enqueue/yield_task for idle tasks */
+
+       /* dequeue is not valid, we print a debug message there: */
+       .dequeue_task           = dequeue_task_idle,
+
+       .check_preempt_curr     = check_preempt_curr_idle,
+
+       .pick_next_task         = pick_next_task_idle,
+       .put_prev_task          = put_prev_task_idle,
+
+#ifdef CONFIG_SMP
+       .select_task_rq         = select_task_rq_idle,
+#endif
+
+       .set_curr_task          = set_curr_task_idle,
+       .task_tick              = task_tick_idle,
+
+       .get_rr_interval        = get_rr_interval_idle,
+
+       .prio_changed           = prio_changed_idle,
+       .switched_to            = switched_to_idle,
+};
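Each scheduling class, like idle_sched_class above, is just a table of function pointers; the core picks the next task by consulting the classes in priority order and taking the first one that returns something. The toy program below models that dispatch with invented demo_* names; it illustrates the pattern only and is not the kernel's actual pick loop.

/* class_dispatch_demo.c - toy model of sched_class method-table dispatch */
#include <stdio.h>
#include <stddef.h>

struct demo_rq {
        int nr_rt_running;
        int nr_fair_running;
};

struct demo_class {
        const char *name;
        /* returns a task name, or NULL if this class has nothing to run */
        const char *(*pick_next)(struct demo_rq *rq);
};

static const char *pick_next_rt(struct demo_rq *rq)
{
        return rq->nr_rt_running ? "rt_task" : NULL;
}

static const char *pick_next_fair(struct demo_rq *rq)
{
        return rq->nr_fair_running ? "fair_task" : NULL;
}

static const char *pick_next_idle(struct demo_rq *rq)
{
        (void)rq;
        return "idle_task";     /* the idle class always has something */
}

/* Highest-priority class first; idle is the fallback of last resort. */
static const struct demo_class classes[] = {
        { "rt",   pick_next_rt   },
        { "fair", pick_next_fair },
        { "idle", pick_next_idle },
};

int main(void)
{
        struct demo_rq rq = { .nr_rt_running = 0, .nr_fair_running = 1 };
        size_t i;

        for (i = 0; i < sizeof(classes) / sizeof(classes[0]); i++) {
                const char *p = classes[i].pick_next(&rq);

                if (p) {
                        printf("%s class picked %s\n", classes[i].name, p);
                        break;
                }
        }
        return 0;
}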
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
new file mode 100644 (file)
index 0000000..3640ebb
--- /dev/null
@@ -0,0 +1,2048 @@
+/*
+ * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
+ * policies)
+ */
+
+#include "sched.h"
+
+#include <linux/slab.h>
+
+static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
+
+struct rt_bandwidth def_rt_bandwidth;
+
+static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
+{
+       struct rt_bandwidth *rt_b =
+               container_of(timer, struct rt_bandwidth, rt_period_timer);
+       ktime_t now;
+       int overrun;
+       int idle = 0;
+
+       for (;;) {
+               now = hrtimer_cb_get_time(timer);
+               overrun = hrtimer_forward(timer, now, rt_b->rt_period);
+
+               if (!overrun)
+                       break;
+
+               idle = do_sched_rt_period_timer(rt_b, overrun);
+       }
+
+       return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
+}
+
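sched_rt_period_timer() above leans on hrtimer_forward(), which advances the timer's expiry by whole periods past "now" and returns how many periods were skipped, so the bandwidth refresh is applied once per elapsed period even when the timer is serviced late. A small standalone model of that forwarding arithmetic (plain integers stand in for ktime_t; the values are arbitrary):

/* timer_forward_demo.c - model of forwarding a periodic expiry past 'now' */
#include <stdio.h>

/* Advance *expiry by whole periods until it lies beyond 'now';
 * return the number of periods skipped (the "overrun"). */
static unsigned long forward_periods(unsigned long long *expiry,
                                     unsigned long long now,
                                     unsigned long long period)
{
        unsigned long overrun = 0;

        while (*expiry <= now) {
                *expiry += period;
                overrun++;
        }
        return overrun;
}

int main(void)
{
        unsigned long long expiry = 1000, period = 1000;
        unsigned long long now = 3500;  /* timer serviced 2.5 periods late */
        unsigned long overrun = forward_periods(&expiry, now, period);

        /* three whole periods elapsed; the next expiry is past 'now' */
        printf("overrun=%lu next_expiry=%llu\n", overrun, expiry);
        return 0;
}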
+void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
+{
+       rt_b->rt_period = ns_to_ktime(period);
+       rt_b->rt_runtime = runtime;
+
+       raw_spin_lock_init(&rt_b->rt_runtime_lock);
+
+       hrtimer_init(&rt_b->rt_period_timer,
+                       CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+       rt_b->rt_period_timer.function = sched_rt_period_timer;
+}
+
+static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
+{
+       if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
+               return;
+
+       if (hrtimer_active(&rt_b->rt_period_timer))
+               return;
+
+       raw_spin_lock(&rt_b->rt_runtime_lock);
+       start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
+       raw_spin_unlock(&rt_b->rt_runtime_lock);
+}
+
+void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
+{
+       struct rt_prio_array *array;
+       int i;
+
+       array = &rt_rq->active;
+       for (i = 0; i < MAX_RT_PRIO; i++) {
+               INIT_LIST_HEAD(array->queue + i);
+               __clear_bit(i, array->bitmap);
+       }
+       /* delimiter for bitsearch: */
+       __set_bit(MAX_RT_PRIO, array->bitmap);
+
+#if defined CONFIG_SMP
+       rt_rq->highest_prio.curr = MAX_RT_PRIO;
+       rt_rq->highest_prio.next = MAX_RT_PRIO;
+       rt_rq->rt_nr_migratory = 0;
+       rt_rq->overloaded = 0;
+       plist_head_init(&rt_rq->pushable_tasks);
+#endif
+
+       rt_rq->rt_time = 0;
+       rt_rq->rt_throttled = 0;
+       rt_rq->rt_runtime = 0;
+       raw_spin_lock_init(&rt_rq->rt_runtime_lock);
+}
+
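The __set_bit(MAX_RT_PRIO, ...) in init_rt_rq() is purely a sentinel: with one bit guaranteed set just past the last real priority, a find-first-bit scan over the bitmap always terminates at a defined index even when every queue is empty. A userspace model of that trick (DEMO_MAX_RT_PRIO and the helpers are invented stand-ins for the kernel's bitmap API):

/* prio_bitmap_demo.c - model of the priority bitmap with a search sentinel */
#include <stdio.h>
#include <string.h>

#define DEMO_MAX_RT_PRIO 100
#define BITS_PER_LONG (8 * (int)sizeof(unsigned long))
#define NWORDS ((DEMO_MAX_RT_PRIO + 1 + BITS_PER_LONG - 1) / BITS_PER_LONG)

static unsigned long bitmap[NWORDS];

static void set_bit_(int nr)   { bitmap[nr / BITS_PER_LONG] |=  1UL << (nr % BITS_PER_LONG); }
static void clear_bit_(int nr) { bitmap[nr / BITS_PER_LONG] &= ~(1UL << (nr % BITS_PER_LONG)); }

static int find_first_bit_(void)
{
        int nr;

        for (nr = 0; nr <= DEMO_MAX_RT_PRIO; nr++)
                if (bitmap[nr / BITS_PER_LONG] & (1UL << (nr % BITS_PER_LONG)))
                        return nr;
        return DEMO_MAX_RT_PRIO;        /* unreachable thanks to the sentinel */
}

int main(void)
{
        memset(bitmap, 0, sizeof(bitmap));
        set_bit_(DEMO_MAX_RT_PRIO);     /* the delimiter for bitsearch */

        printf("empty rq -> first bit %d (== MAX_RT_PRIO)\n", find_first_bit_());

        set_bit_(42);                   /* a task enqueued at prio 42 */
        printf("first bit now %d\n", find_first_bit_());

        clear_bit_(42);
        printf("back to %d\n", find_first_bit_());
        return 0;
}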
+#ifdef CONFIG_RT_GROUP_SCHED
+static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
+{
+       hrtimer_cancel(&rt_b->rt_period_timer);
+}
+
+#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
+
+static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
+{
+#ifdef CONFIG_SCHED_DEBUG
+       WARN_ON_ONCE(!rt_entity_is_task(rt_se));
+#endif
+       return container_of(rt_se, struct task_struct, rt);
+}
+
+static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
+{
+       return rt_rq->rq;
+}
+
+static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
+{
+       return rt_se->rt_rq;
+}
+
+void free_rt_sched_group(struct task_group *tg)
+{
+       int i;
+
+       if (tg->rt_se)
+               destroy_rt_bandwidth(&tg->rt_bandwidth);
+
+       for_each_possible_cpu(i) {
+               if (tg->rt_rq)
+                       kfree(tg->rt_rq[i]);
+               if (tg->rt_se)
+                       kfree(tg->rt_se[i]);
+       }
+
+       kfree(tg->rt_rq);
+       kfree(tg->rt_se);
+}
+
+void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
+               struct sched_rt_entity *rt_se, int cpu,
+               struct sched_rt_entity *parent)
+{
+       struct rq *rq = cpu_rq(cpu);
+
+       rt_rq->highest_prio.curr = MAX_RT_PRIO;
+       rt_rq->rt_nr_boosted = 0;
+       rt_rq->rq = rq;
+       rt_rq->tg = tg;
+
+       tg->rt_rq[cpu] = rt_rq;
+       tg->rt_se[cpu] = rt_se;
+
+       if (!rt_se)
+               return;
+
+       if (!parent)
+               rt_se->rt_rq = &rq->rt;
+       else
+               rt_se->rt_rq = parent->my_q;
+
+       rt_se->my_q = rt_rq;
+       rt_se->parent = parent;
+       INIT_LIST_HEAD(&rt_se->run_list);
+}
+
+int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
+{
+       struct rt_rq *rt_rq;
+       struct sched_rt_entity *rt_se;
+       int i;
+
+       tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
+       if (!tg->rt_rq)
+               goto err;
+       tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
+       if (!tg->rt_se)
+               goto err;
+
+       init_rt_bandwidth(&tg->rt_bandwidth,
+                       ktime_to_ns(def_rt_bandwidth.rt_period), 0);
+
+       for_each_possible_cpu(i) {
+               rt_rq = kzalloc_node(sizeof(struct rt_rq),
+                                    GFP_KERNEL, cpu_to_node(i));
+               if (!rt_rq)
+                       goto err;
+
+               rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
+                                    GFP_KERNEL, cpu_to_node(i));
+               if (!rt_se)
+                       goto err_free_rq;
+
+               init_rt_rq(rt_rq, cpu_rq(i));
+               rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
+               init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
+       }
+
+       return 1;
+
+err_free_rq:
+       kfree(rt_rq);
+err:
+       return 0;
+}
+
+#else /* CONFIG_RT_GROUP_SCHED */
+
+#define rt_entity_is_task(rt_se) (1)
+
+static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
+{
+       return container_of(rt_se, struct task_struct, rt);
+}
+
+static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
+{
+       return container_of(rt_rq, struct rq, rt);
+}
+
+static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
+{
+       struct task_struct *p = rt_task_of(rt_se);
+       struct rq *rq = task_rq(p);
+
+       return &rq->rt;
+}
+
+void free_rt_sched_group(struct task_group *tg) { }
+
+int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
+{
+       return 1;
+}
+#endif /* CONFIG_RT_GROUP_SCHED */
+
+#ifdef CONFIG_SMP
+
+static inline int rt_overloaded(struct rq *rq)
+{
+       return atomic_read(&rq->rd->rto_count);
+}
+
+static inline void rt_set_overload(struct rq *rq)
+{
+       if (!rq->online)
+               return;
+
+       cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
+       /*
+        * Make sure the mask is visible before we set
+        * the overload count. That is checked to determine
+        * if we should look at the mask. It would be a shame
+        * if we looked at the mask, but the mask was not
+        * updated yet.
+        */
+       wmb();
+       atomic_inc(&rq->rd->rto_count);
+}
+
+static inline void rt_clear_overload(struct rq *rq)
+{
+       if (!rq->online)
+               return;
+
+       /* the order here really doesn't matter */
+       atomic_dec(&rq->rd->rto_count);
+       cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
+}
+
+static void update_rt_migration(struct rt_rq *rt_rq)
+{
+       if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
+               if (!rt_rq->overloaded) {
+                       rt_set_overload(rq_of_rt_rq(rt_rq));
+                       rt_rq->overloaded = 1;
+               }
+       } else if (rt_rq->overloaded) {
+               rt_clear_overload(rq_of_rt_rq(rt_rq));
+               rt_rq->overloaded = 0;
+       }
+}
+
+static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+       if (!rt_entity_is_task(rt_se))
+               return;
+
+       rt_rq = &rq_of_rt_rq(rt_rq)->rt;
+
+       rt_rq->rt_nr_total++;
+       if (rt_se->nr_cpus_allowed > 1)
+               rt_rq->rt_nr_migratory++;
+
+       update_rt_migration(rt_rq);
+}
+
+static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+       if (!rt_entity_is_task(rt_se))
+               return;
+
+       rt_rq = &rq_of_rt_rq(rt_rq)->rt;
+
+       rt_rq->rt_nr_total--;
+       if (rt_se->nr_cpus_allowed > 1)
+               rt_rq->rt_nr_migratory--;
+
+       update_rt_migration(rt_rq);
+}
+
+static inline int has_pushable_tasks(struct rq *rq)
+{
+       return !plist_head_empty(&rq->rt.pushable_tasks);
+}
+
+static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
+{
+       plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
+       plist_node_init(&p->pushable_tasks, p->prio);
+       plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
+
+       /* Update the highest prio pushable task */
+       if (p->prio < rq->rt.highest_prio.next)
+               rq->rt.highest_prio.next = p->prio;
+}
+
+static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
+{
+       plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
+
+       /* Update the new highest prio pushable task */
+       if (has_pushable_tasks(rq)) {
+               p = plist_first_entry(&rq->rt.pushable_tasks,
+                                     struct task_struct, pushable_tasks);
+               rq->rt.highest_prio.next = p->prio;
+       } else
+               rq->rt.highest_prio.next = MAX_RT_PRIO;
+}
+
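enqueue_pushable_task()/dequeue_pushable_task() keep the push candidates in a priority-sorted plist and cache the best remaining priority in highest_prio.next, so other CPUs' pull path can compare against it cheaply. The sketch below models only that bookkeeping with a flat array (all names invented; the kernel uses a plist, not an array, and lower numbers mean higher priority):

/* pushable_prio_demo.c - toy model of caching the best pushable priority */
#include <stdio.h>

#define DEMO_MAX_PRIO 100       /* sentinel meaning "no pushable task" */

static int prios[16];           /* priorities of currently pushable tasks */
static int nr_pushable;
static int highest_next = DEMO_MAX_PRIO;

static void recompute_highest(void)
{
        int i, best = DEMO_MAX_PRIO;

        for (i = 0; i < nr_pushable; i++)
                if (prios[i] < best)
                        best = prios[i];
        highest_next = best;
}

static void enqueue_pushable(int prio)
{
        prios[nr_pushable++] = prio;
        if (prio < highest_next)        /* cheap update on insert */
                highest_next = prio;
}

static void dequeue_pushable(int idx)
{
        prios[idx] = prios[--nr_pushable];
        recompute_highest();            /* removal may change the cached best */
}

int main(void)
{
        enqueue_pushable(30);
        enqueue_pushable(10);
        enqueue_pushable(20);
        printf("best pushable prio = %d\n", highest_next);     /* 10 */

        dequeue_pushable(1);            /* remove the prio-10 task */
        printf("best pushable prio = %d\n", highest_next);     /* 20 */
        return 0;
}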
+#else
+
+static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
+{
+}
+
+static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
+{
+}
+
+static inline
+void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+}
+
+static inline
+void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+}
+
+#endif /* CONFIG_SMP */
+
+static inline int on_rt_rq(struct sched_rt_entity *rt_se)
+{
+       return !list_empty(&rt_se->run_list);
+}
+
+#ifdef CONFIG_RT_GROUP_SCHED
+
+static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
+{
+       if (!rt_rq->tg)
+               return RUNTIME_INF;
+
+       return rt_rq->rt_runtime;
+}
+
+static inline u64 sched_rt_period(struct rt_rq *rt_rq)
+{
+       return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
+}
+
+typedef struct task_group *rt_rq_iter_t;
+
+static inline struct task_group *next_task_group(struct task_group *tg)
+{
+       do {
+               tg = list_entry_rcu(tg->list.next,
+                       typeof(struct task_group), list);
+       } while (&tg->list != &task_groups && task_group_is_autogroup(tg));
+
+       if (&tg->list == &task_groups)
+               tg = NULL;
+
+       return tg;
+}
+
+#define for_each_rt_rq(rt_rq, iter, rq)                                        \
+       for (iter = container_of(&task_groups, typeof(*iter), list);    \
+               (iter = next_task_group(iter)) &&                       \
+               (rt_rq = iter->rt_rq[cpu_of(rq)]);)
+
+static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
+{
+       list_add_rcu(&rt_rq->leaf_rt_rq_list,
+                       &rq_of_rt_rq(rt_rq)->leaf_rt_rq_list);
+}
+
+static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
+{
+       list_del_rcu(&rt_rq->leaf_rt_rq_list);
+}
+
+#define for_each_leaf_rt_rq(rt_rq, rq) \
+       list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
+
+#define for_each_sched_rt_entity(rt_se) \
+       for (; rt_se; rt_se = rt_se->parent)
+
+static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
+{
+       return rt_se->my_q;
+}
+
+static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head);
+static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
+
+static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
+{
+       struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
+       struct sched_rt_entity *rt_se;
+
+       int cpu = cpu_of(rq_of_rt_rq(rt_rq));
+
+       rt_se = rt_rq->tg->rt_se[cpu];
+
+       if (rt_rq->rt_nr_running) {
+               if (rt_se && !on_rt_rq(rt_se))
+                       enqueue_rt_entity(rt_se, false);
+               if (rt_rq->highest_prio.curr < curr->prio)
+                       resched_task(curr);
+       }
+}
+
+static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
+{
+       struct sched_rt_entity *rt_se;
+       int cpu = cpu_of(rq_of_rt_rq(rt_rq));
+
+       rt_se = rt_rq->tg->rt_se[cpu];
+
+       if (rt_se && on_rt_rq(rt_se))
+               dequeue_rt_entity(rt_se);
+}
+
+static inline int rt_rq_throttled(struct rt_rq *rt_rq)
+{
+       return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
+}
+
+static int rt_se_boosted(struct sched_rt_entity *rt_se)
+{
+       struct rt_rq *rt_rq = group_rt_rq(rt_se);
+       struct task_struct *p;
+
+       if (rt_rq)
+               return !!rt_rq->rt_nr_boosted;
+
+       p = rt_task_of(rt_se);
+       return p->prio != p->normal_prio;
+}
+
+#ifdef CONFIG_SMP
+static inline const struct cpumask *sched_rt_period_mask(void)
+{
+       return cpu_rq(smp_processor_id())->rd->span;
+}
+#else
+static inline const struct cpumask *sched_rt_period_mask(void)
+{
+       return cpu_online_mask;
+}
+#endif
+
+static inline
+struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
+{
+       return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
+}
+
+static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
+{
+       return &rt_rq->tg->rt_bandwidth;
+}
+
+#else /* !CONFIG_RT_GROUP_SCHED */
+
+static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
+{
+       return rt_rq->rt_runtime;
+}
+
+static inline u64 sched_rt_period(struct rt_rq *rt_rq)
+{
+       return ktime_to_ns(def_rt_bandwidth.rt_period);
+}
+
+typedef struct rt_rq *rt_rq_iter_t;
+
+#define for_each_rt_rq(rt_rq, iter, rq) \
+       for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
+
+static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
+{
+}
+
+static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
+{
+}
+
+#define for_each_leaf_rt_rq(rt_rq, rq) \
+       for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
+
+#define for_each_sched_rt_entity(rt_se) \
+       for (; rt_se; rt_se = NULL)
+
+static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
+{
+       return NULL;
+}
+
+static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
+{
+       if (rt_rq->rt_nr_running)
+               resched_task(rq_of_rt_rq(rt_rq)->curr);
+}
+
+static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
+{
+}
+
+static inline int rt_rq_throttled(struct rt_rq *rt_rq)
+{
+       return rt_rq->rt_throttled;
+}
+
+static inline const struct cpumask *sched_rt_period_mask(void)
+{
+       return cpu_online_mask;
+}
+
+static inline
+struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
+{
+       return &cpu_rq(cpu)->rt;
+}
+
+static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
+{
+       return &def_rt_bandwidth;
+}
+
+#endif /* CONFIG_RT_GROUP_SCHED */
+
+#ifdef CONFIG_SMP
+/*
+ * We ran out of runtime, see if we can borrow some from our neighbours.
+ */
+static int do_balance_runtime(struct rt_rq *rt_rq)
+{
+       struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
+       struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
+       int i, weight, more = 0;
+       u64 rt_period;
+
+       weight = cpumask_weight(rd->span);
+
+       raw_spin_lock(&rt_b->rt_runtime_lock);
+       rt_period = ktime_to_ns(rt_b->rt_period);
+       for_each_cpu(i, rd->span) {
+               struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
+               s64 diff;
+
+               if (iter == rt_rq)
+                       continue;
+
+               raw_spin_lock(&iter->rt_runtime_lock);
+               /*
+                * Either all rqs have inf runtime and there's nothing to steal
+                * or __disable_runtime() below sets a specific rq to inf to
+                * indicate it's been disabled and disallow stealing.
+                */
+               if (iter->rt_runtime == RUNTIME_INF)
+                       goto next;
+
+               /*
+                * From runqueues with spare time, take 1/n part of their
+                * spare time, but no more than our period.
+                */
+               diff = iter->rt_runtime - iter->rt_time;
+               if (diff > 0) {
+                       diff = div_u64((u64)diff, weight);
+                       if (rt_rq->rt_runtime + diff > rt_period)
+                               diff = rt_period - rt_rq->rt_runtime;
+                       iter->rt_runtime -= diff;
+                       rt_rq->rt_runtime += diff;
+                       more = 1;
+                       if (rt_rq->rt_runtime == rt_period) {
+                               raw_spin_unlock(&iter->rt_runtime_lock);
+                               break;
+                       }
+               }
+next:
+               raw_spin_unlock(&iter->rt_runtime_lock);
+       }
+       raw_spin_unlock(&rt_b->rt_runtime_lock);
+
+       return more;
+}
+
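The borrowing rule in do_balance_runtime() is simple arithmetic: every neighbour with spare budget (rt_runtime - rt_time > 0) donates 1/weight of its surplus, and the borrower never holds more than a full period. A self-contained model of one balancing pass, using made-up example numbers:

/* runtime_borrow_demo.c - model of the 1/n runtime-borrowing arithmetic */
#include <stdio.h>

int main(void)
{
        long long rt_period  = 1000;            /* cap: at most a full period */
        long long my_runtime = 400;             /* borrower's current budget  */
        long long donors_runtime[] = { 950, 950, 950 };
        long long donors_time[]    = { 300, 900, 950 };
        int weight = 4;                         /* CPUs spanned by the domain */
        int i;

        for (i = 0; i < 3; i++) {
                long long diff = donors_runtime[i] - donors_time[i];

                if (diff <= 0)                  /* donor has no spare time */
                        continue;

                diff /= weight;                 /* take 1/n of the surplus */
                if (my_runtime + diff > rt_period)
                        diff = rt_period - my_runtime;

                donors_runtime[i] -= diff;
                my_runtime        += diff;
                printf("took %lld from cpu%d, budget now %lld\n",
                       diff, i, my_runtime);

                if (my_runtime == rt_period)
                        break;                  /* can't hold more than a period */
        }
        return 0;
}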
+/*
+ * Ensure this RQ takes back all the runtime it lent to its neighbours.
+ */
+static void __disable_runtime(struct rq *rq)
+{
+       struct root_domain *rd = rq->rd;
+       rt_rq_iter_t iter;
+       struct rt_rq *rt_rq;
+
+       if (unlikely(!scheduler_running))
+               return;
+
+       for_each_rt_rq(rt_rq, iter, rq) {
+               struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
+               s64 want;
+               int i;
+
+               raw_spin_lock(&rt_b->rt_runtime_lock);
+               raw_spin_lock(&rt_rq->rt_runtime_lock);
+               /*
+                * Either we're all inf and nobody needs to borrow, or we're
+                * already disabled and thus have nothing to do, or we have
+                * exactly the right amount of runtime to take out.
+                */
+               if (rt_rq->rt_runtime == RUNTIME_INF ||
+                               rt_rq->rt_runtime == rt_b->rt_runtime)
+                       goto balanced;
+               raw_spin_unlock(&rt_rq->rt_runtime_lock);
+
+               /*
+                * Calculate the difference between what we started out with
+                * and what we currently have; that's the amount of runtime
+                * we lent out and now have to reclaim.
+                */
+               want = rt_b->rt_runtime - rt_rq->rt_runtime;
+
+               /*
+                * Greedy reclaim, take back as much as we can.
+                */
+               for_each_cpu(i, rd->span) {
+                       struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
+                       s64 diff;
+
+                       /*
+                        * Can't reclaim from ourselves or disabled runqueues.
+                        */
+                       if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
+                               continue;
+
+                       raw_spin_lock(&iter->rt_runtime_lock);
+                       if (want > 0) {
+                               diff = min_t(s64, iter->rt_runtime, want);
+                               iter->rt_runtime -= diff;
+                               want -= diff;
+                       } else {
+                               iter->rt_runtime -= want;
+                               want -= want;
+                       }
+                       raw_spin_unlock(&iter->rt_runtime_lock);
+
+                       if (!want)
+                               break;
+               }
+
+               raw_spin_lock(&rt_rq->rt_runtime_lock);
+               /*
+                * We cannot be left wanting - that would mean some runtime
+                * leaked out of the system.
+                */
+               BUG_ON(want);
+balanced:
+               /*
+                * Disable all the borrow logic by pretending we have inf
+                * runtime - in which case borrowing doesn't make sense.
+                */
+               rt_rq->rt_runtime = RUNTIME_INF;
+               raw_spin_unlock(&rt_rq->rt_runtime_lock);
+               raw_spin_unlock(&rt_b->rt_runtime_lock);
+       }
+}
+
+static void disable_runtime(struct rq *rq)
+{
+       unsigned long flags;
+
+       raw_spin_lock_irqsave(&rq->lock, flags);
+       __disable_runtime(rq);
+       raw_spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+static void __enable_runtime(struct rq *rq)
+{
+       rt_rq_iter_t iter;
+       struct rt_rq *rt_rq;
+
+       if (unlikely(!scheduler_running))
+               return;
+
+       /*
+        * Reset each runqueue's bandwidth settings
+        */
+       for_each_rt_rq(rt_rq, iter, rq) {
+               struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
+
+               raw_spin_lock(&rt_b->rt_runtime_lock);
+               raw_spin_lock(&rt_rq->rt_runtime_lock);
+               rt_rq->rt_runtime = rt_b->rt_runtime;
+               rt_rq->rt_time = 0;
+               rt_rq->rt_throttled = 0;
+               raw_spin_unlock(&rt_rq->rt_runtime_lock);
+               raw_spin_unlock(&rt_b->rt_runtime_lock);
+       }
+}
+
+static void enable_runtime(struct rq *rq)
+{
+       unsigned long flags;
+
+       raw_spin_lock_irqsave(&rq->lock, flags);
+       __enable_runtime(rq);
+       raw_spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+       int cpu = (int)(long)hcpu;
+
+       switch (action) {
+       case CPU_DOWN_PREPARE:
+       case CPU_DOWN_PREPARE_FROZEN:
+               disable_runtime(cpu_rq(cpu));
+               return NOTIFY_OK;
+
+       case CPU_DOWN_FAILED:
+       case CPU_DOWN_FAILED_FROZEN:
+       case CPU_ONLINE:
+       case CPU_ONLINE_FROZEN:
+               enable_runtime(cpu_rq(cpu));
+               return NOTIFY_OK;
+
+       default:
+               return NOTIFY_DONE;
+       }
+}
+
+static int balance_runtime(struct rt_rq *rt_rq)
+{
+       int more = 0;
+
+       if (!sched_feat(RT_RUNTIME_SHARE))
+               return more;
+
+       if (rt_rq->rt_time > rt_rq->rt_runtime) {
+               raw_spin_unlock(&rt_rq->rt_runtime_lock);
+               more = do_balance_runtime(rt_rq);
+               raw_spin_lock(&rt_rq->rt_runtime_lock);
+       }
+
+       return more;
+}
+#else /* !CONFIG_SMP */
+static inline int balance_runtime(struct rt_rq *rt_rq)
+{
+       return 0;
+}
+#endif /* CONFIG_SMP */
+
+static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
+{
+       int i, idle = 1;
+       const struct cpumask *span;
+
+       if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
+               return 1;
+
+       span = sched_rt_period_mask();
+       for_each_cpu(i, span) {
+               int enqueue = 0;
+               struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
+               struct rq *rq = rq_of_rt_rq(rt_rq);
+
+               raw_spin_lock(&rq->lock);
+               if (rt_rq->rt_time) {
+                       u64 runtime;
+
+                       raw_spin_lock(&rt_rq->rt_runtime_lock);
+                       if (rt_rq->rt_throttled)
+                               balance_runtime(rt_rq);
+                       runtime = rt_rq->rt_runtime;
+                       rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
+                       if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
+                               rt_rq->rt_throttled = 0;
+                               enqueue = 1;
+
+                               /*
+                                * Force a clock update if the CPU was idle,
+                                * lest wakeup -> unthrottle time accumulate.
+                                */
+                               if (rt_rq->rt_nr_running && rq->curr == rq->idle)
+                                       rq->skip_clock_update = -1;
+                       }
+                       if (rt_rq->rt_time || rt_rq->rt_nr_running)
+                               idle = 0;
+                       raw_spin_unlock(&rt_rq->rt_runtime_lock);
+               } else if (rt_rq->rt_nr_running) {
+                       idle = 0;
+                       if (!rt_rq_throttled(rt_rq))
+                               enqueue = 1;
+               }
+
+               if (enqueue)
+                       sched_rt_rq_enqueue(rt_rq);
+               raw_spin_unlock(&rq->lock);
+       }
+
+       return idle;
+}
+
+static inline int rt_se_prio(struct sched_rt_entity *rt_se)
+{
+#ifdef CONFIG_RT_GROUP_SCHED
+       struct rt_rq *rt_rq = group_rt_rq(rt_se);
+
+       if (rt_rq)
+               return rt_rq->highest_prio.curr;
+#endif
+
+       return rt_task_of(rt_se)->prio;
+}
+
+static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
+{
+       u64 runtime = sched_rt_runtime(rt_rq);
+
+       if (rt_rq->rt_throttled)
+               return rt_rq_throttled(rt_rq);
+
+       if (runtime >= sched_rt_period(rt_rq))
+               return 0;
+
+       balance_runtime(rt_rq);
+       runtime = sched_rt_runtime(rt_rq);
+       if (runtime == RUNTIME_INF)
+               return 0;
+
+       if (rt_rq->rt_time > runtime) {
+               rt_rq->rt_throttled = 1;
+               printk_once(KERN_WARNING "sched: RT throttling activated\n");
+               if (rt_rq_throttled(rt_rq)) {
+                       sched_rt_rq_dequeue(rt_rq);
+                       return 1;
+               }
+       }
+
+       return 0;
+}
+
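sched_rt_runtime_exceeded() throttles an rt_rq once its accumulated rt_time passes the per-period budget, and do_sched_rt_period_timer() later subtracts the budget again to lift the throttle. With the usual defaults (sched_rt_period_us = 1000000, sched_rt_runtime_us = 950000) a CPU-bound RT task is held off for roughly the last 50ms of each second. A tiny standalone model of the two checks, with example microsecond values:

/* rt_throttle_demo.c - model of the per-period RT throttling arithmetic */
#include <stdio.h>

int main(void)
{
        const long long period_us  = 1000000;  /* sched_rt_period_us default  */
        const long long runtime_us =  950000;  /* sched_rt_runtime_us default */
        long long rt_time_us       =  960000;  /* runtime consumed so far     */
        int throttled = 0;

        /* sched_rt_runtime_exceeded(): budget smaller than the period and used up? */
        if (runtime_us < period_us && rt_time_us > runtime_us)
                throttled = 1;
        printf("throttled=%d\n", throttled);

        /* do_sched_rt_period_timer(): each period, pay back one budget's worth */
        rt_time_us -= (rt_time_us < runtime_us) ? rt_time_us : runtime_us;
        if (rt_time_us < runtime_us)
                throttled = 0;
        printf("after refresh: rt_time=%lldus throttled=%d\n",
               rt_time_us, throttled);
        return 0;
}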
+/*
+ * Update the current task's runtime statistics. Skip current tasks that
+ * are not in our scheduling class.
+ */
+static void update_curr_rt(struct rq *rq)
+{
+       struct task_struct *curr = rq->curr;
+       struct sched_rt_entity *rt_se = &curr->rt;
+       struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+       u64 delta_exec;
+
+       if (curr->sched_class != &rt_sched_class)
+               return;
+
+       delta_exec = rq->clock_task - curr->se.exec_start;
+       if (unlikely((s64)delta_exec < 0))
+               delta_exec = 0;
+
+       schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec));
+
+       curr->se.sum_exec_runtime += delta_exec;
+       account_group_exec_runtime(curr, delta_exec);
+
+       curr->se.exec_start = rq->clock_task;
+       cpuacct_charge(curr, delta_exec);
+
+       sched_rt_avg_update(rq, delta_exec);
+
+       if (!rt_bandwidth_enabled())
+               return;
+
+       for_each_sched_rt_entity(rt_se) {
+               rt_rq = rt_rq_of_se(rt_se);
+
+               if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
+                       raw_spin_lock(&rt_rq->rt_runtime_lock);
+                       rt_rq->rt_time += delta_exec;
+                       if (sched_rt_runtime_exceeded(rt_rq))
+                               resched_task(curr);
+                       raw_spin_unlock(&rt_rq->rt_runtime_lock);
+               }
+       }
+}
+
+#if defined CONFIG_SMP
+
+static void
+inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
+{
+       struct rq *rq = rq_of_rt_rq(rt_rq);
+
+       if (rq->online && prio < prev_prio)
+               cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
+}
+
+static void
+dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
+{
+       struct rq *rq = rq_of_rt_rq(rt_rq);
+
+       if (rq->online && rt_rq->highest_prio.curr != prev_prio)
+               cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
+}
+
+#else /* CONFIG_SMP */
+
+static inline
+void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
+static inline
+void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
+
+#endif /* CONFIG_SMP */
+
+#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
+static void
+inc_rt_prio(struct rt_rq *rt_rq, int prio)
+{
+       int prev_prio = rt_rq->highest_prio.curr;
+
+       if (prio < prev_prio)
+               rt_rq->highest_prio.curr = prio;
+
+       inc_rt_prio_smp(rt_rq, prio, prev_prio);
+}
+
+static void
+dec_rt_prio(struct rt_rq *rt_rq, int prio)
+{
+       int prev_prio = rt_rq->highest_prio.curr;
+
+       if (rt_rq->rt_nr_running) {
+
+               WARN_ON(prio < prev_prio);
+
+               /*
+                * This may have been our highest task, and therefore
+                * we may have some recomputation to do
+                */
+               if (prio == prev_prio) {
+                       struct rt_prio_array *array = &rt_rq->active;
+
+                       rt_rq->highest_prio.curr =
+                               sched_find_first_bit(array->bitmap);
+               }
+
+       } else
+               rt_rq->highest_prio.curr = MAX_RT_PRIO;
+
+       dec_rt_prio_smp(rt_rq, prio, prev_prio);
+}
+
+#else
+
+static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
+static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
+
+#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
+
+#ifdef CONFIG_RT_GROUP_SCHED
+
+static void
+inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+       if (rt_se_boosted(rt_se))
+               rt_rq->rt_nr_boosted++;
+
+       if (rt_rq->tg)
+               start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
+}
+
+static void
+dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+       if (rt_se_boosted(rt_se))
+               rt_rq->rt_nr_boosted--;
+
+       WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
+}
+
+#else /* CONFIG_RT_GROUP_SCHED */
+
+static void
+inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+       start_rt_bandwidth(&def_rt_bandwidth);
+}
+
+static inline
+void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
+
+#endif /* CONFIG_RT_GROUP_SCHED */
+
+static inline
+void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+       int prio = rt_se_prio(rt_se);
+
+       WARN_ON(!rt_prio(prio));
+       rt_rq->rt_nr_running++;
+
+       inc_rt_prio(rt_rq, prio);
+       inc_rt_migration(rt_se, rt_rq);
+       inc_rt_group(rt_se, rt_rq);
+}
+
+static inline
+void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+       WARN_ON(!rt_prio(rt_se_prio(rt_se)));
+       WARN_ON(!rt_rq->rt_nr_running);
+       rt_rq->rt_nr_running--;
+
+       dec_rt_prio(rt_rq, rt_se_prio(rt_se));
+       dec_rt_migration(rt_se, rt_rq);
+       dec_rt_group(rt_se, rt_rq);
+}
+
+static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
+{
+       struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+       struct rt_prio_array *array = &rt_rq->active;
+       struct rt_rq *group_rq = group_rt_rq(rt_se);
+       struct list_head *queue = array->queue + rt_se_prio(rt_se);
+
+       /*
+        * Don't enqueue the group if it's throttled, or when it is empty.
+        * The latter is a consequence of the former when a child group
+        * gets throttled and the current group doesn't have any other
+        * active members.
+        */
+       if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
+               return;
+
+       if (!rt_rq->rt_nr_running)
+               list_add_leaf_rt_rq(rt_rq);
+
+       if (head)
+               list_add(&rt_se->run_list, queue);
+       else
+               list_add_tail(&rt_se->run_list, queue);
+       __set_bit(rt_se_prio(rt_se), array->bitmap);
+
+       inc_rt_tasks(rt_se, rt_rq);
+}
+
+static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
+{
+       struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+       struct rt_prio_array *array = &rt_rq->active;
+
+       list_del_init(&rt_se->run_list);
+       if (list_empty(array->queue + rt_se_prio(rt_se)))
+               __clear_bit(rt_se_prio(rt_se), array->bitmap);
+
+       dec_rt_tasks(rt_se, rt_rq);
+       if (!rt_rq->rt_nr_running)
+               list_del_leaf_rt_rq(rt_rq);
+}
+
+/*
+ * Because the prio of an upper entry depends on the lower
+ * entries, we must remove entries top-down.
+ */
+static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
+{
+       struct sched_rt_entity *back = NULL;
+
+       for_each_sched_rt_entity(rt_se) {
+               rt_se->back = back;
+               back = rt_se;
+       }
+
+       for (rt_se = back; rt_se; rt_se = rt_se->back) {
+               if (on_rt_rq(rt_se))
+                       __dequeue_rt_entity(rt_se);
+       }
+}
+
+static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
+{
+       dequeue_rt_stack(rt_se);
+       for_each_sched_rt_entity(rt_se)
+               __enqueue_rt_entity(rt_se, head);
+}
+
+static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
+{
+       dequeue_rt_stack(rt_se);
+
+       for_each_sched_rt_entity(rt_se) {
+               struct rt_rq *rt_rq = group_rt_rq(rt_se);
+
+               if (rt_rq && rt_rq->rt_nr_running)
+                       __enqueue_rt_entity(rt_se, false);
+       }
+}
+
+/*
+ * Adding/removing a task to/from a priority array:
+ */
+static void
+enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
+{
+       struct sched_rt_entity *rt_se = &p->rt;
+
+       if (flags & ENQUEUE_WAKEUP)
+               rt_se->timeout = 0;
+
+       enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
+
+       if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
+               enqueue_pushable_task(rq, p);
+
+       inc_nr_running(rq);
+}
+
+static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
+{
+       struct sched_rt_entity *rt_se = &p->rt;
+
+       update_curr_rt(rq);
+       dequeue_rt_entity(rt_se);
+
+       dequeue_pushable_task(rq, p);
+
+       dec_nr_running(rq);
+}
+
+/*
+ * Put the task at the head or the end of the run list without the overhead of
+ * dequeue followed by enqueue.
+ */
+static void
+requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
+{
+       if (on_rt_rq(rt_se)) {
+               struct rt_prio_array *array = &rt_rq->active;
+               struct list_head *queue = array->queue + rt_se_prio(rt_se);
+
+               if (head)
+                       list_move(&rt_se->run_list, queue);
+               else
+                       list_move_tail(&rt_se->run_list, queue);
+       }
+}
+
+static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
+{
+       struct sched_rt_entity *rt_se = &p->rt;
+       struct rt_rq *rt_rq;
+
+       for_each_sched_rt_entity(rt_se) {
+               rt_rq = rt_rq_of_se(rt_se);
+               requeue_rt_entity(rt_rq, rt_se, head);
+       }
+}
+
+static void yield_task_rt(struct rq *rq)
+{
+       requeue_task_rt(rq, rq->curr, 0);
+}
+
+#ifdef CONFIG_SMP
+static int find_lowest_rq(struct task_struct *task);
+
+static int
+select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
+{
+       struct task_struct *curr;
+       struct rq *rq;
+       int cpu;
+
+       cpu = task_cpu(p);
+
+       if (p->rt.nr_cpus_allowed == 1)
+               goto out;
+
+       /* For anything but wake ups, just return the task_cpu */
+       if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
+               goto out;
+
+       rq = cpu_rq(cpu);
+
+       rcu_read_lock();
+       curr = ACCESS_ONCE(rq->curr); /* unlocked access */
+
+       /*
+        * If the current task on @p's runqueue is an RT task, then
+        * try to see if we can wake this RT task up on another
+        * runqueue. Otherwise simply start this RT task
+        * on its current runqueue.
+        *
+        * We want to avoid overloading runqueues. If the woken
+        * task is a higher priority, then it will stay on this CPU
+        * and the lower prio task should be moved to another CPU.
+        * Even though this will probably make the lower prio task
+        * lose its cache, we do not want to bounce a higher task
+        * around just because it gave up its CPU, perhaps for a
+        * lock?
+        *
+        * For equal prio tasks, we just let the scheduler sort it out.
+        *
+        * Otherwise, just let it ride on the affined RQ and the
+        * post-schedule router will push the preempted task away
+        *
+        * This test is optimistic, if we get it wrong the load-balancer
+        * will have to sort it out.
+        */
+       if (curr && unlikely(rt_task(curr)) &&
+           (curr->rt.nr_cpus_allowed < 2 ||
+            curr->prio <= p->prio) &&
+           (p->rt.nr_cpus_allowed > 1)) {
+               int target = find_lowest_rq(p);
+
+               if (target != -1)
+                       cpu = target;
+       }
+       rcu_read_unlock();
+
+out:
+       return cpu;
+}
+
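The long comment in select_task_rq_rt() boils down to one predicate: redirect a freshly woken RT task to another CPU only when the wakee can migrate and the destination's current task is an RT task that is pinned or of equal/higher priority. A compact model of that decision (lower prio value means higher priority, as in the kernel; the demo_task type is invented):

/* rt_wakeup_placement_demo.c - model of the wakeup-placement predicate */
#include <stdio.h>
#include <stdbool.h>

struct demo_task {
        int  prio;              /* lower number == higher priority */
        int  nr_cpus_allowed;
        bool is_rt;
};

/* Should the woken task 'p' look for a lower-priority CPU instead of
 * waking up on the CPU whose current task is 'curr'? */
static bool want_other_cpu(const struct demo_task *curr,
                           const struct demo_task *p)
{
        return curr->is_rt &&
               (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio) &&
               p->nr_cpus_allowed > 1;
}

int main(void)
{
        struct demo_task curr = { .prio = 10, .nr_cpus_allowed = 4, .is_rt = true };
        struct demo_task p    = { .prio = 20, .nr_cpus_allowed = 4, .is_rt = true };

        /* curr outranks the wakee (10 < 20), so the lower-prio wakee moves */
        printf("look elsewhere: %d\n", want_other_cpu(&curr, &p));

        p.prio = 5;             /* now the wakee outranks curr and stays put */
        printf("look elsewhere: %d\n", want_other_cpu(&curr, &p));
        return 0;
}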
+static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
+{
+       if (rq->curr->rt.nr_cpus_allowed == 1)
+               return;
+
+       if (p->rt.nr_cpus_allowed != 1
+           && cpupri_find(&rq->rd->cpupri, p, NULL))
+               return;
+
+       if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
+               return;
+
+       /*
+        * There appear to be other cpus that can accept
+        * current and none to run 'p', so let's reschedule
+        * to try and push current away:
+        */
+       requeue_task_rt(rq, p, 1);
+       resched_task(rq->curr);
+}
+
+#endif /* CONFIG_SMP */
+
+/*
+ * Preempt the current task with a newly woken task if needed:
+ */
+static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
+{
+       if (p->prio < rq->curr->prio) {
+               resched_task(rq->curr);
+               return;
+       }
+
+#ifdef CONFIG_SMP
+       /*
+        * If:
+        *
+        * - the newly woken task is of equal priority to the current task
+        * - the newly woken task is non-migratable while current is migratable
+        * - current will be preempted on the next reschedule
+        *
+        * we should check to see if current can readily move to a different
+        * cpu.  If so, we will reschedule to allow the push logic to try
+        * to move current somewhere else, making room for our non-migratable
+        * task.
+        */
+       if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
+               check_preempt_equal_prio(rq, p);
+#endif
+}
+
+static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
+                                                  struct rt_rq *rt_rq)
+{
+       struct rt_prio_array *array = &rt_rq->active;
+       struct sched_rt_entity *next = NULL;
+       struct list_head *queue;
+       int idx;
+
+       idx = sched_find_first_bit(array->bitmap);
+       BUG_ON(idx >= MAX_RT_PRIO);
+
+       queue = array->queue + idx;
+       next = list_entry(queue->next, struct sched_rt_entity, run_list);
+
+       return next;
+}
+
+static struct task_struct *_pick_next_task_rt(struct rq *rq)
+{
+       struct sched_rt_entity *rt_se;
+       struct task_struct *p;
+       struct rt_rq *rt_rq;
+
+       rt_rq = &rq->rt;
+
+       if (!rt_rq->rt_nr_running)
+               return NULL;
+
+       if (rt_rq_throttled(rt_rq))
+               return NULL;
+
+       do {
+               rt_se = pick_next_rt_entity(rq, rt_rq);
+               BUG_ON(!rt_se);
+               rt_rq = group_rt_rq(rt_se);
+       } while (rt_rq);
+
+       p = rt_task_of(rt_se);
+       p->se.exec_start = rq->clock_task;
+
+       return p;
+}
+
+static struct task_struct *pick_next_task_rt(struct rq *rq)
+{
+       struct task_struct *p = _pick_next_task_rt(rq);
+
+       /* The running task is never eligible for pushing */
+       if (p)
+               dequeue_pushable_task(rq, p);
+
+#ifdef CONFIG_SMP
+       /*
+        * We detect this state here so that we can avoid taking the RQ
+        * lock again later if there is no need to push
+        */
+       rq->post_schedule = has_pushable_tasks(rq);
+#endif
+
+       return p;
+}
+
+static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
+{
+       update_curr_rt(rq);
+
+       /*
+        * The previous task needs to be made eligible for pushing
+        * if it is still active
+        */
+       if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1)
+               enqueue_pushable_task(rq, p);
+}
+
+#ifdef CONFIG_SMP
+
+/* Only try algorithms three times */
+#define RT_MAX_TRIES 3
+
+static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
+{
+       if (!task_running(rq, p) &&
+           (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) &&
+           (p->rt.nr_cpus_allowed > 1))
+               return 1;
+       return 0;
+}
+
+/* Return the second highest RT task, NULL otherwise */
+static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
+{
+       struct task_struct *next = NULL;
+       struct sched_rt_entity *rt_se;
+       struct rt_prio_array *array;
+       struct rt_rq *rt_rq;
+       int idx;
+
+       for_each_leaf_rt_rq(rt_rq, rq) {
+               array = &rt_rq->active;
+               idx = sched_find_first_bit(array->bitmap);
+next_idx:
+               if (idx >= MAX_RT_PRIO)
+                       continue;
+               if (next && next->prio < idx)
+                       continue;
+               list_for_each_entry(rt_se, array->queue + idx, run_list) {
+                       struct task_struct *p;
+
+                       if (!rt_entity_is_task(rt_se))
+                               continue;
+
+                       p = rt_task_of(rt_se);
+                       if (pick_rt_task(rq, p, cpu)) {
+                               next = p;
+                               break;
+                       }
+               }
+               if (!next) {
+                       idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
+                       goto next_idx;
+               }
+       }
+
+       return next;
+}
+
+static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
+
+static int find_lowest_rq(struct task_struct *task)
+{
+       struct sched_domain *sd;
+       struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
+       int this_cpu = smp_processor_id();
+       int cpu      = task_cpu(task);
+
+       /* Make sure the mask is initialized first */
+       if (unlikely(!lowest_mask))
+               return -1;
+
+       if (task->rt.nr_cpus_allowed == 1)
+               return -1; /* No other targets possible */
+
+       if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
+               return -1; /* No targets found */
+
+       /*
+        * At this point we have built a mask of cpus representing the
+        * lowest priority tasks in the system.  Now we want to elect
+        * the best one based on our affinity and topology.
+        *
+        * We prioritize the last cpu that the task executed on since
+        * it is most likely cache-hot in that location.
+        */
+       if (cpumask_test_cpu(cpu, lowest_mask))
+               return cpu;
+
+       /*
+        * Otherwise, we consult the sched_domains span maps to figure
+        * out which cpu is logically closest to our hot cache data.
+        */
+       if (!cpumask_test_cpu(this_cpu, lowest_mask))
+               this_cpu = -1; /* Skip this_cpu opt if not among lowest */
+
+       rcu_read_lock();
+       for_each_domain(cpu, sd) {
+               if (sd->flags & SD_WAKE_AFFINE) {
+                       int best_cpu;
+
+                       /*
+                        * "this_cpu" is cheaper to preempt than a
+                        * remote processor.
+                        */
+                       if (this_cpu != -1 &&
+                           cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
+                               rcu_read_unlock();
+                               return this_cpu;
+                       }
+
+                       best_cpu = cpumask_first_and(lowest_mask,
+                                                    sched_domain_span(sd));
+                       if (best_cpu < nr_cpu_ids) {
+                               rcu_read_unlock();
+                               return best_cpu;
+                       }
+               }
+       }
+       rcu_read_unlock();
+
+       /*
+        * And finally, if there were no matches within the domains
+        * just give the caller *something* to work with from the compatible
+        * locations.
+        */
+       if (this_cpu != -1)
+               return this_cpu;
+
+       cpu = cpumask_any(lowest_mask);
+       if (cpu < nr_cpu_ids)
+               return cpu;
+       return -1;
+}
+
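find_lowest_rq()'s election order is: keep the task's previous CPU if it is already among the lowest-priority candidates, otherwise prefer this_cpu or the first candidate inside progressively larger scheduling domains, and only then settle for any candidate at all. A rough standalone sketch of that preference order (plain int arrays stand in for cpumasks; the single "domain" and all numbers are made-up examples):

/* lowest_cpu_election_demo.c - model of find_lowest_rq()'s preference order */
#include <stdio.h>

#define NCPU 8

static int test_cpu(const int *mask, int cpu) { return mask[cpu]; }

static int first_and(const int *a, const int *b)
{
        int i;

        for (i = 0; i < NCPU; i++)
                if (a[i] && b[i])
                        return i;
        return -1;
}

int main(void)
{
        /* CPUs currently running the lowest-priority work (from cpupri) */
        int lowest[NCPU] = { 0, 0, 1, 0, 0, 1, 1, 0 };
        /* one "cache domain" around the task's previous CPU, as an example */
        int domain[NCPU] = { 1, 1, 1, 1, 0, 0, 0, 0 };
        int task_cpu = 1, this_cpu = 4, pick;

        if (test_cpu(lowest, task_cpu)) {
                pick = task_cpu;                  /* 1) stay where it's cache-hot */
        } else if (test_cpu(domain, this_cpu) && test_cpu(lowest, this_cpu)) {
                pick = this_cpu;                  /* 2) the waking CPU, if in range */
        } else if (first_and(lowest, domain) != -1) {
                pick = first_and(lowest, domain); /* 3) closest in-domain candidate */
        } else if (test_cpu(lowest, this_cpu)) {
                pick = this_cpu;                  /* 4) fall back to this_cpu */
        } else {
                pick = first_and(lowest, lowest); /* 5) any candidate at all */
        }

        printf("picked cpu%d\n", pick);           /* cpu2: in-domain candidate */
        return 0;
}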
+/* Will lock the rq it finds */
+static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
+{
+       struct rq *lowest_rq = NULL;
+       int tries;
+       int cpu;
+
+       for (tries = 0; tries < RT_MAX_TRIES; tries++) {
+               cpu = find_lowest_rq(task);
+
+               if ((cpu == -1) || (cpu == rq->cpu))
+                       break;
+
+               lowest_rq = cpu_rq(cpu);
+
+               /* if the prio of this runqueue changed, try again */
+               if (double_lock_balance(rq, lowest_rq)) {
+                       /*
+                        * We had to unlock the run queue. In the
+                        * meantime, the task could have migrated
+                        * already or had its affinity changed.
+                        * Also make sure that it wasn't scheduled on its rq.
+                        */
+                       if (unlikely(task_rq(task) != rq ||
+                                    !cpumask_test_cpu(lowest_rq->cpu,
+                                                      tsk_cpus_allowed(task)) ||
+                                    task_running(rq, task) ||
+                                    !task->on_rq)) {
+
+                               raw_spin_unlock(&lowest_rq->lock);
+                               lowest_rq = NULL;
+                               break;
+                       }
+               }
+
+               /* If this rq is still suitable use it. */
+               if (lowest_rq->rt.highest_prio.curr > task->prio)
+                       break;
+
+               /* try again */
+               double_unlock_balance(rq, lowest_rq);
+               lowest_rq = NULL;
+       }
+
+       return lowest_rq;
+}
+
+static struct task_struct *pick_next_pushable_task(struct rq *rq)
+{
+       struct task_struct *p;
+
+       if (!has_pushable_tasks(rq))
+               return NULL;
+
+       p = plist_first_entry(&rq->rt.pushable_tasks,
+                             struct task_struct, pushable_tasks);
+
+       BUG_ON(rq->cpu != task_cpu(p));
+       BUG_ON(task_current(rq, p));
+       BUG_ON(p->rt.nr_cpus_allowed <= 1);
+
+       BUG_ON(!p->on_rq);
+       BUG_ON(!rt_task(p));
+
+       return p;
+}
+
+/*
+ * If the current CPU has more than one RT task, see if the non-running
+ * task can migrate over to a CPU that is running a task
+ * of lesser priority.
+ */
+static int push_rt_task(struct rq *rq)
+{
+       struct task_struct *next_task;
+       struct rq *lowest_rq;
+       int ret = 0;
+
+       if (!rq->rt.overloaded)
+               return 0;
+
+       next_task = pick_next_pushable_task(rq);
+       if (!next_task)
+               return 0;
+
+retry:
+       if (unlikely(next_task == rq->curr)) {
+               WARN_ON(1);
+               return 0;
+       }
+
+       /*
+        * It's possible that the next_task slipped in at a
+        * higher priority than current. If that's the case,
+        * just reschedule current.
+        */
+       if (unlikely(next_task->prio < rq->curr->prio)) {
+               resched_task(rq->curr);
+               return 0;
+       }
+
+       /* We might release rq lock */
+       get_task_struct(next_task);
+
+       /* find_lock_lowest_rq locks the rq if found */
+       lowest_rq = find_lock_lowest_rq(next_task, rq);
+       if (!lowest_rq) {
+               struct task_struct *task;
+               /*
+                * find_lock_lowest_rq releases rq->lock
+                * so it is possible that next_task has migrated.
+                *
+                * We need to make sure that the task is still on the same
+                * run-queue and is also still the next task eligible for
+                * pushing.
+                */
+               task = pick_next_pushable_task(rq);
+               if (task_cpu(next_task) == rq->cpu && task == next_task) {
+                       /*
+                        * The task hasn't migrated, and is still the next
+                        * eligible task, but we failed to find a run-queue
+                        * to push it to.  Do not retry in this case, since
+                        * other cpus will pull from us when ready.
+                        */
+                       goto out;
+               }
+
+               if (!task)
+                       /* No more tasks, just exit */
+                       goto out;
+
+               /*
+                * Something has shifted, try again.
+                */
+               put_task_struct(next_task);
+               next_task = task;
+               goto retry;
+       }
+
+       deactivate_task(rq, next_task, 0);
+       set_task_cpu(next_task, lowest_rq->cpu);
+       activate_task(lowest_rq, next_task, 0);
+       ret = 1;
+
+       resched_task(lowest_rq->curr);
+
+       double_unlock_balance(rq, lowest_rq);
+
+out:
+       put_task_struct(next_task);
+
+       return ret;
+}
+
+static void push_rt_tasks(struct rq *rq)
+{
+       /* push_rt_task will return true if it moved an RT task */
+       while (push_rt_task(rq))
+               ;
+}
+
+static int pull_rt_task(struct rq *this_rq)
+{
+       int this_cpu = this_rq->cpu, ret = 0, cpu;
+       struct task_struct *p;
+       struct rq *src_rq;
+
+       if (likely(!rt_overloaded(this_rq)))
+               return 0;
+
+       for_each_cpu(cpu, this_rq->rd->rto_mask) {
+               if (this_cpu == cpu)
+                       continue;
+
+               src_rq = cpu_rq(cpu);
+
+               /*
+                * Don't bother taking the src_rq->lock if the next highest
+                * task is known to be lower-priority than our current task.
+                * This may look racy, but if this value is about to go
+                * logically higher, the src_rq will push this task away.
+                * And if it's going logically lower, we do not care.
+                */
+               if (src_rq->rt.highest_prio.next >=
+                   this_rq->rt.highest_prio.curr)
+                       continue;
+
+               /*
+                * We can potentially drop this_rq's lock in
+                * double_lock_balance, and another CPU could
+                * alter this_rq
+                */
+               double_lock_balance(this_rq, src_rq);
+
+               /*
+                * Are there still pullable RT tasks?
+                */
+               if (src_rq->rt.rt_nr_running <= 1)
+                       goto skip;
+
+               p = pick_next_highest_task_rt(src_rq, this_cpu);
+
+               /*
+                * Do we have an RT task that preempts
+                * the to-be-scheduled task?
+                */
+               if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
+                       WARN_ON(p == src_rq->curr);
+                       WARN_ON(!p->on_rq);
+
+                       /*
+                        * There's a chance that p is higher in priority
+                        * than what's currently running on its cpu.
+                        * This is just that p is waking up and hasn't
+                        * had a chance to schedule. We only pull
+                        * p if it is lower in priority than the
+                        * current task on the run queue
+                        */
+                       if (p->prio < src_rq->curr->prio)
+                               goto skip;
+
+                       ret = 1;
+
+                       deactivate_task(src_rq, p, 0);
+                       set_task_cpu(p, this_cpu);
+                       activate_task(this_rq, p, 0);
+                       /*
+                        * We continue with the search, just in
+                        * case there's an even higher prio task
+                        * in another runqueue. (low likelihood
+                        * but possible)
+                        */
+               }
+skip:
+               double_unlock_balance(this_rq, src_rq);
+       }
+
+       return ret;
+}
+
+static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
+{
+       /* Try to pull RT tasks here if we lower this rq's prio */
+       if (rq->rt.highest_prio.curr > prev->prio)
+               pull_rt_task(rq);
+}
+
+static void post_schedule_rt(struct rq *rq)
+{
+       push_rt_tasks(rq);
+}
+
+/*
+ * If we are not running and we are not going to reschedule soon, we should
+ * try to push tasks away now
+ */
+static void task_woken_rt(struct rq *rq, struct task_struct *p)
+{
+       if (!task_running(rq, p) &&
+           !test_tsk_need_resched(rq->curr) &&
+           has_pushable_tasks(rq) &&
+           p->rt.nr_cpus_allowed > 1 &&
+           rt_task(rq->curr) &&
+           (rq->curr->rt.nr_cpus_allowed < 2 ||
+            rq->curr->prio <= p->prio))
+               push_rt_tasks(rq);
+}
+
+static void set_cpus_allowed_rt(struct task_struct *p,
+                               const struct cpumask *new_mask)
+{
+       int weight = cpumask_weight(new_mask);
+
+       BUG_ON(!rt_task(p));
+
+       /*
+        * Update the migration status of the RQ if we have an RT task
+        * which is running AND changing its weight value.
+        */
+       if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) {
+               struct rq *rq = task_rq(p);
+
+               if (!task_current(rq, p)) {
+                       /*
+                        * Make sure we dequeue this task from the pushable list
+                        * before going further.  It will either remain off of
+                        * the list because we are no longer pushable, or it
+                        * will be requeued.
+                        */
+                       if (p->rt.nr_cpus_allowed > 1)
+                               dequeue_pushable_task(rq, p);
+
+                       /*
+                        * Requeue if our weight is changing and still > 1
+                        */
+                       if (weight > 1)
+                               enqueue_pushable_task(rq, p);
+
+               }
+
+               if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) {
+                       rq->rt.rt_nr_migratory++;
+               } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) {
+                       BUG_ON(!rq->rt.rt_nr_migratory);
+                       rq->rt.rt_nr_migratory--;
+               }
+
+               update_rt_migration(&rq->rt);
+       }
+}
+
+/* Assumes rq->lock is held */
+static void rq_online_rt(struct rq *rq)
+{
+       if (rq->rt.overloaded)
+               rt_set_overload(rq);
+
+       __enable_runtime(rq);
+
+       cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
+}
+
+/* Assumes rq->lock is held */
+static void rq_offline_rt(struct rq *rq)
+{
+       if (rq->rt.overloaded)
+               rt_clear_overload(rq);
+
+       __disable_runtime(rq);
+
+       cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
+}
+
+/*
+ * When switching away from the rt queue, we bring ourselves to a position
+ * where we might want to pull RT tasks from other runqueues.
+ */
+static void switched_from_rt(struct rq *rq, struct task_struct *p)
+{
+       /*
+        * If there are other RT tasks then we will reschedule
+        * and the scheduling of the other RT tasks will handle
+        * the balancing. But if we are the last RT task
+        * we may need to handle the pulling of RT tasks
+        * now.
+        */
+       if (p->on_rq && !rq->rt.rt_nr_running)
+               pull_rt_task(rq);
+}
+
+void init_sched_rt_class(void)
+{
+       unsigned int i;
+
+       for_each_possible_cpu(i) {
+               zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
+                                       GFP_KERNEL, cpu_to_node(i));
+       }
+}
+#endif /* CONFIG_SMP */
+
+/*
+ * When switching a task to RT, we may overload the runqueue
+ * with RT tasks. In this case we try to push them off to
+ * other runqueues.
+ */
+static void switched_to_rt(struct rq *rq, struct task_struct *p)
+{
+       int check_resched = 1;
+
+       /*
+        * If we are already running, then there's nothing
+        * that needs to be done. But if we are not running
+        * we may need to preempt the currently running task.
+        * If that currently running task is also an RT task
+        * then see if we can move to another run queue.
+        */
+       if (p->on_rq && rq->curr != p) {
+#ifdef CONFIG_SMP
+               if (rq->rt.overloaded && push_rt_task(rq) &&
+                   /* Don't resched if we changed runqueues */
+                   rq != task_rq(p))
+                       check_resched = 0;
+#endif /* CONFIG_SMP */
+               if (check_resched && p->prio < rq->curr->prio)
+                       resched_task(rq->curr);
+       }
+}
+
+/*
+ * Priority of the task has changed. This may cause
+ * us to initiate a push or pull.
+ */
+static void
+prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
+{
+       if (!p->on_rq)
+               return;
+
+       if (rq->curr == p) {
+#ifdef CONFIG_SMP
+               /*
+                * If our priority decreases while running, we
+                * may need to pull tasks to this runqueue.
+                */
+               if (oldprio < p->prio)
+                       pull_rt_task(rq);
+               /*
+                * If there's a higher priority task waiting to run
+                * then reschedule. Note, the above pull_rt_task
+                * can release the rq lock and p could migrate.
+                * Only reschedule if p is still on the same runqueue.
+                */
+               if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
+                       resched_task(p);
+#else
+               /* For UP simply resched on drop of prio */
+               if (oldprio < p->prio)
+                       resched_task(p);
+#endif /* CONFIG_SMP */
+       } else {
+               /*
+                * This task is not running, but if its priority
+                * is higher than that of the currently running
+                * task, then reschedule.
+                */
+               if (p->prio < rq->curr->prio)
+                       resched_task(rq->curr);
+       }
+}
+
+static void watchdog(struct rq *rq, struct task_struct *p)
+{
+       unsigned long soft, hard;
+
+       /* max may change after cur was read; this will be fixed on the next tick */
+       soft = task_rlimit(p, RLIMIT_RTTIME);
+       hard = task_rlimit_max(p, RLIMIT_RTTIME);
+
+       if (soft != RLIM_INFINITY) {
+               unsigned long next;
+
+               p->rt.timeout++;
+               next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
+               if (p->rt.timeout > next)
+                       p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
+       }
+}
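
/*
 * Illustrative sketch, not part of this patch: how the RLIMIT_RTTIME soft
 * limit above maps onto scheduler ticks.  The limit values and HZ == 1000
 * are made-up example inputs.
 */
#include <stdio.h>

#define EX_USEC_PER_SEC 1000000UL
#define EX_HZ           1000UL
#define EX_DIV_ROUND_UP(n, d)   (((n) + (d) - 1) / (d))

int main(void)
{
        unsigned long soft = 950000;    /* RLIMIT_RTTIME soft limit, usec */
        unsigned long hard = 1000000;   /* RLIMIT_RTTIME hard limit, usec */
        unsigned long tick_usec = EX_USEC_PER_SEC / EX_HZ;
        unsigned long next;

        /* same rounding as watchdog(): limit in usec -> limit in ticks */
        next = EX_DIV_ROUND_UP(soft < hard ? soft : hard, tick_usec);
        printf("watchdog fires after %lu consecutive RT ticks\n", next);
        return 0;
}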
+
+static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
+{
+       update_curr_rt(rq);
+
+       watchdog(rq, p);
+
+       /*
+        * RR tasks need a special form of timeslice management.
+        * FIFO tasks have no timeslices.
+        */
+       if (p->policy != SCHED_RR)
+               return;
+
+       if (--p->rt.time_slice)
+               return;
+
+       p->rt.time_slice = DEF_TIMESLICE;
+
+       /*
+        * Requeue to the end of the queue if we are not the only element
+        * on the queue:
+        */
+       if (p->rt.run_list.prev != p->rt.run_list.next) {
+               requeue_task_rt(rq, p, 0);
+               set_tsk_need_resched(p);
+       }
+}
+
+static void set_curr_task_rt(struct rq *rq)
+{
+       struct task_struct *p = rq->curr;
+
+       p->se.exec_start = rq->clock_task;
+
+       /* The running task is never eligible for pushing */
+       dequeue_pushable_task(rq, p);
+}
+
+static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
+{
+       /*
+        * Time slice is 0 for SCHED_FIFO tasks
+        */
+       if (task->policy == SCHED_RR)
+               return DEF_TIMESLICE;
+       else
+               return 0;
+}
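
/*
 * Illustrative sketch, not part of this patch: get_rr_interval_rt() above
 * is the class hook behind sched_rr_get_interval(2).  For a SCHED_RR task
 * it reports DEF_TIMESLICE (100 ms); for SCHED_FIFO it reports 0.  A
 * minimal userspace check:
 */
#include <stdio.h>
#include <time.h>
#include <sched.h>

int main(void)
{
        struct timespec ts;

        /* pid 0 means "the calling task" */
        if (sched_rr_get_interval(0, &ts) == 0)
                printf("round-robin interval: %ld.%09ld s\n",
                       (long)ts.tv_sec, ts.tv_nsec);
        return 0;
}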
+
+const struct sched_class rt_sched_class = {
+       .next                   = &fair_sched_class,
+       .enqueue_task           = enqueue_task_rt,
+       .dequeue_task           = dequeue_task_rt,
+       .yield_task             = yield_task_rt,
+
+       .check_preempt_curr     = check_preempt_curr_rt,
+
+       .pick_next_task         = pick_next_task_rt,
+       .put_prev_task          = put_prev_task_rt,
+
+#ifdef CONFIG_SMP
+       .select_task_rq         = select_task_rq_rt,
+
+       .set_cpus_allowed       = set_cpus_allowed_rt,
+       .rq_online              = rq_online_rt,
+       .rq_offline             = rq_offline_rt,
+       .pre_schedule           = pre_schedule_rt,
+       .post_schedule          = post_schedule_rt,
+       .task_woken             = task_woken_rt,
+       .switched_from          = switched_from_rt,
+#endif
+
+       .set_curr_task          = set_curr_task_rt,
+       .task_tick              = task_tick_rt,
+
+       .get_rr_interval        = get_rr_interval_rt,
+
+       .prio_changed           = prio_changed_rt,
+       .switched_to            = switched_to_rt,
+};
+
+#ifdef CONFIG_SCHED_DEBUG
+extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
+
+void print_rt_stats(struct seq_file *m, int cpu)
+{
+       rt_rq_iter_t iter;
+       struct rt_rq *rt_rq;
+
+       rcu_read_lock();
+       for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
+               print_rt_rq(m, cpu, rt_rq);
+       rcu_read_unlock();
+}
+#endif /* CONFIG_SCHED_DEBUG */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
new file mode 100644 (file)
index 0000000..98c0c26
--- /dev/null
@@ -0,0 +1,1166 @@
+
+#include <linux/sched.h>
+#include <linux/mutex.h>
+#include <linux/spinlock.h>
+#include <linux/stop_machine.h>
+
+#include "cpupri.h"
+
+extern __read_mostly int scheduler_running;
+
+/*
+ * Convert user-nice values [ -20 ... 0 ... 19 ]
+ * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
+ * and back.
+ */
+#define NICE_TO_PRIO(nice)     (MAX_RT_PRIO + (nice) + 20)
+#define PRIO_TO_NICE(prio)     ((prio) - MAX_RT_PRIO - 20)
+#define TASK_NICE(p)           PRIO_TO_NICE((p)->static_prio)
+
+/*
+ * 'User priority' is the nice value converted to something we
+ * can work with better when scaling various scheduler parameters;
+ * it's a [ 0 ... 39 ] range.
+ */
+#define USER_PRIO(p)           ((p)-MAX_RT_PRIO)
+#define TASK_USER_PRIO(p)      USER_PRIO((p)->static_prio)
+#define MAX_USER_PRIO          (USER_PRIO(MAX_PRIO))
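
/*
 * Illustrative sketch, not part of this patch: the nice <-> prio <-> user
 * prio conversions above, evaluated for a few values.  MAX_RT_PRIO is
 * assumed to be 100 (its usual value), which puts nice 0 at static prio 120.
 */
#include <stdio.h>

#define EX_MAX_RT_PRIO          100
#define EX_NICE_TO_PRIO(nice)   (EX_MAX_RT_PRIO + (nice) + 20)
#define EX_PRIO_TO_NICE(prio)   ((prio) - EX_MAX_RT_PRIO - 20)
#define EX_USER_PRIO(prio)      ((prio) - EX_MAX_RT_PRIO)

int main(void)
{
        int nice;

        for (nice = -20; nice <= 19; nice += 13) {
                int prio = EX_NICE_TO_PRIO(nice);

                /* e.g. nice -20 -> prio 100, nice 19 -> prio 139 */
                printf("nice %3d -> static prio %3d -> user prio %2d -> nice %3d\n",
                       nice, prio, EX_USER_PRIO(prio), EX_PRIO_TO_NICE(prio));
        }
        return 0;
}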
+
+/*
+ * Helpers for converting nanosecond timing to jiffy resolution
+ */
+#define NS_TO_JIFFIES(TIME)    ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
+
+#define NICE_0_LOAD            SCHED_LOAD_SCALE
+#define NICE_0_SHIFT           SCHED_LOAD_SHIFT
+
+/*
+ * These are the 'tuning knobs' of the scheduler:
+ *
+ * default timeslice is 100 msecs (used only for SCHED_RR tasks).
+ * Timeslices get refilled after they expire.
+ */
+#define DEF_TIMESLICE          (100 * HZ / 1000)
+
+/*
+ * single value that denotes runtime == period, i.e. unlimited time.
+ */
+#define RUNTIME_INF    ((u64)~0ULL)
+
+static inline int rt_policy(int policy)
+{
+       if (policy == SCHED_FIFO || policy == SCHED_RR)
+               return 1;
+       return 0;
+}
+
+static inline int task_has_rt_policy(struct task_struct *p)
+{
+       return rt_policy(p->policy);
+}
+
+/*
+ * This is the priority-queue data structure of the RT scheduling class:
+ */
+struct rt_prio_array {
+       DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
+       struct list_head queue[MAX_RT_PRIO];
+};
+
+struct rt_bandwidth {
+       /* nests inside the rq lock: */
+       raw_spinlock_t          rt_runtime_lock;
+       ktime_t                 rt_period;
+       u64                     rt_runtime;
+       struct hrtimer          rt_period_timer;
+};
+
+extern struct mutex sched_domains_mutex;
+
+#ifdef CONFIG_CGROUP_SCHED
+
+#include <linux/cgroup.h>
+
+struct cfs_rq;
+struct rt_rq;
+
+static LIST_HEAD(task_groups);
+
+struct cfs_bandwidth {
+#ifdef CONFIG_CFS_BANDWIDTH
+       raw_spinlock_t lock;
+       ktime_t period;
+       u64 quota, runtime;
+       s64 hierarchal_quota;
+       u64 runtime_expires;
+
+       int idle, timer_active;
+       struct hrtimer period_timer, slack_timer;
+       struct list_head throttled_cfs_rq;
+
+       /* statistics */
+       int nr_periods, nr_throttled;
+       u64 throttled_time;
+#endif
+};
+
+/* task group related information */
+struct task_group {
+       struct cgroup_subsys_state css;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+       /* schedulable entities of this group on each cpu */
+       struct sched_entity **se;
+       /* runqueue "owned" by this group on each cpu */
+       struct cfs_rq **cfs_rq;
+       unsigned long shares;
+
+       atomic_t load_weight;
+#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
+       struct sched_rt_entity **rt_se;
+       struct rt_rq **rt_rq;
+
+       struct rt_bandwidth rt_bandwidth;
+#endif
+
+       struct rcu_head rcu;
+       struct list_head list;
+
+       struct task_group *parent;
+       struct list_head siblings;
+       struct list_head children;
+
+#ifdef CONFIG_SCHED_AUTOGROUP
+       struct autogroup *autogroup;
+#endif
+
+       struct cfs_bandwidth cfs_bandwidth;
+};
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+#define ROOT_TASK_GROUP_LOAD   NICE_0_LOAD
+
+/*
+ * A weight of 0 or 1 can cause arithmetic problems.
+ * The weight of a cfs_rq is the sum of the weights of the entities
+ * queued on it, so the weight of an entity should not be too large,
+ * and neither should the shares value of a task group.
+ * (The default weight is 1024 - so there's no practical
+ *  limitation from this.)
+ */
+#define MIN_SHARES     (1UL <<  1)
+#define MAX_SHARES     (1UL << 18)
+#endif
+
+/* Default task group.
+ *     Every task in the system belongs to this group at bootup.
+ */
+extern struct task_group root_task_group;
+
+typedef int (*tg_visitor)(struct task_group *, void *);
+
+extern int walk_tg_tree_from(struct task_group *from,
+                            tg_visitor down, tg_visitor up, void *data);
+
+/*
+ * Iterate the full tree, calling @down when first entering a node and @up when
+ * leaving it for the final time.
+ *
+ * Caller must hold rcu_lock or sufficient equivalent.
+ */
+static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
+{
+       return walk_tg_tree_from(&root_task_group, down, up, data);
+}
+
+extern int tg_nop(struct task_group *tg, void *data);
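
/*
 * Illustrative sketch, not part of this patch: a hypothetical tg_visitor
 * pair that counts task groups.  walk_tg_tree() calls the "down" visitor
 * when entering each group and the "up" visitor when leaving it; returning
 * non-zero from either aborts the walk.  tg_nop() covers the direction we
 * don't care about, and the caller must hold rcu_read_lock().
 */
static int ex_count_tg(struct task_group *tg, void *data)
{
        (*(int *)data)++;
        return 0;
}

static int ex_nr_task_groups(void)
{
        int count = 0;

        rcu_read_lock();
        walk_tg_tree(ex_count_tg, tg_nop, &count);
        rcu_read_unlock();

        return count;
}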
+
+extern void free_fair_sched_group(struct task_group *tg);
+extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
+extern void unregister_fair_sched_group(struct task_group *tg, int cpu);
+extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
+                       struct sched_entity *se, int cpu,
+                       struct sched_entity *parent);
+extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
+extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
+
+extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
+extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
+extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
+
+extern void free_rt_sched_group(struct task_group *tg);
+extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent);
+extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
+               struct sched_rt_entity *rt_se, int cpu,
+               struct sched_rt_entity *parent);
+
+#else /* CONFIG_CGROUP_SCHED */
+
+struct cfs_bandwidth { };
+
+#endif /* CONFIG_CGROUP_SCHED */
+
+/* CFS-related fields in a runqueue */
+struct cfs_rq {
+       struct load_weight load;
+       unsigned long nr_running, h_nr_running;
+
+       u64 exec_clock;
+       u64 min_vruntime;
+#ifndef CONFIG_64BIT
+       u64 min_vruntime_copy;
+#endif
+
+       struct rb_root tasks_timeline;
+       struct rb_node *rb_leftmost;
+
+       struct list_head tasks;
+       struct list_head *balance_iterator;
+
+       /*
+        * 'curr' points to currently running entity on this cfs_rq.
+        * It is set to NULL otherwise (i.e when none are currently running).
+        */
+       struct sched_entity *curr, *next, *last, *skip;
+
+#ifdef CONFIG_SCHED_DEBUG
+       unsigned int nr_spread_over;
+#endif
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+       struct rq *rq;  /* cpu runqueue to which this cfs_rq is attached */
+
+       /*
+        * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
+        * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
+        * (like users, containers etc.)
+        *
+        * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
+        * list is used during load balance.
+        */
+       int on_list;
+       struct list_head leaf_cfs_rq_list;
+       struct task_group *tg;  /* group that "owns" this runqueue */
+
+#ifdef CONFIG_SMP
+       /*
+        * the part of load.weight contributed by tasks
+        */
+       unsigned long task_weight;
+
+       /*
+        *   h_load = weight * f(tg)
+        *
+        * Where f(tg) is the recursive weight fraction assigned to
+        * this group.
+        */
+       unsigned long h_load;
+
+       /*
+        * Maintaining per-cpu shares distribution for group scheduling
+        *
+        * load_stamp is the last time we updated the load average
+        * load_last is the last time we updated the load average and saw load
+        * load_unacc_exec_time is currently unaccounted execution time
+        */
+       u64 load_avg;
+       u64 load_period;
+       u64 load_stamp, load_last, load_unacc_exec_time;
+
+       unsigned long load_contribution;
+#endif /* CONFIG_SMP */
+#ifdef CONFIG_CFS_BANDWIDTH
+       int runtime_enabled;
+       u64 runtime_expires;
+       s64 runtime_remaining;
+
+       u64 throttled_timestamp;
+       int throttled, throttle_count;
+       struct list_head throttled_list;
+#endif /* CONFIG_CFS_BANDWIDTH */
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+};
+
+static inline int rt_bandwidth_enabled(void)
+{
+       return sysctl_sched_rt_runtime >= 0;
+}
+
+/* Real-Time classes' related field in a runqueue: */
+struct rt_rq {
+       struct rt_prio_array active;
+       unsigned long rt_nr_running;
+#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
+       struct {
+               int curr; /* highest queued rt task prio */
+#ifdef CONFIG_SMP
+               int next; /* next highest */
+#endif
+       } highest_prio;
+#endif
+#ifdef CONFIG_SMP
+       unsigned long rt_nr_migratory;
+       unsigned long rt_nr_total;
+       int overloaded;
+       struct plist_head pushable_tasks;
+#endif
+       int rt_throttled;
+       u64 rt_time;
+       u64 rt_runtime;
+       /* Nests inside the rq lock: */
+       raw_spinlock_t rt_runtime_lock;
+
+#ifdef CONFIG_RT_GROUP_SCHED
+       unsigned long rt_nr_boosted;
+
+       struct rq *rq;
+       struct list_head leaf_rt_rq_list;
+       struct task_group *tg;
+#endif
+};
+
+#ifdef CONFIG_SMP
+
+/*
+ * We add the notion of a root-domain which will be used to define per-domain
+ * variables. Each exclusive cpuset essentially defines an island domain by
+ * fully partitioning the member cpus from any other cpuset. Whenever a new
+ * exclusive cpuset is created, we also create and attach a new root-domain
+ * object.
+ *
+ */
+struct root_domain {
+       atomic_t refcount;
+       atomic_t rto_count;
+       struct rcu_head rcu;
+       cpumask_var_t span;
+       cpumask_var_t online;
+
+       /*
+        * The "RT overload" flag: it gets set if a CPU has more than
+        * one runnable RT task.
+        */
+       cpumask_var_t rto_mask;
+       struct cpupri cpupri;
+};
+
+extern struct root_domain def_root_domain;
+
+#endif /* CONFIG_SMP */
+
+/*
+ * This is the main, per-CPU runqueue data structure.
+ *
+ * Locking rule: in places that want to lock multiple runqueues
+ * (such as the load balancing or the thread migration code),
+ * lock acquire operations must be ordered by ascending &runqueue.
+ */
+struct rq {
+       /* runqueue lock: */
+       raw_spinlock_t lock;
+
+       /*
+        * nr_running and cpu_load should be in the same cacheline because
+        * remote CPUs use both these fields when doing load calculation.
+        */
+       unsigned long nr_running;
+       #define CPU_LOAD_IDX_MAX 5
+       unsigned long cpu_load[CPU_LOAD_IDX_MAX];
+       unsigned long last_load_update_tick;
+#ifdef CONFIG_NO_HZ
+       u64 nohz_stamp;
+       unsigned long nohz_flags;
+#endif
+       int skip_clock_update;
+
+       /* capture load from *all* tasks on this cpu: */
+       struct load_weight load;
+       unsigned long nr_load_updates;
+       u64 nr_switches;
+
+       struct cfs_rq cfs;
+       struct rt_rq rt;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+       /* list of leaf cfs_rq on this cpu: */
+       struct list_head leaf_cfs_rq_list;
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
+       struct list_head leaf_rt_rq_list;
+#endif
+
+       /*
+        * This is part of a global counter where only the total sum
+        * over all CPUs matters. A task can increase this counter on
+        * one CPU and if it got migrated afterwards it may decrease
+        * it on another CPU. Always updated under the runqueue lock:
+        */
+       unsigned long nr_uninterruptible;
+
+       struct task_struct *curr, *idle, *stop;
+       unsigned long next_balance;
+       struct mm_struct *prev_mm;
+
+       u64 clock;
+       u64 clock_task;
+
+       atomic_t nr_iowait;
+
+#ifdef CONFIG_SMP
+       struct root_domain *rd;
+       struct sched_domain *sd;
+
+       unsigned long cpu_power;
+
+       unsigned char idle_balance;
+       /* For active balancing */
+       int post_schedule;
+       int active_balance;
+       int push_cpu;
+       struct cpu_stop_work active_balance_work;
+       /* cpu of this runqueue: */
+       int cpu;
+       int online;
+
+       u64 rt_avg;
+       u64 age_stamp;
+       u64 idle_stamp;
+       u64 avg_idle;
+#endif
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+       u64 prev_irq_time;
+#endif
+#ifdef CONFIG_PARAVIRT
+       u64 prev_steal_time;
+#endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+       u64 prev_steal_time_rq;
+#endif
+
+       /* calc_load related fields */
+       unsigned long calc_load_update;
+       long calc_load_active;
+
+#ifdef CONFIG_SCHED_HRTICK
+#ifdef CONFIG_SMP
+       int hrtick_csd_pending;
+       struct call_single_data hrtick_csd;
+#endif
+       struct hrtimer hrtick_timer;
+#endif
+
+#ifdef CONFIG_SCHEDSTATS
+       /* latency stats */
+       struct sched_info rq_sched_info;
+       unsigned long long rq_cpu_time;
+       /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
+
+       /* sys_sched_yield() stats */
+       unsigned int yld_count;
+
+       /* schedule() stats */
+       unsigned int sched_switch;
+       unsigned int sched_count;
+       unsigned int sched_goidle;
+
+       /* try_to_wake_up() stats */
+       unsigned int ttwu_count;
+       unsigned int ttwu_local;
+#endif
+
+#ifdef CONFIG_SMP
+       struct llist_head wake_list;
+#endif
+};
+
+static inline int cpu_of(struct rq *rq)
+{
+#ifdef CONFIG_SMP
+       return rq->cpu;
+#else
+       return 0;
+#endif
+}
+
+DECLARE_PER_CPU(struct rq, runqueues);
+
+#define cpu_rq(cpu)            (&per_cpu(runqueues, (cpu)))
+#define this_rq()              (&__get_cpu_var(runqueues))
+#define task_rq(p)             cpu_rq(task_cpu(p))
+#define cpu_curr(cpu)          (cpu_rq(cpu)->curr)
+#define raw_rq()               (&__raw_get_cpu_var(runqueues))
+
+#ifdef CONFIG_SMP
+
+#define rcu_dereference_check_sched_domain(p) \
+       rcu_dereference_check((p), \
+                             lockdep_is_held(&sched_domains_mutex))
+
+/*
+ * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
+ * See detach_destroy_domains: synchronize_sched for details.
+ *
+ * The domain tree of any CPU may only be accessed from within
+ * preempt-disabled sections.
+ */
+#define for_each_domain(cpu, __sd) \
+       for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \
+                       __sd; __sd = __sd->parent)
+
+#define for_each_lower_domain(sd) for (; sd; sd = sd->child)
+
+/**
+ * highest_flag_domain - Return highest sched_domain containing flag.
+ * @cpu:       The cpu whose highest level of sched domain is to
+ *             be returned.
+ * @flag:      The flag to check for the highest sched_domain
+ *             for the given cpu.
+ *
+ * Returns the highest sched_domain of a cpu which contains the given flag.
+ */
+static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
+{
+       struct sched_domain *sd, *hsd = NULL;
+
+       for_each_domain(cpu, sd) {
+               if (!(sd->flags & flag))
+                       break;
+               hsd = sd;
+       }
+
+       return hsd;
+}
+
+DECLARE_PER_CPU(struct sched_domain *, sd_llc);
+DECLARE_PER_CPU(int, sd_llc_id);
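
/*
 * Illustrative sketch, not part of this patch: one way the sd_llc/sd_llc_id
 * per-cpu variables above can be derived with highest_flag_domain().  It
 * assumes SD_SHARE_PKG_RESOURCES marks domains whose cpus share a cache,
 * so the highest such domain is the last-level-cache domain.  Must run
 * while the domain tree is stable (e.g. under sched_domains_mutex).
 */
static void ex_update_top_cache_domain(int cpu)
{
        struct sched_domain *sd;
        int id = cpu;

        sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
        if (sd)
                id = cpumask_first(sched_domain_span(sd));

        rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
        per_cpu(sd_llc_id, cpu) = id;
}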
+
+#endif /* CONFIG_SMP */
+
+#include "stats.h"
+#include "auto_group.h"
+
+#ifdef CONFIG_CGROUP_SCHED
+
+/*
+ * Return the group to which this task belongs.
+ *
+ * We use task_subsys_state_check() and extend the RCU verification with
+ * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
+ * task it moves into the cgroup. Therefore by holding either of those locks,
+ * we pin the task to the current cgroup.
+ */
+static inline struct task_group *task_group(struct task_struct *p)
+{
+       struct task_group *tg;
+       struct cgroup_subsys_state *css;
+
+       css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
+                       lockdep_is_held(&p->pi_lock) ||
+                       lockdep_is_held(&task_rq(p)->lock));
+       tg = container_of(css, struct task_group, css);
+
+       return autogroup_task_group(p, tg);
+}
+
+/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
+static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
+{
+#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED)
+       struct task_group *tg = task_group(p);
+#endif
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+       p->se.cfs_rq = tg->cfs_rq[cpu];
+       p->se.parent = tg->se[cpu];
+#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
+       p->rt.rt_rq  = tg->rt_rq[cpu];
+       p->rt.parent = tg->rt_se[cpu];
+#endif
+}
+
+#else /* CONFIG_CGROUP_SCHED */
+
+static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
+static inline struct task_group *task_group(struct task_struct *p)
+{
+       return NULL;
+}
+
+#endif /* CONFIG_CGROUP_SCHED */
+
+static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
+{
+       set_task_rq(p, cpu);
+#ifdef CONFIG_SMP
+       /*
+        * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
+        * successfully executed on another CPU. We must ensure that updates of
+        * per-task data have been completed by this moment.
+        */
+       smp_wmb();
+       task_thread_info(p)->cpu = cpu;
+#endif
+}
+
+/*
+ * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
+ */
+#ifdef CONFIG_SCHED_DEBUG
+# include <linux/jump_label.h>
+# define const_debug __read_mostly
+#else
+# define const_debug const
+#endif
+
+extern const_debug unsigned int sysctl_sched_features;
+
+#define SCHED_FEAT(name, enabled)      \
+       __SCHED_FEAT_##name ,
+
+enum {
+#include "features.h"
+       __SCHED_FEAT_NR,
+};
+
+#undef SCHED_FEAT
+
+#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
+static __always_inline bool static_branch__true(struct jump_label_key *key)
+{
+       return likely(static_branch(key)); /* Not out of line branch. */
+}
+
+static __always_inline bool static_branch__false(struct jump_label_key *key)
+{
+       return unlikely(static_branch(key)); /* Out of line branch. */
+}
+
+#define SCHED_FEAT(name, enabled)                                      \
+static __always_inline bool static_branch_##name(struct jump_label_key *key) \
+{                                                                      \
+       return static_branch__##enabled(key);                           \
+}
+
+#include "features.h"
+
+#undef SCHED_FEAT
+
+extern struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR];
+#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))
+#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */
+#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
+#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
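
/*
 * Illustrative sketch, not part of this patch: what a features.h entry such
 * as SCHED_FEAT(HRTICK, false) expands to under the machinery above.
 */
#if 0   /* expansion shown for illustration only */
enum {
        /* ... */
        __SCHED_FEAT_HRTICK,
        /* ... */
        __SCHED_FEAT_NR,
};

/* with CONFIG_SCHED_DEBUG && HAVE_JUMP_LABEL: a per-feature static branch */
static __always_inline bool static_branch_HRTICK(struct jump_label_key *key)
{
        return static_branch__false(key);       /* default-off, out of line */
}
/* sched_feat(HRTICK) -> static_branch_HRTICK(&sched_feat_keys[__SCHED_FEAT_HRTICK]) */

/* otherwise: sched_feat(HRTICK) -> sysctl_sched_features & (1UL << __SCHED_FEAT_HRTICK) */
#endif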
+
+static inline u64 global_rt_period(void)
+{
+       return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
+}
+
+static inline u64 global_rt_runtime(void)
+{
+       if (sysctl_sched_rt_runtime < 0)
+               return RUNTIME_INF;
+
+       return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
+}
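
/*
 * Illustrative worked example, not part of this patch: with the usual
 * defaults of sysctl_sched_rt_period = 1000000 and
 * sysctl_sched_rt_runtime = 950000 (both in usec), the helpers above yield
 *
 *   global_rt_period()  = 1000000 * NSEC_PER_USEC = 1 s
 *   global_rt_runtime() =  950000 * NSEC_PER_USEC = 0.95 s
 *
 * i.e. RT tasks may consume at most 95% of every 1 s period; setting
 * sysctl_sched_rt_runtime to -1 makes global_rt_runtime() return
 * RUNTIME_INF (no limit).
 */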
+
+
+
+static inline int task_current(struct rq *rq, struct task_struct *p)
+{
+       return rq->curr == p;
+}
+
+static inline int task_running(struct rq *rq, struct task_struct *p)
+{
+#ifdef CONFIG_SMP
+       return p->on_cpu;
+#else
+       return task_current(rq, p);
+#endif
+}
+
+
+#ifndef prepare_arch_switch
+# define prepare_arch_switch(next)     do { } while (0)
+#endif
+#ifndef finish_arch_switch
+# define finish_arch_switch(prev)      do { } while (0)
+#endif
+
+#ifndef __ARCH_WANT_UNLOCKED_CTXSW
+static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
+{
+#ifdef CONFIG_SMP
+       /*
+        * We can optimise this out completely for !SMP, because the
+        * SMP rebalancing from interrupt is the only thing that cares
+        * here.
+        */
+       next->on_cpu = 1;
+#endif
+}
+
+static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
+{
+#ifdef CONFIG_SMP
+       /*
+        * After ->on_cpu is cleared, the task can be moved to a different CPU.
+        * We must ensure this doesn't happen until the switch is completely
+        * finished.
+        */
+       smp_wmb();
+       prev->on_cpu = 0;
+#endif
+#ifdef CONFIG_DEBUG_SPINLOCK
+       /* this is a valid case when another task releases the spinlock */
+       rq->lock.owner = current;
+#endif
+       /*
+        * If we are tracking spinlock dependencies then we have to
+        * fix up the runqueue lock - which gets 'carried over' from
+        * prev into current:
+        */
+       spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
+
+       raw_spin_unlock_irq(&rq->lock);
+}
+
+#else /* __ARCH_WANT_UNLOCKED_CTXSW */
+static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
+{
+#ifdef CONFIG_SMP
+       /*
+        * We can optimise this out completely for !SMP, because the
+        * SMP rebalancing from interrupt is the only thing that cares
+        * here.
+        */
+       next->on_cpu = 1;
+#endif
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+       raw_spin_unlock_irq(&rq->lock);
+#else
+       raw_spin_unlock(&rq->lock);
+#endif
+}
+
+static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
+{
+#ifdef CONFIG_SMP
+       /*
+        * After ->on_cpu is cleared, the task can be moved to a different CPU.
+        * We must ensure this doesn't happen until the switch is completely
+        * finished.
+        */
+       smp_wmb();
+       prev->on_cpu = 0;
+#endif
+#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+       local_irq_enable();
+#endif
+}
+#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
+
+
+static inline void update_load_add(struct load_weight *lw, unsigned long inc)
+{
+       lw->weight += inc;
+       lw->inv_weight = 0;
+}
+
+static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
+{
+       lw->weight -= dec;
+       lw->inv_weight = 0;
+}
+
+static inline void update_load_set(struct load_weight *lw, unsigned long w)
+{
+       lw->weight = w;
+       lw->inv_weight = 0;
+}
+
+/*
+ * To aid in avoiding the subversion of "niceness" due to uneven distribution
+ * of tasks with abnormal "nice" values across CPUs, the contribution that
+ * each task makes to its run queue's load is weighted according to its
+ * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
+ * scaled version of the new time slice allocation that they receive on time
+ * slice expiry etc.
+ */
+
+#define WEIGHT_IDLEPRIO                3
+#define WMULT_IDLEPRIO         1431655765
+
+/*
+ * Nice levels are multiplicative, with a gentle 10% change for every
+ * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
+ * nice 1, it will get ~10% less CPU time than another CPU-bound task
+ * that remained on nice 0.
+ *
+ * The "10% effect" is relative and cumulative: from _any_ nice level,
+ * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
+ * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
+ * If a task goes up by ~10% and another task goes down by ~10% then
+ * the relative distance between them is ~25%.)
+ */
+static const int prio_to_weight[40] = {
+ /* -20 */     88761,     71755,     56483,     46273,     36291,
+ /* -15 */     29154,     23254,     18705,     14949,     11916,
+ /* -10 */      9548,      7620,      6100,      4904,      3906,
+ /*  -5 */      3121,      2501,      1991,      1586,      1277,
+ /*   0 */      1024,       820,       655,       526,       423,
+ /*   5 */       335,       272,       215,       172,       137,
+ /*  10 */       110,        87,        70,        56,        45,
+ /*  15 */        36,        29,        23,        18,        15,
+};
+
+/*
+ * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
+ *
+ * In cases where the weight does not change often, we can use the
+ * precalculated inverse to speed up arithmetic by turning divisions
+ * into multiplications:
+ */
+static const u32 prio_to_wmult[40] = {
+ /* -20 */     48388,     59856,     76040,     92818,    118348,
+ /* -15 */    147320,    184698,    229616,    287308,    360437,
+ /* -10 */    449829,    563644,    704093,    875809,   1099582,
+ /*  -5 */   1376151,   1717300,   2157191,   2708050,   3363326,
+ /*   0 */   4194304,   5237765,   6557202,   8165337,  10153587,
+ /*   5 */  12820798,  15790321,  19976592,  24970740,  31350126,
+ /*  10 */  39045157,  49367440,  61356676,  76695844,  95443717,
+ /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
+};
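
/*
 * Illustrative sketch, not part of this patch: the weight table above is
 * roughly 1024 / 1.25^nice, and prio_to_wmult[] holds ~2^32 / weight so
 * that dividing by a weight can be replaced by a multiply and a shift.
 * The comparison below uses the nice 0 and nice 5 rows.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
        const uint64_t weight[2] = { 1024, 335 };               /* prio_to_weight */
        const uint64_t wmult[2]  = { 4194304, 12820798 };       /* prio_to_wmult  */
        uint64_t delta = 3000000;       /* e.g. 3 ms of runtime, in ns */
        int i;

        for (i = 0; i < 2; i++) {
                /* scale delta by NICE_0_LOAD / weight, two nearly equivalent ways */
                uint64_t by_div  = delta * 1024 / weight[i];
                uint64_t by_mult = (delta * 1024 * wmult[i]) >> 32;

                printf("weight %4llu: division %llu, multiply+shift %llu\n",
                       (unsigned long long)weight[i],
                       (unsigned long long)by_div,
                       (unsigned long long)by_mult);
        }
        return 0;
}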
+
+/* Time spent by the tasks of the cpu accounting group executing in ... */
+enum cpuacct_stat_index {
+       CPUACCT_STAT_USER,      /* ... user mode */
+       CPUACCT_STAT_SYSTEM,    /* ... kernel mode */
+
+       CPUACCT_STAT_NSTATS,
+};
+
+
+#define sched_class_highest (&stop_sched_class)
+#define for_each_class(class) \
+   for (class = sched_class_highest; class; class = class->next)
+
+extern const struct sched_class stop_sched_class;
+extern const struct sched_class rt_sched_class;
+extern const struct sched_class fair_sched_class;
+extern const struct sched_class idle_sched_class;
+
+
+#ifdef CONFIG_SMP
+
+extern void trigger_load_balance(struct rq *rq, int cpu);
+extern void idle_balance(int this_cpu, struct rq *this_rq);
+
+#else  /* CONFIG_SMP */
+
+static inline void idle_balance(int cpu, struct rq *rq)
+{
+}
+
+#endif
+
+extern void sysrq_sched_debug_show(void);
+extern void sched_init_granularity(void);
+extern void update_max_interval(void);
+extern void update_group_power(struct sched_domain *sd, int cpu);
+extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu);
+extern void init_sched_rt_class(void);
+extern void init_sched_fair_class(void);
+
+extern void resched_task(struct task_struct *p);
+extern void resched_cpu(int cpu);
+
+extern struct rt_bandwidth def_rt_bandwidth;
+extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
+
+extern void update_cpu_load(struct rq *this_rq);
+
+#ifdef CONFIG_CGROUP_CPUACCT
+#include <linux/cgroup.h>
+/* track cpu usage of a group of tasks and its child groups */
+struct cpuacct {
+       struct cgroup_subsys_state css;
+       /* cpuusage holds pointer to a u64-type object on every cpu */
+       u64 __percpu *cpuusage;
+       struct kernel_cpustat __percpu *cpustat;
+};
+
+/* return cpu accounting group corresponding to this container */
+static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
+{
+       return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
+                           struct cpuacct, css);
+}
+
+/* return cpu accounting group to which this task belongs */
+static inline struct cpuacct *task_ca(struct task_struct *tsk)
+{
+       return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
+                           struct cpuacct, css);
+}
+
+static inline struct cpuacct *parent_ca(struct cpuacct *ca)
+{
+       if (!ca || !ca->css.cgroup->parent)
+               return NULL;
+       return cgroup_ca(ca->css.cgroup->parent);
+}
+
+extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
+#else
+static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
+#endif
+
+static inline void inc_nr_running(struct rq *rq)
+{
+       rq->nr_running++;
+}
+
+static inline void dec_nr_running(struct rq *rq)
+{
+       rq->nr_running--;
+}
+
+extern void update_rq_clock(struct rq *rq);
+
+extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
+extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
+
+extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
+
+extern const_debug unsigned int sysctl_sched_time_avg;
+extern const_debug unsigned int sysctl_sched_nr_migrate;
+extern const_debug unsigned int sysctl_sched_migration_cost;
+
+static inline u64 sched_avg_period(void)
+{
+       return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
+}
+
+void calc_load_account_idle(struct rq *this_rq);
+
+#ifdef CONFIG_SCHED_HRTICK
+
+/*
+ * Use hrtick when:
+ *  - enabled by features
+ *  - hrtimer is actually high res
+ */
+static inline int hrtick_enabled(struct rq *rq)
+{
+       if (!sched_feat(HRTICK))
+               return 0;
+       if (!cpu_active(cpu_of(rq)))
+               return 0;
+       return hrtimer_is_hres_active(&rq->hrtick_timer);
+}
+
+void hrtick_start(struct rq *rq, u64 delay);
+
+#else
+
+static inline int hrtick_enabled(struct rq *rq)
+{
+       return 0;
+}
+
+#endif /* CONFIG_SCHED_HRTICK */
+
+#ifdef CONFIG_SMP
+extern void sched_avg_update(struct rq *rq);
+static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
+{
+       rq->rt_avg += rt_delta;
+       sched_avg_update(rq);
+}
+#else
+static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
+static inline void sched_avg_update(struct rq *rq) { }
+#endif
+
+extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period);
+
+#ifdef CONFIG_SMP
+#ifdef CONFIG_PREEMPT
+
+static inline void double_rq_lock(struct rq *rq1, struct rq *rq2);
+
+/*
+ * fair double_lock_balance: Safely acquires both rq->locks in a fair
+ * way at the expense of forcing extra atomic operations in all
+ * invocations.  This assures that the double_lock is acquired using the
+ * same underlying policy as the spinlock_t on this architecture, which
+ * reduces latency compared to the unfair variant below.  However, it
+ * also adds more overhead and therefore may reduce throughput.
+ */
+static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
+       __releases(this_rq->lock)
+       __acquires(busiest->lock)
+       __acquires(this_rq->lock)
+{
+       raw_spin_unlock(&this_rq->lock);
+       double_rq_lock(this_rq, busiest);
+
+       return 1;
+}
+
+#else
+/*
+ * Unfair double_lock_balance: Optimizes throughput at the expense of
+ * latency by eliminating extra atomic operations when the locks are
+ * already in proper order on entry.  This favors lower cpu-ids and will
+ * grant the double lock to lower cpus over higher ids under contention,
+ * regardless of entry order into the function.
+ */
+static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
+       __releases(this_rq->lock)
+       __acquires(busiest->lock)
+       __acquires(this_rq->lock)
+{
+       int ret = 0;
+
+       if (unlikely(!raw_spin_trylock(&busiest->lock))) {
+               if (busiest < this_rq) {
+                       raw_spin_unlock(&this_rq->lock);
+                       raw_spin_lock(&busiest->lock);
+                       raw_spin_lock_nested(&this_rq->lock,
+                                             SINGLE_DEPTH_NESTING);
+                       ret = 1;
+               } else
+                       raw_spin_lock_nested(&busiest->lock,
+                                             SINGLE_DEPTH_NESTING);
+       }
+       return ret;
+}
+
+#endif /* CONFIG_PREEMPT */
+
+/*
+ * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ */
+static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+{
+       if (unlikely(!irqs_disabled())) {
+               /* printk() doesn't work well under rq->lock */
+               raw_spin_unlock(&this_rq->lock);
+               BUG_ON(1);
+       }
+
+       return _double_lock_balance(this_rq, busiest);
+}
+
+static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
+       __releases(busiest->lock)
+{
+       raw_spin_unlock(&busiest->lock);
+       lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
+}
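
/*
 * Illustrative sketch, not part of this patch: the typical calling pattern
 * for the pair above, as used by pull_rt_task() earlier in this patch.
 * this_rq->lock is held on entry; double_lock_balance() may drop and
 * retake it to honour the rq-address lock order, so anything read before
 * the call must be revalidated when it returns non-zero.
 */
static void ex_balance_one(struct rq *this_rq, struct rq *busiest)
{
        /* caller holds this_rq->lock with interrupts disabled */
        if (double_lock_balance(this_rq, busiest)) {
                /* this_rq->lock was dropped and retaken: recheck state */
        }

        /* both rq locks are held here; safe to move tasks between them */

        double_unlock_balance(this_rq, busiest);
        /* this_rq->lock is still held; busiest->lock has been released */
}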
+
+/*
+ * double_rq_lock - safely lock two runqueues
+ *
+ * Note this does not disable interrupts like task_rq_lock,
+ * you need to do so manually before calling.
+ */
+static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
+       __acquires(rq1->lock)
+       __acquires(rq2->lock)
+{
+       BUG_ON(!irqs_disabled());
+       if (rq1 == rq2) {
+               raw_spin_lock(&rq1->lock);
+               __acquire(rq2->lock);   /* Fake it out ;) */
+       } else {
+               if (rq1 < rq2) {
+                       raw_spin_lock(&rq1->lock);
+                       raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
+               } else {
+                       raw_spin_lock(&rq2->lock);
+                       raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
+               }
+       }
+}
+
+/*
+ * double_rq_unlock - safely unlock two runqueues
+ *
+ * Note this does not restore interrupts like task_rq_unlock,
+ * you need to do so manually after calling.
+ */
+static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
+       __releases(rq1->lock)
+       __releases(rq2->lock)
+{
+       raw_spin_unlock(&rq1->lock);
+       if (rq1 != rq2)
+               raw_spin_unlock(&rq2->lock);
+       else
+               __release(rq2->lock);
+}
+
+#else /* CONFIG_SMP */
+
+/*
+ * double_rq_lock - safely lock two runqueues
+ *
+ * Note this does not disable interrupts like task_rq_lock,
+ * you need to do so manually before calling.
+ */
+static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
+       __acquires(rq1->lock)
+       __acquires(rq2->lock)
+{
+       BUG_ON(!irqs_disabled());
+       BUG_ON(rq1 != rq2);
+       raw_spin_lock(&rq1->lock);
+       __acquire(rq2->lock);   /* Fake it out ;) */
+}
+
+/*
+ * double_rq_unlock - safely unlock two runqueues
+ *
+ * Note this does not restore interrupts like task_rq_unlock,
+ * you need to do so manually after calling.
+ */
+static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
+       __releases(rq1->lock)
+       __releases(rq2->lock)
+{
+       BUG_ON(rq1 != rq2);
+       raw_spin_unlock(&rq1->lock);
+       __release(rq2->lock);
+}
+
+#endif
+
+extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
+extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
+extern void print_cfs_stats(struct seq_file *m, int cpu);
+extern void print_rt_stats(struct seq_file *m, int cpu);
+
+extern void init_cfs_rq(struct cfs_rq *cfs_rq);
+extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
+extern void unthrottle_offline_cfs_rqs(struct rq *rq);
+
+extern void account_cfs_bandwidth_used(int enabled, int was_enabled);
+
+#ifdef CONFIG_NO_HZ
+enum rq_nohz_flag_bits {
+       NOHZ_TICK_STOPPED,
+       NOHZ_BALANCE_KICK,
+       NOHZ_IDLE,
+};
+
+#define nohz_flags(cpu)        (&cpu_rq(cpu)->nohz_flags)
+#endif
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
new file mode 100644 (file)
index 0000000..2a581ba
--- /dev/null
@@ -0,0 +1,111 @@
+
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+
+#include "sched.h"
+
+/*
+ * bump this up when changing the output format or the meaning of an existing
+ * format, so that tools can adapt (or abort)
+ */
+#define SCHEDSTAT_VERSION 15
+
+static int show_schedstat(struct seq_file *seq, void *v)
+{
+       int cpu;
+       int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9;
+       char *mask_str = kmalloc(mask_len, GFP_KERNEL);
+
+       if (mask_str == NULL)
+               return -ENOMEM;
+
+       seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
+       seq_printf(seq, "timestamp %lu\n", jiffies);
+       for_each_online_cpu(cpu) {
+               struct rq *rq = cpu_rq(cpu);
+#ifdef CONFIG_SMP
+               struct sched_domain *sd;
+               int dcount = 0;
+#endif
+
+               /* runqueue-specific stats */
+               seq_printf(seq,
+                   "cpu%d %u %u %u %u %u %u %llu %llu %lu",
+                   cpu, rq->yld_count,
+                   rq->sched_switch, rq->sched_count, rq->sched_goidle,
+                   rq->ttwu_count, rq->ttwu_local,
+                   rq->rq_cpu_time,
+                   rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
+
+               seq_printf(seq, "\n");
+
+#ifdef CONFIG_SMP
+               /* domain-specific stats */
+               rcu_read_lock();
+               for_each_domain(cpu, sd) {
+                       enum cpu_idle_type itype;
+
+                       cpumask_scnprintf(mask_str, mask_len,
+                                         sched_domain_span(sd));
+                       seq_printf(seq, "domain%d %s", dcount++, mask_str);
+                       for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
+                                       itype++) {
+                               seq_printf(seq, " %u %u %u %u %u %u %u %u",
+                                   sd->lb_count[itype],
+                                   sd->lb_balanced[itype],
+                                   sd->lb_failed[itype],
+                                   sd->lb_imbalance[itype],
+                                   sd->lb_gained[itype],
+                                   sd->lb_hot_gained[itype],
+                                   sd->lb_nobusyq[itype],
+                                   sd->lb_nobusyg[itype]);
+                       }
+                       seq_printf(seq,
+                                  " %u %u %u %u %u %u %u %u %u %u %u %u\n",
+                           sd->alb_count, sd->alb_failed, sd->alb_pushed,
+                           sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
+                           sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
+                           sd->ttwu_wake_remote, sd->ttwu_move_affine,
+                           sd->ttwu_move_balance);
+               }
+               rcu_read_unlock();
+#endif
+       }
+       kfree(mask_str);
+       return 0;
+}
+
+static int schedstat_open(struct inode *inode, struct file *file)
+{
+       unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
+       char *buf = kmalloc(size, GFP_KERNEL);
+       struct seq_file *m;
+       int res;
+
+       if (!buf)
+               return -ENOMEM;
+       res = single_open(file, show_schedstat, NULL);
+       if (!res) {
+               m = file->private_data;
+               m->buf = buf;
+               m->size = size;
+       } else
+               kfree(buf);
+       return res;
+}
+
+static const struct file_operations proc_schedstat_operations = {
+       .open    = schedstat_open,
+       .read    = seq_read,
+       .llseek  = seq_lseek,
+       .release = single_release,
+};
+
+static int __init proc_schedstat_init(void)
+{
+       proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
+       return 0;
+}
+module_init(proc_schedstat_init);
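
/*
 * Illustrative sketch, not part of this patch: a minimal userspace reader
 * for the per-cpu lines emitted by show_schedstat() above (version 15).
 * The fields follow the seq_printf() order: yld_count, sched_switch,
 * sched_count, sched_goidle, ttwu_count, ttwu_local, rq_cpu_time,
 * run_delay, pcount.
 */
#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/proc/schedstat", "r");
        char line[512];

        if (!f)
                return 1;

        while (fgets(line, sizeof(line), f)) {
                int cpu;
                unsigned long long running, waiting;
                unsigned long timeslices;

                /* skip the six leading %u counters, keep the last three */
                if (sscanf(line, "cpu%d %*u %*u %*u %*u %*u %*u %llu %llu %lu",
                           &cpu, &running, &waiting, &timeslices) == 4)
                        printf("cpu%d: ran %llu ns, waited %llu ns over %lu timeslices\n",
                               cpu, running, waiting, timeslices);
        }
        fclose(f);
        return 0;
}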
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
new file mode 100644 (file)
index 0000000..2ef90a5
--- /dev/null
@@ -0,0 +1,231 @@
+
+#ifdef CONFIG_SCHEDSTATS
+
+/*
+ * Expects runqueue lock to be held for atomicity of update
+ */
+static inline void
+rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
+{
+       if (rq) {
+               rq->rq_sched_info.run_delay += delta;
+               rq->rq_sched_info.pcount++;
+       }
+}
+
+/*
+ * Expects runqueue lock to be held for atomicity of update
+ */
+static inline void
+rq_sched_info_depart(struct rq *rq, unsigned long long delta)
+{
+       if (rq)
+               rq->rq_cpu_time += delta;
+}
+
+static inline void
+rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
+{
+       if (rq)
+               rq->rq_sched_info.run_delay += delta;
+}
+# define schedstat_inc(rq, field)      do { (rq)->field++; } while (0)
+# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
+# define schedstat_set(var, val)       do { var = (val); } while (0)
+#else /* !CONFIG_SCHEDSTATS */
+static inline void
+rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
+{}
+static inline void
+rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
+{}
+static inline void
+rq_sched_info_depart(struct rq *rq, unsigned long long delta)
+{}
+# define schedstat_inc(rq, field)      do { } while (0)
+# define schedstat_add(rq, field, amt) do { } while (0)
+# define schedstat_set(var, val)       do { } while (0)
+#endif
+
+#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
+static inline void sched_info_reset_dequeued(struct task_struct *t)
+{
+       t->sched_info.last_queued = 0;
+}
+
+/*
+ * We are interested in knowing how long it was from the *first* time a
+ * task was queued to the time that it finally hit a cpu; we call this routine
+ * from dequeue_task() to account for possible rq->clock skew across cpus. The
+ * delta taken on each cpu would annul the skew.
+ */
+static inline void sched_info_dequeued(struct task_struct *t)
+{
+       unsigned long long now = task_rq(t)->clock, delta = 0;
+
+       if (unlikely(sched_info_on()))
+               if (t->sched_info.last_queued)
+                       delta = now - t->sched_info.last_queued;
+       sched_info_reset_dequeued(t);
+       t->sched_info.run_delay += delta;
+
+       rq_sched_info_dequeued(task_rq(t), delta);
+}
+
+/*
+ * Called when a task finally hits the cpu.  We can now calculate how
+ * long it was waiting to run.  We also note when it began so that we
+ * can keep stats on how long its timeslice is.
+ */
+static void sched_info_arrive(struct task_struct *t)
+{
+       unsigned long long now = task_rq(t)->clock, delta = 0;
+
+       if (t->sched_info.last_queued)
+               delta = now - t->sched_info.last_queued;
+       sched_info_reset_dequeued(t);
+       t->sched_info.run_delay += delta;
+       t->sched_info.last_arrival = now;
+       t->sched_info.pcount++;
+
+       rq_sched_info_arrive(task_rq(t), delta);
+}
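
/*
 * Illustrative worked example, not part of this patch, for the run_delay
 * accounting above (the clock values are made up):
 *
 *   t=100  enqueue_task() -> sched_info_queued():  last_queued = 100
 *   t=160  task runs      -> sched_info_arrive():  delta = 160 - 100 = 60
 *                            run_delay += 60, last_arrival = 160, pcount++
 *
 * If the task is instead dequeued before ever running, say at t=140,
 * sched_info_dequeued() adds 140 - 100 = 40 to run_delay and clears
 * last_queued, so a later requeue starts a fresh measurement.
 */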
+
+/*
+ * This function is only called from enqueue_task(), and it only updates
+ * the timestamp if it is not already set.  It's assumed that
+ * sched_info_dequeued() will clear that stamp when appropriate.
+ */
+static inline void sched_info_queued(struct task_struct *t)
+{
+       if (unlikely(sched_info_on()))
+               if (!t->sched_info.last_queued)
+                       t->sched_info.last_queued = task_rq(t)->clock;
+}
+
+/*
+ * Called when a process ceases being the active-running process, either
+ * voluntarily or involuntarily.  Now we can calculate how long we ran.
+ * Also, if the process is still in the TASK_RUNNING state, call
+ * sched_info_queued() to mark that it has now again started waiting on
+ * the runqueue.
+ */
+static inline void sched_info_depart(struct task_struct *t)
+{
+       unsigned long long delta = task_rq(t)->clock -
+                                       t->sched_info.last_arrival;
+
+       rq_sched_info_depart(task_rq(t), delta);
+
+       if (t->state == TASK_RUNNING)
+               sched_info_queued(t);
+}
+
+/*
+ * Called when tasks are switched involuntarily due, typically, to expiring
+ * their time slice.  (This may also be called when switching to or from
+ * the idle task.)  We are only called when prev != next.
+ */
+static inline void
+__sched_info_switch(struct task_struct *prev, struct task_struct *next)
+{
+       struct rq *rq = task_rq(prev);
+
+       /*
+        * prev now departs the cpu.  It's not interesting to record
+        * stats about how efficient we were at scheduling the idle
+        * process, however.
+        */
+       if (prev != rq->idle)
+               sched_info_depart(prev);
+
+       if (next != rq->idle)
+               sched_info_arrive(next);
+}
+static inline void
+sched_info_switch(struct task_struct *prev, struct task_struct *next)
+{
+       if (unlikely(sched_info_on()))
+               __sched_info_switch(prev, next);
+}
+#else
+#define sched_info_queued(t)                   do { } while (0)
+#define sched_info_reset_dequeued(t)   do { } while (0)
+#define sched_info_dequeued(t)                 do { } while (0)
+#define sched_info_switch(t, next)             do { } while (0)
+#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
+
+/*
+ * The following are functions that support scheduler-internal time accounting.
+ * These functions are generally called at the timer tick.  None of this depends
+ * on CONFIG_SCHEDSTATS.
+ */
+
+/**
+ * account_group_user_time - Maintain utime for a thread group.
+ *
+ * @tsk:       Pointer to task structure.
+ * @cputime:   Time value by which to increment the utime field of the
+ *             thread_group_cputime structure.
+ *
+ * If thread group time is being maintained, get the structure for the
+ * running CPU and update the utime field there.
+ */
+static inline void account_group_user_time(struct task_struct *tsk,
+                                          cputime_t cputime)
+{
+       struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
+
+       if (!cputimer->running)
+               return;
+
+       raw_spin_lock(&cputimer->lock);
+       cputimer->cputime.utime += cputime;
+       raw_spin_unlock(&cputimer->lock);
+}
+
+/**
+ * account_group_system_time - Maintain stime for a thread group.
+ *
+ * @tsk:       Pointer to task structure.
+ * @cputime:   Time value by which to increment the stime field of the
+ *             thread_group_cputime structure.
+ *
+ * If thread group time is being maintained, get the structure for the
+ * running CPU and update the stime field there.
+ */
+static inline void account_group_system_time(struct task_struct *tsk,
+                                            cputime_t cputime)
+{
+       struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
+
+       if (!cputimer->running)
+               return;
+
+       raw_spin_lock(&cputimer->lock);
+       cputimer->cputime.stime += cputime;
+       raw_spin_unlock(&cputimer->lock);
+}
+
+/**
+ * account_group_exec_runtime - Maintain exec runtime for a thread group.
+ *
+ * @tsk:       Pointer to task structure.
+ * @ns:                Time value by which to increment the sum_exec_runtime field
+ *             of the thread_group_cputime structure.
+ *
+ * If thread group time is being maintained, get the structure for the
+ * running CPU and update the sum_exec_runtime field there.
+ */
+static inline void account_group_exec_runtime(struct task_struct *tsk,
+                                             unsigned long long ns)
+{
+       struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
+
+       if (!cputimer->running)
+               return;
+
+       raw_spin_lock(&cputimer->lock);
+       cputimer->cputime.sum_exec_runtime += ns;
+       raw_spin_unlock(&cputimer->lock);
+}
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
new file mode 100644 (file)
index 0000000..7b386e8
--- /dev/null
@@ -0,0 +1,108 @@
+#include "sched.h"
+
+/*
+ * stop-task scheduling class.
+ *
+ * The stop task is the highest priority task in the system; it preempts
+ * everything and will be preempted by nothing.
+ *
+ * See kernel/stop_machine.c
+ */
+
+#ifdef CONFIG_SMP
+static int
+select_task_rq_stop(struct task_struct *p, int sd_flag, int flags)
+{
+       return task_cpu(p); /* stop tasks never migrate */
+}
+#endif /* CONFIG_SMP */
+
+static void
+check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
+{
+       /* we're never preempted */
+}
+
+static struct task_struct *pick_next_task_stop(struct rq *rq)
+{
+       struct task_struct *stop = rq->stop;
+
+       if (stop && stop->on_rq)
+               return stop;
+
+       return NULL;
+}
+
+static void
+enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
+{
+       inc_nr_running(rq);
+}
+
+static void
+dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
+{
+       dec_nr_running(rq);
+}
+
+static void yield_task_stop(struct rq *rq)
+{
+       BUG(); /* the stop task should never yield, it's pointless. */
+}
+
+static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
+{
+}
+
+static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
+{
+}
+
+static void set_curr_task_stop(struct rq *rq)
+{
+}
+
+static void switched_to_stop(struct rq *rq, struct task_struct *p)
+{
+       BUG(); /* it's impossible to change to this class */
+}
+
+static void
+prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio)
+{
+       BUG(); /* how!?, what priority? */
+}
+
+static unsigned int
+get_rr_interval_stop(struct rq *rq, struct task_struct *task)
+{
+       return 0;
+}
+
+/*
+ * Simple, special scheduling class for the per-CPU stop tasks:
+ */
+const struct sched_class stop_sched_class = {
+       .next                   = &rt_sched_class,
+
+       .enqueue_task           = enqueue_task_stop,
+       .dequeue_task           = dequeue_task_stop,
+       .yield_task             = yield_task_stop,
+
+       .check_preempt_curr     = check_preempt_curr_stop,
+
+       .pick_next_task         = pick_next_task_stop,
+       .put_prev_task          = put_prev_task_stop,
+
+#ifdef CONFIG_SMP
+       .select_task_rq         = select_task_rq_stop,
+#endif
+
+       .set_curr_task          = set_curr_task_stop,
+       .task_tick              = task_tick_stop,
+
+       .get_rr_interval        = get_rr_interval_stop,
+
+       .prio_changed           = prio_changed_stop,
+       .switched_to            = switched_to_stop,
+};
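The stop class above is little more than a const table of callbacks, most of them intentionally empty. A toy userspace analogue of such an ops table, with hypothetical toy_* types rather than the kernel's struct sched_class:

#include <stdio.h>

struct toy_rq { int nr_running; };

/* A scheduling-class-like ops table: behaviour lives in function pointers. */
struct toy_sched_class {
        void (*enqueue_task)(struct toy_rq *rq);
        void (*dequeue_task)(struct toy_rq *rq);
};

static void enqueue_task_toy(struct toy_rq *rq) { rq->nr_running++; }
static void dequeue_task_toy(struct toy_rq *rq) { rq->nr_running--; }

static const struct toy_sched_class toy_class = {
        .enqueue_task = enqueue_task_toy,
        .dequeue_task = dequeue_task_toy,
};

int main(void)
{
        struct toy_rq rq = { 0 };

        toy_class.enqueue_task(&rq);
        printf("nr_running = %d\n", rq.nr_running);
        toy_class.dequeue_task(&rq);
        return 0;
}

The design point is the same as in the diff: callers only ever go through the ops table, so a class whose hooks are mostly no-ops still plugs into the generic code without special cases.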
diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c
deleted file mode 100644 (file)
index 429242f..0000000
+++ /dev/null
@@ -1,275 +0,0 @@
-#ifdef CONFIG_SCHED_AUTOGROUP
-
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/kallsyms.h>
-#include <linux/utsname.h>
-
-unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
-static struct autogroup autogroup_default;
-static atomic_t autogroup_seq_nr;
-
-static void __init autogroup_init(struct task_struct *init_task)
-{
-       autogroup_default.tg = &root_task_group;
-       kref_init(&autogroup_default.kref);
-       init_rwsem(&autogroup_default.lock);
-       init_task->signal->autogroup = &autogroup_default;
-}
-
-static inline void autogroup_free(struct task_group *tg)
-{
-       kfree(tg->autogroup);
-}
-
-static inline void autogroup_destroy(struct kref *kref)
-{
-       struct autogroup *ag = container_of(kref, struct autogroup, kref);
-
-#ifdef CONFIG_RT_GROUP_SCHED
-       /* We've redirected RT tasks to the root task group... */
-       ag->tg->rt_se = NULL;
-       ag->tg->rt_rq = NULL;
-#endif
-       sched_destroy_group(ag->tg);
-}
-
-static inline void autogroup_kref_put(struct autogroup *ag)
-{
-       kref_put(&ag->kref, autogroup_destroy);
-}
-
-static inline struct autogroup *autogroup_kref_get(struct autogroup *ag)
-{
-       kref_get(&ag->kref);
-       return ag;
-}
-
-static inline struct autogroup *autogroup_task_get(struct task_struct *p)
-{
-       struct autogroup *ag;
-       unsigned long flags;
-
-       if (!lock_task_sighand(p, &flags))
-               return autogroup_kref_get(&autogroup_default);
-
-       ag = autogroup_kref_get(p->signal->autogroup);
-       unlock_task_sighand(p, &flags);
-
-       return ag;
-}
-
-#ifdef CONFIG_RT_GROUP_SCHED
-static void free_rt_sched_group(struct task_group *tg);
-#endif
-
-static inline struct autogroup *autogroup_create(void)
-{
-       struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL);
-       struct task_group *tg;
-
-       if (!ag)
-               goto out_fail;
-
-       tg = sched_create_group(&root_task_group);
-
-       if (IS_ERR(tg))
-               goto out_free;
-
-       kref_init(&ag->kref);
-       init_rwsem(&ag->lock);
-       ag->id = atomic_inc_return(&autogroup_seq_nr);
-       ag->tg = tg;
-#ifdef CONFIG_RT_GROUP_SCHED
-       /*
-        * Autogroup RT tasks are redirected to the root task group
-        * so we don't have to move tasks around upon policy change,
-        * or flail around trying to allocate bandwidth on the fly.
-        * A bandwidth exception in __sched_setscheduler() allows
-        * the policy change to proceed.  Thereafter, task_group()
-        * returns &root_task_group, so zero bandwidth is required.
-        */
-       free_rt_sched_group(tg);
-       tg->rt_se = root_task_group.rt_se;
-       tg->rt_rq = root_task_group.rt_rq;
-#endif
-       tg->autogroup = ag;
-
-       return ag;
-
-out_free:
-       kfree(ag);
-out_fail:
-       if (printk_ratelimit()) {
-               printk(KERN_WARNING "autogroup_create: %s failure.\n",
-                       ag ? "sched_create_group()" : "kmalloc()");
-       }
-
-       return autogroup_kref_get(&autogroup_default);
-}
-
-static inline bool
-task_wants_autogroup(struct task_struct *p, struct task_group *tg)
-{
-       if (tg != &root_task_group)
-               return false;
-
-       if (p->sched_class != &fair_sched_class)
-               return false;
-
-       /*
-        * We can only assume the task group can't go away on us if
-        * autogroup_move_group() can see us on ->thread_group list.
-        */
-       if (p->flags & PF_EXITING)
-               return false;
-
-       return true;
-}
-
-static inline bool task_group_is_autogroup(struct task_group *tg)
-{
-       return !!tg->autogroup;
-}
-
-static inline struct task_group *
-autogroup_task_group(struct task_struct *p, struct task_group *tg)
-{
-       int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
-
-       if (enabled && task_wants_autogroup(p, tg))
-               return p->signal->autogroup->tg;
-
-       return tg;
-}
-
-static void
-autogroup_move_group(struct task_struct *p, struct autogroup *ag)
-{
-       struct autogroup *prev;
-       struct task_struct *t;
-       unsigned long flags;
-
-       BUG_ON(!lock_task_sighand(p, &flags));
-
-       prev = p->signal->autogroup;
-       if (prev == ag) {
-               unlock_task_sighand(p, &flags);
-               return;
-       }
-
-       p->signal->autogroup = autogroup_kref_get(ag);
-
-       if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled))
-               goto out;
-
-       t = p;
-       do {
-               sched_move_task(t);
-       } while_each_thread(p, t);
-
-out:
-       unlock_task_sighand(p, &flags);
-       autogroup_kref_put(prev);
-}
-
-/* Allocates GFP_KERNEL, cannot be called under any spinlock */
-void sched_autogroup_create_attach(struct task_struct *p)
-{
-       struct autogroup *ag = autogroup_create();
-
-       autogroup_move_group(p, ag);
-       /* drop extra reference added by autogroup_create() */
-       autogroup_kref_put(ag);
-}
-EXPORT_SYMBOL(sched_autogroup_create_attach);
-
-/* Cannot be called under siglock.  Currently has no users */
-void sched_autogroup_detach(struct task_struct *p)
-{
-       autogroup_move_group(p, &autogroup_default);
-}
-EXPORT_SYMBOL(sched_autogroup_detach);
-
-void sched_autogroup_fork(struct signal_struct *sig)
-{
-       sig->autogroup = autogroup_task_get(current);
-}
-
-void sched_autogroup_exit(struct signal_struct *sig)
-{
-       autogroup_kref_put(sig->autogroup);
-}
-
-static int __init setup_autogroup(char *str)
-{
-       sysctl_sched_autogroup_enabled = 0;
-
-       return 1;
-}
-
-__setup("noautogroup", setup_autogroup);
-
-#ifdef CONFIG_PROC_FS
-
-int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice)
-{
-       static unsigned long next = INITIAL_JIFFIES;
-       struct autogroup *ag;
-       int err;
-
-       if (*nice < -20 || *nice > 19)
-               return -EINVAL;
-
-       err = security_task_setnice(current, *nice);
-       if (err)
-               return err;
-
-       if (*nice < 0 && !can_nice(current, *nice))
-               return -EPERM;
-
-       /* this is a heavy operation taking global locks.. */
-       if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next))
-               return -EAGAIN;
-
-       next = HZ / 10 + jiffies;
-       ag = autogroup_task_get(p);
-
-       down_write(&ag->lock);
-       err = sched_group_set_shares(ag->tg, prio_to_weight[*nice + 20]);
-       if (!err)
-               ag->nice = *nice;
-       up_write(&ag->lock);
-
-       autogroup_kref_put(ag);
-
-       return err;
-}
-
-void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
-{
-       struct autogroup *ag = autogroup_task_get(p);
-
-       if (!task_group_is_autogroup(ag->tg))
-               goto out;
-
-       down_read(&ag->lock);
-       seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice);
-       up_read(&ag->lock);
-
-out:
-       autogroup_kref_put(ag);
-}
-#endif /* CONFIG_PROC_FS */
-
-#ifdef CONFIG_SCHED_DEBUG
-static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
-{
-       if (!task_group_is_autogroup(tg))
-               return 0;
-
-       return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
-}
-#endif /* CONFIG_SCHED_DEBUG */
-
-#endif /* CONFIG_SCHED_AUTOGROUP */
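Autogroup lifetime in the file above is managed with the kref get/put pattern: each user holds a reference and the final put runs the destructor. A minimal userspace sketch of the same idea using C11 atomics instead of struct kref (all names hypothetical):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical analogue of the kref-managed struct autogroup. */
struct toy_group {
        atomic_int refcount;
        long id;
};

static struct toy_group *toy_group_create(long id)
{
        struct toy_group *g = malloc(sizeof(*g));

        if (!g)
                abort();
        atomic_init(&g->refcount, 1);   /* creator holds the first reference */
        g->id = id;
        return g;
}

static struct toy_group *toy_group_get(struct toy_group *g)
{
        atomic_fetch_add(&g->refcount, 1);
        return g;
}

static void toy_group_put(struct toy_group *g)
{
        /* The last put frees the object, like kref_put() + autogroup_destroy(). */
        if (atomic_fetch_sub(&g->refcount, 1) == 1)
                free(g);
}

int main(void)
{
        struct toy_group *g = toy_group_create(42);
        struct toy_group *extra = toy_group_get(g);   /* e.g. attach to a task */

        printf("group %ld has %d references\n", extra->id,
               atomic_load(&extra->refcount));
        toy_group_put(extra);
        toy_group_put(g);    /* refcount drops to zero here, object is freed */
        return 0;
}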
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h
deleted file mode 100644 (file)
index c2f0e72..0000000
+++ /dev/null
@@ -1,42 +0,0 @@
-#ifdef CONFIG_SCHED_AUTOGROUP
-
-struct autogroup {
-       /*
-        * The reference count doesn't track how many threads are
-        * attached to this autogroup right now; it stands for the
-        * number of tasks that could use this autogroup.
-        */
-       struct kref             kref;
-       struct task_group       *tg;
-       struct rw_semaphore     lock;
-       unsigned long           id;
-       int                     nice;
-};
-
-static inline bool task_group_is_autogroup(struct task_group *tg);
-static inline struct task_group *
-autogroup_task_group(struct task_struct *p, struct task_group *tg);
-
-#else /* !CONFIG_SCHED_AUTOGROUP */
-
-static inline void autogroup_init(struct task_struct *init_task) {  }
-static inline void autogroup_free(struct task_group *tg) { }
-static inline bool task_group_is_autogroup(struct task_group *tg)
-{
-       return 0;
-}
-
-static inline struct task_group *
-autogroup_task_group(struct task_struct *p, struct task_group *tg)
-{
-       return tg;
-}
-
-#ifdef CONFIG_SCHED_DEBUG
-static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
-{
-       return 0;
-}
-#endif
-
-#endif /* CONFIG_SCHED_AUTOGROUP */
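The header above uses the common kernel idiom of empty static inline stubs when the config option is disabled, so callers compile the same either way. A self-contained illustration of that idiom under an assumed TOY_FEATURE macro:

#include <stdio.h>

#ifdef TOY_FEATURE
static inline int toy_feature_path(char *buf, int buflen)
{
        return snprintf(buf, buflen, "/toy-feature");
}
#else
/* With the feature off, callers still compile; the stub simply does nothing. */
static inline int toy_feature_path(char *buf, int buflen)
{
        (void)buf;
        (void)buflen;
        return 0;
}
#endif

int main(void)
{
        char buf[32];

        printf("wrote %d bytes\n", toy_feature_path(buf, sizeof(buf)));
        return 0;
}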
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
deleted file mode 100644 (file)
index c685e31..0000000
+++ /dev/null
@@ -1,350 +0,0 @@
-/*
- * sched_clock for unstable cpu clocks
- *
- *  Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
- *
- *  Updates and enhancements:
- *    Copyright (C) 2008 Red Hat, Inc. Steven Rostedt <srostedt@redhat.com>
- *
- * Based on code by:
- *   Ingo Molnar <mingo@redhat.com>
- *   Guillaume Chazarain <guichaz@gmail.com>
- *
- *
- * What:
- *
- * cpu_clock(i) provides a fast (execution time) high resolution
- * clock with bounded drift between CPUs. The value of cpu_clock(i)
- * is monotonic for constant i. The timestamp returned is in nanoseconds.
- *
- * ######################### BIG FAT WARNING ##########################
- * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
- * # go backwards !!                                                  #
- * ####################################################################
- *
- * There is no strict promise about the base, although it tends to start
- * at 0 on boot (but people really shouldn't rely on that).
- *
- * cpu_clock(i)       -- can be used from any context, including NMI.
- * sched_clock_cpu(i) -- must be used with local IRQs disabled (implied by NMI)
- * local_clock()      -- is cpu_clock() on the current cpu.
- *
- * How:
- *
- * When !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, the implementation simply uses
- * sched_clock(), which is then assumed to provide these properties (mostly
- * this means the architecture provides a globally synchronized highres
- * time source).
- *
- * Otherwise it tries to create a semi stable clock from a mixture of other
- * clocks, including:
- *
- *  - GTOD (clock monotonic)
- *  - sched_clock()
- *  - explicit idle events
- *
- * We use GTOD as the base and use sched_clock() deltas to improve resolution.
- * The deltas are filtered to provide monotonicity and to keep them within an
- * expected window.
- *
- * Furthermore, explicit sleep and wakeup hooks allow us to account for time
- * that is otherwise invisible (TSC gets stopped).
- *
- *
- * Notes:
- *
- * The !IRQ-safety of sched_clock() and sched_clock_cpu() comes from things
- * like cpufreq interrupts that can change the base clock (TSC) multiplier
- * and cause funny jumps in time -- although the filtering provided by
- * sched_clock_cpu() should mitigate serious artifacts, we cannot rely on it
- * in general, since for !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK we fully rely on
- * sched_clock().
- */
-#include <linux/spinlock.h>
-#include <linux/hardirq.h>
-#include <linux/export.h>
-#include <linux/percpu.h>
-#include <linux/ktime.h>
-#include <linux/sched.h>
-
-/*
- * Scheduler clock - returns current time in nanosec units.
- * This is the default implementation.
- * Architectures and sub-architectures can override this.
- */
-unsigned long long __attribute__((weak)) sched_clock(void)
-{
-       return (unsigned long long)(jiffies - INITIAL_JIFFIES)
-                                       * (NSEC_PER_SEC / HZ);
-}
-EXPORT_SYMBOL_GPL(sched_clock);
-
-__read_mostly int sched_clock_running;
-
-#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
-__read_mostly int sched_clock_stable;
-
-struct sched_clock_data {
-       u64                     tick_raw;
-       u64                     tick_gtod;
-       u64                     clock;
-};
-
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
-
-static inline struct sched_clock_data *this_scd(void)
-{
-       return &__get_cpu_var(sched_clock_data);
-}
-
-static inline struct sched_clock_data *cpu_sdc(int cpu)
-{
-       return &per_cpu(sched_clock_data, cpu);
-}
-
-void sched_clock_init(void)
-{
-       u64 ktime_now = ktime_to_ns(ktime_get());
-       int cpu;
-
-       for_each_possible_cpu(cpu) {
-               struct sched_clock_data *scd = cpu_sdc(cpu);
-
-               scd->tick_raw = 0;
-               scd->tick_gtod = ktime_now;
-               scd->clock = ktime_now;
-       }
-
-       sched_clock_running = 1;
-}
-
-/*
- * min and max, except they take wrapping into account
- */
-
-static inline u64 wrap_min(u64 x, u64 y)
-{
-       return (s64)(x - y) < 0 ? x : y;
-}
-
-static inline u64 wrap_max(u64 x, u64 y)
-{
-       return (s64)(x - y) > 0 ? x : y;
-}
-
-/*
- * update the percpu scd from the raw @now value
- *
- *  - filter out backward motion
- *  - use the GTOD tick value to create a window to filter crazy TSC values
- */
-static u64 sched_clock_local(struct sched_clock_data *scd)
-{
-       u64 now, clock, old_clock, min_clock, max_clock;
-       s64 delta;
-
-again:
-       now = sched_clock();
-       delta = now - scd->tick_raw;
-       if (unlikely(delta < 0))
-               delta = 0;
-
-       old_clock = scd->clock;
-
-       /*
-        * scd->clock = clamp(scd->tick_gtod + delta,
-        *                    max(scd->tick_gtod, scd->clock),
-        *                    scd->tick_gtod + TICK_NSEC);
-        */
-
-       clock = scd->tick_gtod + delta;
-       min_clock = wrap_max(scd->tick_gtod, old_clock);
-       max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC);
-
-       clock = wrap_max(clock, min_clock);
-       clock = wrap_min(clock, max_clock);
-
-       if (cmpxchg64(&scd->clock, old_clock, clock) != old_clock)
-               goto again;
-
-       return clock;
-}
-
-static u64 sched_clock_remote(struct sched_clock_data *scd)
-{
-       struct sched_clock_data *my_scd = this_scd();
-       u64 this_clock, remote_clock;
-       u64 *ptr, old_val, val;
-
-       sched_clock_local(my_scd);
-again:
-       this_clock = my_scd->clock;
-       remote_clock = scd->clock;
-
-       /*
-        * Use the opportunity that we have both locks
-        * taken to couple the two clocks: we take the
-        * larger time as the latest time for both
-        * runqueues. (this creates monotonic movement)
-        */
-       if (likely((s64)(remote_clock - this_clock) < 0)) {
-               ptr = &scd->clock;
-               old_val = remote_clock;
-               val = this_clock;
-       } else {
-               /*
-                * Should be rare, but possible:
-                */
-               ptr = &my_scd->clock;
-               old_val = this_clock;
-               val = remote_clock;
-       }
-
-       if (cmpxchg64(ptr, old_val, val) != old_val)
-               goto again;
-
-       return val;
-}
-
-/*
- * Similar to cpu_clock(), but requires local IRQs to be disabled.
- *
- * See cpu_clock().
- */
-u64 sched_clock_cpu(int cpu)
-{
-       struct sched_clock_data *scd;
-       u64 clock;
-
-       WARN_ON_ONCE(!irqs_disabled());
-
-       if (sched_clock_stable)
-               return sched_clock();
-
-       if (unlikely(!sched_clock_running))
-               return 0ull;
-
-       scd = cpu_sdc(cpu);
-
-       if (cpu != smp_processor_id())
-               clock = sched_clock_remote(scd);
-       else
-               clock = sched_clock_local(scd);
-
-       return clock;
-}
-
-void sched_clock_tick(void)
-{
-       struct sched_clock_data *scd;
-       u64 now, now_gtod;
-
-       if (sched_clock_stable)
-               return;
-
-       if (unlikely(!sched_clock_running))
-               return;
-
-       WARN_ON_ONCE(!irqs_disabled());
-
-       scd = this_scd();
-       now_gtod = ktime_to_ns(ktime_get());
-       now = sched_clock();
-
-       scd->tick_raw = now;
-       scd->tick_gtod = now_gtod;
-       sched_clock_local(scd);
-}
-
-/*
- * We are going deep-idle (irqs are disabled):
- */
-void sched_clock_idle_sleep_event(void)
-{
-       sched_clock_cpu(smp_processor_id());
-}
-EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
-
-/*
- * We just idled delta nanoseconds (called with irqs disabled):
- */
-void sched_clock_idle_wakeup_event(u64 delta_ns)
-{
-       if (timekeeping_suspended)
-               return;
-
-       sched_clock_tick();
-       touch_softlockup_watchdog();
-}
-EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
-
-/*
- * As outlined at the top, provides a fast, high resolution, nanosecond
- * time source that is monotonic per cpu argument and has bounded drift
- * between cpus.
- *
- * ######################### BIG FAT WARNING ##########################
- * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
- * # go backwards !!                                                  #
- * ####################################################################
- */
-u64 cpu_clock(int cpu)
-{
-       u64 clock;
-       unsigned long flags;
-
-       local_irq_save(flags);
-       clock = sched_clock_cpu(cpu);
-       local_irq_restore(flags);
-
-       return clock;
-}
-
-/*
- * Similar to cpu_clock() for the current cpu. Time will only be observed
- * to be monotonic if care is taken to only compare timestamps taken on the
- * same CPU.
- *
- * See cpu_clock().
- */
-u64 local_clock(void)
-{
-       u64 clock;
-       unsigned long flags;
-
-       local_irq_save(flags);
-       clock = sched_clock_cpu(smp_processor_id());
-       local_irq_restore(flags);
-
-       return clock;
-}
-
-#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
-
-void sched_clock_init(void)
-{
-       sched_clock_running = 1;
-}
-
-u64 sched_clock_cpu(int cpu)
-{
-       if (unlikely(!sched_clock_running))
-               return 0;
-
-       return sched_clock();
-}
-
-u64 cpu_clock(int cpu)
-{
-       return sched_clock_cpu(cpu);
-}
-
-u64 local_clock(void)
-{
-       return sched_clock_cpu(0);
-}
-
-#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
-
-EXPORT_SYMBOL_GPL(cpu_clock);
-EXPORT_SYMBOL_GPL(local_clock);
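The heart of sched_clock_local() above is wrap-safe min/max plus clamping the raw delta-based clock into the [tick_gtod, tick_gtod + TICK_NSEC] window. The sketch below reproduces only that arithmetic in userspace with made-up inputs; it leaves out the per-cpu state and the cmpxchg retry loop:

#include <stdint.h>
#include <stdio.h>

/* min/max on u64 values that may have wrapped, as in sched_clock.c. */
static uint64_t wrap_min(uint64_t x, uint64_t y)
{
        return (int64_t)(x - y) < 0 ? x : y;
}

static uint64_t wrap_max(uint64_t x, uint64_t y)
{
        return (int64_t)(x - y) > 0 ? x : y;
}

/* Clamp a raw delta-based clock into the window allowed by the last tick. */
static uint64_t clamp_clock(uint64_t tick_gtod, uint64_t old_clock,
                            uint64_t delta, uint64_t tick_nsec)
{
        uint64_t clock = tick_gtod + delta;
        uint64_t min_clock = wrap_max(tick_gtod, old_clock);
        uint64_t max_clock = wrap_max(old_clock, tick_gtod + tick_nsec);

        clock = wrap_max(clock, min_clock);   /* never go backwards */
        clock = wrap_min(clock, max_clock);   /* never run ahead of the window */
        return clock;
}

int main(void)
{
        /* A "crazy" 50 ms TSC delta gets clamped to one 1 ms tick window. */
        uint64_t c = clamp_clock(1000000000ull, 1000000500ull,
                                 50000000ull, 1000000ull);

        printf("filtered clock = %llu\n", (unsigned long long)c);
        return 0;
}

With these example numbers it prints 1001000000: the oversized raw delta is discarded and the clock only advances to the end of the current tick window.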
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
deleted file mode 100644 (file)
index a86cf9d..0000000
+++ /dev/null
@@ -1,241 +0,0 @@
-/*
- *  kernel/sched_cpupri.c
- *
- *  CPU priority management
- *
- *  Copyright (C) 2007-2008 Novell
- *
- *  Author: Gregory Haskins <ghaskins@novell.com>
- *
- *  This code tracks the priority of each CPU so that global migration
- *  decisions are easy to calculate.  Each CPU can be in a state as follows:
- *
- *                 (INVALID), IDLE, NORMAL, RT1, ... RT99
- *
- *  going from the lowest priority to the highest.  CPUs in the INVALID state
- *  are not eligible for routing.  The system maintains this state with
- *  a 2 dimensional bitmap (the first for priority class, the second for cpus
- *  in that class).  Therefore a typical application without affinity
- *  restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
- *  searches).  For tasks with affinity restrictions, the algorithm has a
- *  worst case complexity of O(min(102, nr_domcpus)), though the scenario that
- *  yields the worst case search is fairly contrived.
- *
- *  This program is free software; you can redistribute it and/or
- *  modify it under the terms of the GNU General Public License
- *  as published by the Free Software Foundation; version 2
- *  of the License.
- */
-
-#include <linux/gfp.h>
-#include "sched_cpupri.h"
-
-/* Convert between a 140-based task->prio and our 102-based cpupri */
-static int convert_prio(int prio)
-{
-       int cpupri;
-
-       if (prio == CPUPRI_INVALID)
-               cpupri = CPUPRI_INVALID;
-       else if (prio == MAX_PRIO)
-               cpupri = CPUPRI_IDLE;
-       else if (prio >= MAX_RT_PRIO)
-               cpupri = CPUPRI_NORMAL;
-       else
-               cpupri = MAX_RT_PRIO - prio + 1;
-
-       return cpupri;
-}
-
-/**
- * cpupri_find - find the best (lowest-pri) CPU in the system
- * @cp: The cpupri context
- * @p: The task
- * @lowest_mask: A mask to fill in with selected CPUs (or NULL)
- *
- * Note: This function returns the recommended CPUs as calculated during the
- * current invocation.  By the time the call returns, the CPUs may have in
- * fact changed priorities any number of times.  While not ideal, it is not
- * an issue of correctness since the normal rebalancer logic will correct
- * any discrepancies created by racing against the uncertainty of the current
- * priority configuration.
- *
- * Returns: (int)bool - CPUs were found
- */
-int cpupri_find(struct cpupri *cp, struct task_struct *p,
-               struct cpumask *lowest_mask)
-{
-       int                  idx      = 0;
-       int                  task_pri = convert_prio(p->prio);
-
-       if (task_pri >= MAX_RT_PRIO)
-               return 0;
-
-       for (idx = 0; idx < task_pri; idx++) {
-               struct cpupri_vec *vec  = &cp->pri_to_cpu[idx];
-               int skip = 0;
-
-               if (!atomic_read(&(vec)->count))
-                       skip = 1;
-               /*
-                * When looking at the vector, we need to read the counter,
-                * do a memory barrier, then read the mask.
-                *
-                * Note: This is still all racy, but we can deal with it.
-                *  Ideally, we only want to look at masks that are set.
-                *
-                *  If a mask is not set, then the only thing wrong is that we
-                *  did a little more work than necessary.
-                *
-                *  If we read a zero count but the mask is set, because of the
-                *  memory barriers, that can only happen when the highest prio
-                *  task for a run queue has left the run queue, in which case,
-                *  it will be followed by a pull. If the task we are processing
-                *  fails to find a proper place to go, that pull request will
-                *  pull this task if the run queue is running at a lower
-                *  priority.
-                */
-               smp_rmb();
-
-               /* Need to do the rmb for every iteration */
-               if (skip)
-                       continue;
-
-               if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
-                       continue;
-
-               if (lowest_mask) {
-                       cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
-
-                       /*
-                        * We have to ensure that we have at least one bit
-                        * still set in the array, since the map could have
-                        * been concurrently emptied between the first and
-                        * second reads of vec->mask.  If we hit this
-                        * condition, simply act as though we never hit this
-                        * priority level and continue on.
-                        */
-                       if (cpumask_any(lowest_mask) >= nr_cpu_ids)
-                               continue;
-               }
-
-               return 1;
-       }
-
-       return 0;
-}
-
-/**
- * cpupri_set - update the cpu priority setting
- * @cp: The cpupri context
- * @cpu: The target cpu
- * @newpri: The priority (INVALID-RT99) to assign to this CPU
- *
- * Note: Assumes cpu_rq(cpu)->lock is locked
- *
- * Returns: (void)
- */
-void cpupri_set(struct cpupri *cp, int cpu, int newpri)
-{
-       int                 *currpri = &cp->cpu_to_pri[cpu];
-       int                  oldpri  = *currpri;
-       int                  do_mb = 0;
-
-       newpri = convert_prio(newpri);
-
-       BUG_ON(newpri >= CPUPRI_NR_PRIORITIES);
-
-       if (newpri == oldpri)
-               return;
-
-       /*
-        * If the cpu was currently mapped to a different value, we
-        * need to map it to the new value then remove the old value.
-        * Note, we must add the new value first, otherwise we risk the
-        * cpu being missed by the priority loop in cpupri_find.
-        */
-       if (likely(newpri != CPUPRI_INVALID)) {
-               struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
-
-               cpumask_set_cpu(cpu, vec->mask);
-               /*
-                * When adding a new vector, we update the mask first,
-                * do a write memory barrier, and then update the count, to
-                * make sure the vector is visible when count is set.
-                */
-               smp_mb__before_atomic_inc();
-               atomic_inc(&(vec)->count);
-               do_mb = 1;
-       }
-       if (likely(oldpri != CPUPRI_INVALID)) {
-               struct cpupri_vec *vec  = &cp->pri_to_cpu[oldpri];
-
-               /*
-                * Because the order of modification of the vec->count
-                * is important, we must make sure that the update
-                * of the new prio is seen before we decrement the
-                * old prio. This makes sure that the loop sees
-                * one or the other when we raise the priority of
-                * the run queue. We don't care about when we lower the
-                * priority, as that will trigger an rt pull anyway.
-                *
-                * We only need to do a memory barrier if we updated
-                * the new priority vec.
-                */
-               if (do_mb)
-                       smp_mb__after_atomic_inc();
-
-               /*
-                * When removing from the vector, we decrement the counter first,
-                * do a memory barrier, and then clear the mask.
-                */
-               atomic_dec(&(vec)->count);
-               smp_mb__after_atomic_inc();
-               cpumask_clear_cpu(cpu, vec->mask);
-       }
-
-       *currpri = newpri;
-}
-
-/**
- * cpupri_init - initialize the cpupri structure
- * @cp: The cpupri context
- *
- * Returns: -ENOMEM if memory allocation fails, 0 on success.
- */
-int cpupri_init(struct cpupri *cp)
-{
-       int i;
-
-       memset(cp, 0, sizeof(*cp));
-
-       for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
-               struct cpupri_vec *vec = &cp->pri_to_cpu[i];
-
-               atomic_set(&vec->count, 0);
-               if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
-                       goto cleanup;
-       }
-
-       for_each_possible_cpu(i)
-               cp->cpu_to_pri[i] = CPUPRI_INVALID;
-       return 0;
-
-cleanup:
-       for (i--; i >= 0; i--)
-               free_cpumask_var(cp->pri_to_cpu[i].mask);
-       return -ENOMEM;
-}
-
-/**
- * cpupri_cleanup - clean up the cpupri structure
- * @cp: The cpupri context
- */
-void cpupri_cleanup(struct cpupri *cp)
-{
-       int i;
-
-       for (i = 0; i < CPUPRI_NR_PRIORITIES; i++)
-               free_cpumask_var(cp->pri_to_cpu[i].mask);
-}
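convert_prio() above collapses the 140-slot task->prio scale into the 102-slot cpupri scale. The following standalone program mirrors that mapping with the same constants so the boundary cases are easy to check:

#include <stdio.h>

/* Same constants the cpupri code uses. */
#define MAX_RT_PRIO     100
#define MAX_PRIO        140
#define CPUPRI_INVALID  (-1)
#define CPUPRI_IDLE     0
#define CPUPRI_NORMAL   1

/* Convert a 140-based task->prio into the 102-slot cpupri scale. */
static int convert_prio(int prio)
{
        if (prio == CPUPRI_INVALID)
                return CPUPRI_INVALID;
        if (prio == MAX_PRIO)
                return CPUPRI_IDLE;
        if (prio >= MAX_RT_PRIO)
                return CPUPRI_NORMAL;          /* all SCHED_OTHER tasks share one slot */
        return MAX_RT_PRIO - prio + 1;         /* RT: lower prio number, higher slot */
}

int main(void)
{
        printf("idle        -> %d\n", convert_prio(MAX_PRIO));   /* 0   */
        printf("nice 0      -> %d\n", convert_prio(120));        /* 1   */
        printf("RT prio 99  -> %d\n", convert_prio(99));         /* 2   */
        printf("RT prio 0   -> %d\n", convert_prio(0));          /* 101 */
        return 0;
}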
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
deleted file mode 100644 (file)
index f6d7561..0000000
+++ /dev/null
@@ -1,34 +0,0 @@
-#ifndef _LINUX_CPUPRI_H
-#define _LINUX_CPUPRI_H
-
-#include <linux/sched.h>
-
-#define CPUPRI_NR_PRIORITIES   (MAX_RT_PRIO + 2)
-
-#define CPUPRI_INVALID -1
-#define CPUPRI_IDLE     0
-#define CPUPRI_NORMAL   1
-/* values 2-101 are RT priorities 0-99 */
-
-struct cpupri_vec {
-       atomic_t        count;
-       cpumask_var_t   mask;
-};
-
-struct cpupri {
-       struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
-       int               cpu_to_pri[NR_CPUS];
-};
-
-#ifdef CONFIG_SMP
-int  cpupri_find(struct cpupri *cp,
-                struct task_struct *p, struct cpumask *lowest_mask);
-void cpupri_set(struct cpupri *cp, int cpu, int pri);
-int cpupri_init(struct cpupri *cp);
-void cpupri_cleanup(struct cpupri *cp);
-#else
-#define cpupri_set(cp, cpu, pri) do { } while (0)
-#define cpupri_init() do { } while (0)
-#endif
-
-#endif /* _LINUX_CPUPRI_H */
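The search that cpupri_find() performs amounts to scanning per-priority CPU vectors: skip empty levels, then intersect the level's mask with the task's allowed CPUs. A toy userspace version with plain bitmasks and hypothetical toy_* names (no memory barriers, no cpumask_var_t):

#include <stdio.h>
#include <string.h>

#define NR_PRIORITIES  102
#define NR_CPUS        8

/* Toy version of the cpupri "2-D bitmap": per-priority CPU masks + counts. */
struct toy_cpupri {
        unsigned int mask[NR_PRIORITIES];   /* bit i set: cpu i runs at this prio */
        int count[NR_PRIORITIES];
};

static int first_cpu(unsigned int mask)
{
        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                if (mask & (1u << cpu))
                        return cpu;
        return -1;
}

/* Find a CPU running below task_pri that also lies in allowed_mask. */
static int toy_cpupri_find(const struct toy_cpupri *cp, int task_pri,
                           unsigned int allowed_mask)
{
        for (int idx = 0; idx < task_pri; idx++) {
                unsigned int lowest;

                if (!cp->count[idx])
                        continue;                  /* nothing queued at this level */
                lowest = cp->mask[idx] & allowed_mask;
                if (lowest)
                        return first_cpu(lowest);  /* any CPU from the intersection */
        }
        return -1;                                 /* no lower-priority CPU found */
}

int main(void)
{
        struct toy_cpupri cp;

        memset(&cp, 0, sizeof(cp));
        cp.mask[1] = 1u << 3;   /* cpu3 is running a NORMAL task */
        cp.count[1] = 1;

        /* An RT task (cpupri 50) restricted to cpus {2,3} should pick cpu3. */
        printf("target cpu = %d\n", toy_cpupri_find(&cp, 50, 0x0C));
        return 0;
}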
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
deleted file mode 100644 (file)
index a6710a1..0000000
+++ /dev/null
@@ -1,508 +0,0 @@
-/*
- * kernel/sched_debug.c
- *
- * Print the CFS rbtree
- *
- * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/proc_fs.h>
-#include <linux/sched.h>
-#include <linux/seq_file.h>
-#include <linux/kallsyms.h>
-#include <linux/utsname.h>
-
-static DEFINE_SPINLOCK(sched_debug_lock);
-
-/*
- * This allows printing both to /proc/sched_debug and
- * to the console
- */
-#define SEQ_printf(m, x...)                    \
- do {                                          \
-       if (m)                                  \
-               seq_printf(m, x);               \
-       else                                    \
-               printk(x);                      \
- } while (0)
-
-/*
- * Ease the printing of nsec fields:
- */
-static long long nsec_high(unsigned long long nsec)
-{
-       if ((long long)nsec < 0) {
-               nsec = -nsec;
-               do_div(nsec, 1000000);
-               return -nsec;
-       }
-       do_div(nsec, 1000000);
-
-       return nsec;
-}
-
-static unsigned long nsec_low(unsigned long long nsec)
-{
-       if ((long long)nsec < 0)
-               nsec = -nsec;
-
-       return do_div(nsec, 1000000);
-}
-
-#define SPLIT_NS(x) nsec_high(x), nsec_low(x)
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
-{
-       struct sched_entity *se = tg->se[cpu];
-       if (!se)
-               return;
-
-#define P(F) \
-       SEQ_printf(m, "  .%-30s: %lld\n", #F, (long long)F)
-#define PN(F) \
-       SEQ_printf(m, "  .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
-
-       PN(se->exec_start);
-       PN(se->vruntime);
-       PN(se->sum_exec_runtime);
-#ifdef CONFIG_SCHEDSTATS
-       PN(se->statistics.wait_start);
-       PN(se->statistics.sleep_start);
-       PN(se->statistics.block_start);
-       PN(se->statistics.sleep_max);
-       PN(se->statistics.block_max);
-       PN(se->statistics.exec_max);
-       PN(se->statistics.slice_max);
-       PN(se->statistics.wait_max);
-       PN(se->statistics.wait_sum);
-       P(se->statistics.wait_count);
-#endif
-       P(se->load.weight);
-#undef PN
-#undef P
-}
-#endif
-
-#ifdef CONFIG_CGROUP_SCHED
-static char group_path[PATH_MAX];
-
-static char *task_group_path(struct task_group *tg)
-{
-       if (autogroup_path(tg, group_path, PATH_MAX))
-               return group_path;
-
-       /*
-        * May be NULL if the underlying cgroup isn't fully-created yet
-        */
-       if (!tg->css.cgroup) {
-               group_path[0] = '\0';
-               return group_path;
-       }
-       cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
-       return group_path;
-}
-#endif
-
-static void
-print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
-{
-       if (rq->curr == p)
-               SEQ_printf(m, "R");
-       else
-               SEQ_printf(m, " ");
-
-       SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ",
-               p->comm, p->pid,
-               SPLIT_NS(p->se.vruntime),
-               (long long)(p->nvcsw + p->nivcsw),
-               p->prio);
-#ifdef CONFIG_SCHEDSTATS
-       SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
-               SPLIT_NS(p->se.vruntime),
-               SPLIT_NS(p->se.sum_exec_runtime),
-               SPLIT_NS(p->se.statistics.sum_sleep_runtime));
-#else
-       SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
-               0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
-#endif
-#ifdef CONFIG_CGROUP_SCHED
-       SEQ_printf(m, " %s", task_group_path(task_group(p)));
-#endif
-
-       SEQ_printf(m, "\n");
-}
-
-static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
-{
-       struct task_struct *g, *p;
-       unsigned long flags;
-
-       SEQ_printf(m,
-       "\nrunnable tasks:\n"
-       "            task   PID         tree-key  switches  prio"
-       "     exec-runtime         sum-exec        sum-sleep\n"
-       "------------------------------------------------------"
-       "----------------------------------------------------\n");
-
-       read_lock_irqsave(&tasklist_lock, flags);
-
-       do_each_thread(g, p) {
-               if (!p->on_rq || task_cpu(p) != rq_cpu)
-                       continue;
-
-               print_task(m, rq, p);
-       } while_each_thread(g, p);
-
-       read_unlock_irqrestore(&tasklist_lock, flags);
-}
-
-void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
-{
-       s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
-               spread, rq0_min_vruntime, spread0;
-       struct rq *rq = cpu_rq(cpu);
-       struct sched_entity *last;
-       unsigned long flags;
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-       SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg));
-#else
-       SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
-#endif
-       SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "exec_clock",
-                       SPLIT_NS(cfs_rq->exec_clock));
-
-       raw_spin_lock_irqsave(&rq->lock, flags);
-       if (cfs_rq->rb_leftmost)
-               MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime;
-       last = __pick_last_entity(cfs_rq);
-       if (last)
-               max_vruntime = last->vruntime;
-       min_vruntime = cfs_rq->min_vruntime;
-       rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime;
-       raw_spin_unlock_irqrestore(&rq->lock, flags);
-       SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "MIN_vruntime",
-                       SPLIT_NS(MIN_vruntime));
-       SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "min_vruntime",
-                       SPLIT_NS(min_vruntime));
-       SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "max_vruntime",
-                       SPLIT_NS(max_vruntime));
-       spread = max_vruntime - MIN_vruntime;
-       SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "spread",
-                       SPLIT_NS(spread));
-       spread0 = min_vruntime - rq0_min_vruntime;
-       SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "spread0",
-                       SPLIT_NS(spread0));
-       SEQ_printf(m, "  .%-30s: %d\n", "nr_spread_over",
-                       cfs_rq->nr_spread_over);
-       SEQ_printf(m, "  .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
-       SEQ_printf(m, "  .%-30s: %ld\n", "load", cfs_rq->load.weight);
-#ifdef CONFIG_FAIR_GROUP_SCHED
-#ifdef CONFIG_SMP
-       SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "load_avg",
-                       SPLIT_NS(cfs_rq->load_avg));
-       SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "load_period",
-                       SPLIT_NS(cfs_rq->load_period));
-       SEQ_printf(m, "  .%-30s: %ld\n", "load_contrib",
-                       cfs_rq->load_contribution);
-       SEQ_printf(m, "  .%-30s: %d\n", "load_tg",
-                       atomic_read(&cfs_rq->tg->load_weight));
-#endif
-
-       print_cfs_group_stats(m, cpu, cfs_rq->tg);
-#endif
-}
-
-void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
-{
-#ifdef CONFIG_RT_GROUP_SCHED
-       SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg));
-#else
-       SEQ_printf(m, "\nrt_rq[%d]:\n", cpu);
-#endif
-
-#define P(x) \
-       SEQ_printf(m, "  .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
-#define PN(x) \
-       SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x))
-
-       P(rt_nr_running);
-       P(rt_throttled);
-       PN(rt_time);
-       PN(rt_runtime);
-
-#undef PN
-#undef P
-}
-
-extern __read_mostly int sched_clock_running;
-
-static void print_cpu(struct seq_file *m, int cpu)
-{
-       struct rq *rq = cpu_rq(cpu);
-       unsigned long flags;
-
-#ifdef CONFIG_X86
-       {
-               unsigned int freq = cpu_khz ? : 1;
-
-               SEQ_printf(m, "\ncpu#%d, %u.%03u MHz\n",
-                          cpu, freq / 1000, (freq % 1000));
-       }
-#else
-       SEQ_printf(m, "\ncpu#%d\n", cpu);
-#endif
-
-#define P(x) \
-       SEQ_printf(m, "  .%-30s: %Ld\n", #x, (long long)(rq->x))
-#define PN(x) \
-       SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x))
-
-       P(nr_running);
-       SEQ_printf(m, "  .%-30s: %lu\n", "load",
-                  rq->load.weight);
-       P(nr_switches);
-       P(nr_load_updates);
-       P(nr_uninterruptible);
-       PN(next_balance);
-       P(curr->pid);
-       PN(clock);
-       P(cpu_load[0]);
-       P(cpu_load[1]);
-       P(cpu_load[2]);
-       P(cpu_load[3]);
-       P(cpu_load[4]);
-#undef P
-#undef PN
-
-#ifdef CONFIG_SCHEDSTATS
-#define P(n) SEQ_printf(m, "  .%-30s: %d\n", #n, rq->n);
-#define P64(n) SEQ_printf(m, "  .%-30s: %Ld\n", #n, rq->n);
-
-       P(yld_count);
-
-       P(sched_switch);
-       P(sched_count);
-       P(sched_goidle);
-#ifdef CONFIG_SMP
-       P64(avg_idle);
-#endif
-
-       P(ttwu_count);
-       P(ttwu_local);
-
-#undef P
-#undef P64
-#endif
-       spin_lock_irqsave(&sched_debug_lock, flags);
-       print_cfs_stats(m, cpu);
-       print_rt_stats(m, cpu);
-
-       rcu_read_lock();
-       print_rq(m, rq, cpu);
-       rcu_read_unlock();
-       spin_unlock_irqrestore(&sched_debug_lock, flags);
-}
-
-static const char *sched_tunable_scaling_names[] = {
-       "none",
-       "logaritmic",
-       "linear"
-};
-
-static int sched_debug_show(struct seq_file *m, void *v)
-{
-       u64 ktime, sched_clk, cpu_clk;
-       unsigned long flags;
-       int cpu;
-
-       local_irq_save(flags);
-       ktime = ktime_to_ns(ktime_get());
-       sched_clk = sched_clock();
-       cpu_clk = local_clock();
-       local_irq_restore(flags);
-
-       SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n",
-               init_utsname()->release,
-               (int)strcspn(init_utsname()->version, " "),
-               init_utsname()->version);
-
-#define P(x) \
-       SEQ_printf(m, "%-40s: %Ld\n", #x, (long long)(x))
-#define PN(x) \
-       SEQ_printf(m, "%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
-       PN(ktime);
-       PN(sched_clk);
-       PN(cpu_clk);
-       P(jiffies);
-#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
-       P(sched_clock_stable);
-#endif
-#undef PN
-#undef P
-
-       SEQ_printf(m, "\n");
-       SEQ_printf(m, "sysctl_sched\n");
-
-#define P(x) \
-       SEQ_printf(m, "  .%-40s: %Ld\n", #x, (long long)(x))
-#define PN(x) \
-       SEQ_printf(m, "  .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
-       PN(sysctl_sched_latency);
-       PN(sysctl_sched_min_granularity);
-       PN(sysctl_sched_wakeup_granularity);
-       P(sysctl_sched_child_runs_first);
-       P(sysctl_sched_features);
-#undef PN
-#undef P
-
-       SEQ_printf(m, "  .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling",
-               sysctl_sched_tunable_scaling,
-               sched_tunable_scaling_names[sysctl_sched_tunable_scaling]);
-
-       for_each_online_cpu(cpu)
-               print_cpu(m, cpu);
-
-       SEQ_printf(m, "\n");
-
-       return 0;
-}
-
-static void sysrq_sched_debug_show(void)
-{
-       sched_debug_show(NULL, NULL);
-}
-
-static int sched_debug_open(struct inode *inode, struct file *filp)
-{
-       return single_open(filp, sched_debug_show, NULL);
-}
-
-static const struct file_operations sched_debug_fops = {
-       .open           = sched_debug_open,
-       .read           = seq_read,
-       .llseek         = seq_lseek,
-       .release        = single_release,
-};
-
-static int __init init_sched_debug_procfs(void)
-{
-       struct proc_dir_entry *pe;
-
-       pe = proc_create("sched_debug", 0444, NULL, &sched_debug_fops);
-       if (!pe)
-               return -ENOMEM;
-       return 0;
-}
-
-__initcall(init_sched_debug_procfs);
-
-void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
-{
-       unsigned long nr_switches;
-
-       SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid,
-                                               get_nr_threads(p));
-       SEQ_printf(m,
-               "---------------------------------------------------------\n");
-#define __P(F) \
-       SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)F)
-#define P(F) \
-       SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)p->F)
-#define __PN(F) \
-       SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
-#define PN(F) \
-       SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
-
-       PN(se.exec_start);
-       PN(se.vruntime);
-       PN(se.sum_exec_runtime);
-
-       nr_switches = p->nvcsw + p->nivcsw;
-
-#ifdef CONFIG_SCHEDSTATS
-       PN(se.statistics.wait_start);
-       PN(se.statistics.sleep_start);
-       PN(se.statistics.block_start);
-       PN(se.statistics.sleep_max);
-       PN(se.statistics.block_max);
-       PN(se.statistics.exec_max);
-       PN(se.statistics.slice_max);
-       PN(se.statistics.wait_max);
-       PN(se.statistics.wait_sum);
-       P(se.statistics.wait_count);
-       PN(se.statistics.iowait_sum);
-       P(se.statistics.iowait_count);
-       P(se.nr_migrations);
-       P(se.statistics.nr_migrations_cold);
-       P(se.statistics.nr_failed_migrations_affine);
-       P(se.statistics.nr_failed_migrations_running);
-       P(se.statistics.nr_failed_migrations_hot);
-       P(se.statistics.nr_forced_migrations);
-       P(se.statistics.nr_wakeups);
-       P(se.statistics.nr_wakeups_sync);
-       P(se.statistics.nr_wakeups_migrate);
-       P(se.statistics.nr_wakeups_local);
-       P(se.statistics.nr_wakeups_remote);
-       P(se.statistics.nr_wakeups_affine);
-       P(se.statistics.nr_wakeups_affine_attempts);
-       P(se.statistics.nr_wakeups_passive);
-       P(se.statistics.nr_wakeups_idle);
-
-       {
-               u64 avg_atom, avg_per_cpu;
-
-               avg_atom = p->se.sum_exec_runtime;
-               if (nr_switches)
-                       do_div(avg_atom, nr_switches);
-               else
-                       avg_atom = -1LL;
-
-               avg_per_cpu = p->se.sum_exec_runtime;
-               if (p->se.nr_migrations) {
-                       avg_per_cpu = div64_u64(avg_per_cpu,
-                                               p->se.nr_migrations);
-               } else {
-                       avg_per_cpu = -1LL;
-               }
-
-               __PN(avg_atom);
-               __PN(avg_per_cpu);
-       }
-#endif
-       __P(nr_switches);
-       SEQ_printf(m, "%-35s:%21Ld\n",
-                  "nr_voluntary_switches", (long long)p->nvcsw);
-       SEQ_printf(m, "%-35s:%21Ld\n",
-                  "nr_involuntary_switches", (long long)p->nivcsw);
-
-       P(se.load.weight);
-       P(policy);
-       P(prio);
-#undef PN
-#undef __PN
-#undef P
-#undef __P
-
-       {
-               unsigned int this_cpu = raw_smp_processor_id();
-               u64 t0, t1;
-
-               t0 = cpu_clock(this_cpu);
-               t1 = cpu_clock(this_cpu);
-               SEQ_printf(m, "%-35s:%21Ld\n",
-                          "clock-delta", (long long)(t1-t0));
-       }
-}
-
-void proc_sched_set_task(struct task_struct *p)
-{
-#ifdef CONFIG_SCHEDSTATS
-       memset(&p->se.statistics, 0, sizeof(p->se.statistics));
-#endif
-}
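The SPLIT_NS() helpers above format a nanosecond counter into the millisec.microsec columns seen in /proc/sched_debug. A userspace equivalent using plain division instead of do_div() (an illustration, not the kernel macro):

#include <stdio.h>

/* Split a nanosecond value into the "millisec.microsec" form SPLIT_NS prints. */
static long long nsec_high(long long nsec)
{
        if (nsec < 0)
                return -((-nsec) / 1000000);
        return nsec / 1000000;
}

static long nsec_low(long long nsec)
{
        if (nsec < 0)
                nsec = -nsec;
        return (long)(nsec % 1000000);
}

#define SPLIT_NS(x) nsec_high(x), nsec_low(x)

int main(void)
{
        long long vruntime = 123456789;   /* ns */

        /* Prints "123.456789", the same layout as the sched_debug columns. */
        printf("vruntime: %lld.%06ld\n", SPLIT_NS(vruntime));
        return 0;
}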
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
deleted file mode 100644 (file)
index 8a39fa3..0000000
+++ /dev/null
@@ -1,5090 +0,0 @@
-/*
- * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
- *
- *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
- *
- *  Interactivity improvements by Mike Galbraith
- *  (C) 2007 Mike Galbraith <efault@gmx.de>
- *
- *  Various enhancements by Dmitry Adamushko.
- *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
- *
- *  Group scheduling enhancements by Srivatsa Vaddagiri
- *  Copyright IBM Corporation, 2007
- *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
- *
- *  Scaled math optimizations by Thomas Gleixner
- *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
- *
- *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
- *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
- */
-
-#include <linux/latencytop.h>
-#include <linux/sched.h>
-#include <linux/cpumask.h>
-
-/*
- * Targeted preemption latency for CPU-bound tasks:
- * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
- *
- * NOTE: this latency value is not the same as the concept of
- * 'timeslice length' - timeslices in CFS are of variable length
- * and have no persistent notion like in traditional, time-slice
- * based scheduling concepts.
- *
- * (to see the precise effective timeslice length of your workload,
- *  run vmstat and monitor the context-switches (cs) field)
- */
-unsigned int sysctl_sched_latency = 6000000ULL;
-unsigned int normalized_sysctl_sched_latency = 6000000ULL;
-
-/*
- * The initial- and re-scaling of tunables is configurable
- * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
- *
- * Options are:
- * SCHED_TUNABLESCALING_NONE - unscaled, always *1
- * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus)
- * SCHED_TUNABLESCALING_LINEAR - scaled linearly, *ncpus
- */
-enum sched_tunable_scaling sysctl_sched_tunable_scaling
-       = SCHED_TUNABLESCALING_LOG;
-
-/*
- * Minimal preemption granularity for CPU-bound tasks:
- * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
- */
-unsigned int sysctl_sched_min_granularity = 750000ULL;
-unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
-
-/*
- * This value is kept at sysctl_sched_latency / sysctl_sched_min_granularity.
- */
-static unsigned int sched_nr_latency = 8;
-
-/*
- * After fork, child runs first. If set to 0 (default) then
- * parent will (try to) run first.
- */
-unsigned int sysctl_sched_child_runs_first __read_mostly;
-
-/*
- * SCHED_OTHER wake-up granularity.
- * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
- *
- * This option delays the preemption effects of decoupled workloads
- * and reduces their over-scheduling. Synchronous workloads will still
- * have immediate wakeup/sleep latencies.
- */
-unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
-unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
-
-const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
-
-/*
- * The exponential sliding window over which load is averaged for shares
- * distribution.
- * (default: 10msec)
- */
-unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
-
-#ifdef CONFIG_CFS_BANDWIDTH
-/*
- * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
- * each time a cfs_rq requests quota.
- *
- * Note: in the case that the slice exceeds the runtime remaining (either due
- * to consumption or the quota being specified to be smaller than the slice)
- * we will always only issue the remaining available time.
- *
- * default: 5 msec, units: microseconds
-  */
-unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
-#endif
-
-static const struct sched_class fair_sched_class;
-
-/**************************************************************
- * CFS operations on generic schedulable entities:
- */
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-
-/* cpu runqueue to which this cfs_rq is attached */
-static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
-{
-       return cfs_rq->rq;
-}
-
-/* An entity is a task if it doesn't "own" a runqueue */
-#define entity_is_task(se)     (!se->my_q)
-
-static inline struct task_struct *task_of(struct sched_entity *se)
-{
-#ifdef CONFIG_SCHED_DEBUG
-       WARN_ON_ONCE(!entity_is_task(se));
-#endif
-       return container_of(se, struct task_struct, se);
-}
-
-/* Walk up scheduling entities hierarchy */
-#define for_each_sched_entity(se) \
-               for (; se; se = se->parent)
-
-static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
-{
-       return p->se.cfs_rq;
-}
-
-/* runqueue on which this entity is (to be) queued */
-static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
-{
-       return se->cfs_rq;
-}
-
-/* runqueue "owned" by this group */
-static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
-{
-       return grp->my_q;
-}
-
-static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
-{
-       if (!cfs_rq->on_list) {
-               /*
-                * Ensure we either appear before our parent (if already
-                * enqueued) or force our parent to appear after us when it is
-                * enqueued.  The fact that we always enqueue bottom-up
-                * reduces this to two cases.
-                */
-               if (cfs_rq->tg->parent &&
-                   cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
-                       list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
-                               &rq_of(cfs_rq)->leaf_cfs_rq_list);
-               } else {
-                       list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
-                               &rq_of(cfs_rq)->leaf_cfs_rq_list);
-               }
-
-               cfs_rq->on_list = 1;
-       }
-}
-
-static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
-{
-       if (cfs_rq->on_list) {
-               list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
-               cfs_rq->on_list = 0;
-       }
-}
-
-/* Iterate through all leaf cfs_rqs on a runqueue */
-#define for_each_leaf_cfs_rq(rq, cfs_rq) \
-       list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
-
-/* Do the two (enqueued) entities belong to the same group ? */
-static inline int
-is_same_group(struct sched_entity *se, struct sched_entity *pse)
-{
-       if (se->cfs_rq == pse->cfs_rq)
-               return 1;
-
-       return 0;
-}
-
-static inline struct sched_entity *parent_entity(struct sched_entity *se)
-{
-       return se->parent;
-}
-
-/* return depth at which a sched entity is present in the hierarchy */
-static inline int depth_se(struct sched_entity *se)
-{
-       int depth = 0;
-
-       for_each_sched_entity(se)
-               depth++;
-
-       return depth;
-}
-
-static void
-find_matching_se(struct sched_entity **se, struct sched_entity **pse)
-{
-       int se_depth, pse_depth;
-
-       /*
-        * A preemption test can only be made between sibling entities that are
-        * in the same cfs_rq, i.e. that have a common parent. Walk up the
-        * hierarchy of both tasks until we find ancestors that are siblings
-        * under a common parent.
-        */
-
-       /* First walk up until both entities are at same depth */
-       se_depth = depth_se(*se);
-       pse_depth = depth_se(*pse);
-
-       while (se_depth > pse_depth) {
-               se_depth--;
-               *se = parent_entity(*se);
-       }
-
-       while (pse_depth > se_depth) {
-               pse_depth--;
-               *pse = parent_entity(*pse);
-       }
-
-       while (!is_same_group(*se, *pse)) {
-               *se = parent_entity(*se);
-               *pse = parent_entity(*pse);
-       }
-}
-
-#else  /* !CONFIG_FAIR_GROUP_SCHED */
-
-static inline struct task_struct *task_of(struct sched_entity *se)
-{
-       return container_of(se, struct task_struct, se);
-}
-
-static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
-{
-       return container_of(cfs_rq, struct rq, cfs);
-}
-
-#define entity_is_task(se)     1
-
-#define for_each_sched_entity(se) \
-               for (; se; se = NULL)
-
-static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
-{
-       return &task_rq(p)->cfs;
-}
-
-static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
-{
-       struct task_struct *p = task_of(se);
-       struct rq *rq = task_rq(p);
-
-       return &rq->cfs;
-}
-
-/* runqueue "owned" by this group */
-static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
-{
-       return NULL;
-}
-
-static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
-{
-}
-
-static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
-{
-}
-
-#define for_each_leaf_cfs_rq(rq, cfs_rq) \
-               for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
-
-static inline int
-is_same_group(struct sched_entity *se, struct sched_entity *pse)
-{
-       return 1;
-}
-
-static inline struct sched_entity *parent_entity(struct sched_entity *se)
-{
-       return NULL;
-}
-
-static inline void
-find_matching_se(struct sched_entity **se, struct sched_entity **pse)
-{
-}
-
-#endif /* CONFIG_FAIR_GROUP_SCHED */
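find_matching_se() above walks two entities up the group hierarchy, first equalizing their depths, until they are siblings under a common parent. A self-contained toy version using plain parent pointers and hypothetical toy_* names (no cfs_rq involved):

#include <stdio.h>

/* Toy entity with just a name and a parent pointer. */
struct toy_se {
        const char *name;
        struct toy_se *parent;
};

static int depth_of(struct toy_se *se)
{
        int depth = 0;

        for (; se; se = se->parent)
                depth++;
        return depth;
}

/* Walk both entities up until they are siblings under a common parent. */
static void find_matching(struct toy_se **se, struct toy_se **pse)
{
        int d1 = depth_of(*se), d2 = depth_of(*pse);

        while (d1 > d2) { *se = (*se)->parent; d1--; }
        while (d2 > d1) { *pse = (*pse)->parent; d2--; }
        while ((*se)->parent != (*pse)->parent) {
                *se = (*se)->parent;
                *pse = (*pse)->parent;
        }
}

int main(void)
{
        struct toy_se root  = { "root",  NULL };
        struct toy_se grp_a = { "grp_a", &root }, grp_b = { "grp_b", &root };
        struct toy_se t1    = { "t1",    &grp_a }, t2   = { "t2",    &grp_b };
        struct toy_se *se = &t1, *pse = &t2;

        find_matching(&se, &pse);
        printf("compare %s vs %s\n", se->name, pse->name);  /* grp_a vs grp_b */
        return 0;
}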
-
-static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
-                                  unsigned long delta_exec);
-
-/**************************************************************
- * Scheduling class tree data structure manipulation methods:
- */
-
-static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime)
-{
-       s64 delta = (s64)(vruntime - min_vruntime);
-       if (delta > 0)
-               min_vruntime = vruntime;
-
-       return min_vruntime;
-}
-
-static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
-{
-       s64 delta = (s64)(vruntime - min_vruntime);
-       if (delta < 0)
-               min_vruntime = vruntime;
-
-       return min_vruntime;
-}
-
-static inline int entity_before(struct sched_entity *a,
-                               struct sched_entity *b)
-{
-       return (s64)(a->vruntime - b->vruntime) < 0;
-}
-
-static void update_min_vruntime(struct cfs_rq *cfs_rq)
-{
-       u64 vruntime = cfs_rq->min_vruntime;
-
-       if (cfs_rq->curr)
-               vruntime = cfs_rq->curr->vruntime;
-
-       if (cfs_rq->rb_leftmost) {
-               struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
-                                                  struct sched_entity,
-                                                  run_node);
-
-               if (!cfs_rq->curr)
-                       vruntime = se->vruntime;
-               else
-                       vruntime = min_vruntime(vruntime, se->vruntime);
-       }
-
-       cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
-#ifndef CONFIG_64BIT
-       smp_wmb();
-       cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
-#endif
-}
-
-/*
- * Enqueue an entity into the rb-tree:
- */
-static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-       struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
-       struct rb_node *parent = NULL;
-       struct sched_entity *entry;
-       int leftmost = 1;
-
-       /*
-        * Find the right place in the rbtree:
-        */
-       while (*link) {
-               parent = *link;
-               entry = rb_entry(parent, struct sched_entity, run_node);
-               /*
-                * We don't care about collisions. Nodes with
-                * the same key stay together.
-                */
-               if (entity_before(se, entry)) {
-                       link = &parent->rb_left;
-               } else {
-                       link = &parent->rb_right;
-                       leftmost = 0;
-               }
-       }
-
-       /*
-        * Maintain a cache of leftmost tree entries (it is frequently
-        * used):
-        */
-       if (leftmost)
-               cfs_rq->rb_leftmost = &se->run_node;
-
-       rb_link_node(&se->run_node, parent, link);
-       rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
-}
-
-static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-       if (cfs_rq->rb_leftmost == &se->run_node) {
-               struct rb_node *next_node;
-
-               next_node = rb_next(&se->run_node);
-               cfs_rq->rb_leftmost = next_node;
-       }
-
-       rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
-}
-
-static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
-{
-       struct rb_node *left = cfs_rq->rb_leftmost;
-
-       if (!left)
-               return NULL;
-
-       return rb_entry(left, struct sched_entity, run_node);
-}
-
-static struct sched_entity *__pick_next_entity(struct sched_entity *se)
-{
-       struct rb_node *next = rb_next(&se->run_node);
-
-       if (!next)
-               return NULL;
-
-       return rb_entry(next, struct sched_entity, run_node);
-}
-
-#ifdef CONFIG_SCHED_DEBUG
-static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
-{
-       struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
-
-       if (!last)
-               return NULL;
-
-       return rb_entry(last, struct sched_entity, run_node);
-}
-
-/**************************************************************
- * Scheduling class statistics methods:
- */
-
-int sched_proc_update_handler(struct ctl_table *table, int write,
-               void __user *buffer, size_t *lenp,
-               loff_t *ppos)
-{
-       int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
-       int factor = get_update_sysctl_factor();
-
-       if (ret || !write)
-               return ret;
-
-       sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
-                                       sysctl_sched_min_granularity);
-
-#define WRT_SYSCTL(name) \
-       (normalized_sysctl_##name = sysctl_##name / (factor))
-       WRT_SYSCTL(sched_min_granularity);
-       WRT_SYSCTL(sched_latency);
-       WRT_SYSCTL(sched_wakeup_granularity);
-#undef WRT_SYSCTL
-
-       return 0;
-}
-#endif
-
-/*
- * delta /= w
- */
-static inline unsigned long
-calc_delta_fair(unsigned long delta, struct sched_entity *se)
-{
-       if (unlikely(se->load.weight != NICE_0_LOAD))
-               delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
-
-       return delta;
-}
-
-/*
- * The idea is to set a period in which each task runs once.
- *
- * When there are too many tasks (more than sysctl_sched_nr_latency) we have to
- * stretch this period because otherwise the slices get too small.
- *
- * p = (nr <= nl) ? l : l*nr/nl
- */
-static u64 __sched_period(unsigned long nr_running)
-{
-       u64 period = sysctl_sched_latency;
-       unsigned long nr_latency = sched_nr_latency;
-
-       if (unlikely(nr_running > nr_latency)) {
-               period = sysctl_sched_min_granularity;
-               period *= nr_running;
-       }
-
-       return period;
-}
-
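To make the formula concrete, a small user-space sketch of the period calculation with example values (6 ms latency, 0.75 ms minimum granularity, nr_latency = 8; the real defaults are scaled by the number of online CPUs, so these numbers are only illustrative):

#include <stdio.h>
#include <stdint.h>

typedef uint64_t u64;

static u64 example_latency_ns        = 6000000ULL;  /* l  = 6 ms (assumed)     */
static u64 example_min_gran_ns       =  750000ULL;  /* min granularity 0.75 ms */
static unsigned long example_nr_latency = 8;        /* nl = l / min_gran       */

/* p = (nr <= nl) ? l : l*nr/nl, i.e. min_granularity * nr when stretched */
static u64 sched_period(unsigned long nr_running)
{
    if (nr_running > example_nr_latency)
        return example_min_gran_ns * nr_running;
    return example_latency_ns;
}

int main(void)
{
    printf("period(5 tasks)  = %llu ns\n", (unsigned long long)sched_period(5));  /* 6 ms  */
    printf("period(16 tasks) = %llu ns\n", (unsigned long long)sched_period(16)); /* 12 ms */
    return 0;
}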
-/*
- * We calculate the wall-time slice from the period by taking a part
- * proportional to the weight.
- *
- * s = p*P[w/rw]
- */
-static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-       u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
-
-       for_each_sched_entity(se) {
-               struct load_weight *load;
-               struct load_weight lw;
-
-               cfs_rq = cfs_rq_of(se);
-               load = &cfs_rq->load;
-
-               if (unlikely(!se->on_rq)) {
-                       lw = cfs_rq->load;
-
-                       update_load_add(&lw, se->load.weight);
-                       load = &lw;
-               }
-               slice = calc_delta_mine(slice, se->load.weight, load);
-       }
-       return slice;
-}
-
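For a flat runqueue (no group hierarchy) the slice formula above reduces to period * weight / total_weight. A quick sketch, using the standard nice-level weights 1024 (nice 0) and 3121 (nice -5) purely as example inputs; the kernel itself goes through calc_delta_mine() and fixed-point inverse weights rather than a plain division:

#include <stdio.h>
#include <stdint.h>

typedef uint64_t u64;

/* s = p * w / rw for one level of the hierarchy */
static u64 slice_ns(u64 period_ns, unsigned long weight, unsigned long rq_weight)
{
    return period_ns * weight / rq_weight;
}

int main(void)
{
    unsigned long w_nice0 = 1024, w_nice_m5 = 3121;
    unsigned long rq_weight = w_nice0 + w_nice_m5;
    u64 period = 6000000ULL;   /* 6 ms period, as in the sketch above */

    printf("nice  0 slice: %llu ns\n", (unsigned long long)slice_ns(period, w_nice0, rq_weight));
    printf("nice -5 slice: %llu ns\n", (unsigned long long)slice_ns(period, w_nice_m5, rq_weight));
    /* roughly 1.48 ms vs 4.52 ms: the heavier entity gets the larger share */
    return 0;
}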
-/*
- * We calculate the vruntime slice of a to-be-inserted task
- *
- * vs = s/w
- */
-static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-       return calc_delta_fair(sched_slice(cfs_rq, se), se);
-}
-
-static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
-static void update_cfs_shares(struct cfs_rq *cfs_rq);
-
-/*
- * Update the current task's runtime statistics. Skip current tasks that
- * are not in our scheduling class.
- */
-static inline void
-__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
-             unsigned long delta_exec)
-{
-       unsigned long delta_exec_weighted;
-
-       schedstat_set(curr->statistics.exec_max,
-                     max((u64)delta_exec, curr->statistics.exec_max));
-
-       curr->sum_exec_runtime += delta_exec;
-       schedstat_add(cfs_rq, exec_clock, delta_exec);
-       delta_exec_weighted = calc_delta_fair(delta_exec, curr);
-
-       curr->vruntime += delta_exec_weighted;
-       update_min_vruntime(cfs_rq);
-
-#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
-       cfs_rq->load_unacc_exec_time += delta_exec;
-#endif
-}
-
-static void update_curr(struct cfs_rq *cfs_rq)
-{
-       struct sched_entity *curr = cfs_rq->curr;
-       u64 now = rq_of(cfs_rq)->clock_task;
-       unsigned long delta_exec;
-
-       if (unlikely(!curr))
-               return;
-
-       /*
-        * Get the amount of time the current task was running
-        * since the last time we changed load (this cannot
-        * overflow on 32 bits):
-        */
-       delta_exec = (unsigned long)(now - curr->exec_start);
-       if (!delta_exec)
-               return;
-
-       __update_curr(cfs_rq, curr, delta_exec);
-       curr->exec_start = now;
-
-       if (entity_is_task(curr)) {
-               struct task_struct *curtask = task_of(curr);
-
-               trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
-               cpuacct_charge(curtask, delta_exec);
-               account_group_exec_runtime(curtask, delta_exec);
-       }
-
-       account_cfs_rq_runtime(cfs_rq, delta_exec);
-}
-
-static inline void
-update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-       schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
-}
-
-/*
- * Task is being enqueued - update stats:
- */
-static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-       /*
-        * Are we enqueueing a waiting task? (for current tasks
-        * a dequeue/enqueue event is a NOP)
-        */
-       if (se != cfs_rq->curr)
-               update_stats_wait_start(cfs_rq, se);
-}
-
-static void
-update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-       schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
-                       rq_of(cfs_rq)->clock - se->statistics.wait_start));
-       schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
-       schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
-                       rq_of(cfs_rq)->clock - se->statistics.wait_start);
-#ifdef CONFIG_SCHEDSTATS
-       if (entity_is_task(se)) {
-               trace_sched_stat_wait(task_of(se),
-                       rq_of(cfs_rq)->clock - se->statistics.wait_start);
-       }
-#endif
-       schedstat_set(se->statistics.wait_start, 0);
-}
-
-static inline void
-update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-       /*
-        * Mark the end of the wait period if dequeueing a
-        * waiting task:
-        */
-       if (se != cfs_rq->curr)
-               update_stats_wait_end(cfs_rq, se);
-}
-
-/*
- * We are picking a new current task - update its stats:
- */
-static inline void
-update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-       /*
-        * We are starting a new run period:
-        */
-       se->exec_start = rq_of(cfs_rq)->clock_task;
-}
-
-/**************************************************
- * Scheduling class queueing methods:
- */
-
-#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
-static void
-add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
-{
-       cfs_rq->task_weight += weight;
-}
-#else
-static inline void
-add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
-{
-}
-#endif
-
-static void
-account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-       update_load_add(&cfs_rq->load, se->load.weight);
-       if (!parent_entity(se))
-               inc_cpu_load(rq_of(cfs_rq), se->load.weight);
-       if (entity_is_task(se)) {
-               add_cfs_task_weight(cfs_rq, se->load.weight);
-               list_add(&se->group_node, &cfs_rq->tasks);
-       }
-       cfs_rq->nr_running++;
-}
-
-static void
-account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-       update_load_sub(&cfs_rq->load, se->load.weight);
-       if (!parent_entity(se))
-               dec_cpu_load(rq_of(cfs_rq), se->load.weight);
-       if (entity_is_task(se)) {
-               add_cfs_task_weight(cfs_rq, -se->load.weight);
-               list_del_init(&se->group_node);
-       }
-       cfs_rq->nr_running--;
-}
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-/* we need this in update_cfs_load and load-balance functions below */
-static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
-# ifdef CONFIG_SMP
-static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
-                                           int global_update)
-{
-       struct task_group *tg = cfs_rq->tg;
-       long load_avg;
-
-       load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
-       load_avg -= cfs_rq->load_contribution;
-
-       if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
-               atomic_add(load_avg, &tg->load_weight);
-               cfs_rq->load_contribution += load_avg;
-       }
-}
-
-static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
-{
-       u64 period = sysctl_sched_shares_window;
-       u64 now, delta;
-       unsigned long load = cfs_rq->load.weight;
-
-       if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq))
-               return;
-
-       now = rq_of(cfs_rq)->clock_task;
-       delta = now - cfs_rq->load_stamp;
-
-       /* truncate load history at 4 idle periods */
-       if (cfs_rq->load_stamp > cfs_rq->load_last &&
-           now - cfs_rq->load_last > 4 * period) {
-               cfs_rq->load_period = 0;
-               cfs_rq->load_avg = 0;
-               delta = period - 1;
-       }
-
-       cfs_rq->load_stamp = now;
-       cfs_rq->load_unacc_exec_time = 0;
-       cfs_rq->load_period += delta;
-       if (load) {
-               cfs_rq->load_last = now;
-               cfs_rq->load_avg += delta * load;
-       }
-
-       /* consider updating load contribution on each fold or truncate */
-       if (global_update || cfs_rq->load_period > period
-           || !cfs_rq->load_period)
-               update_cfs_rq_load_contribution(cfs_rq, global_update);
-
-       while (cfs_rq->load_period > period) {
-               /*
-                * Inline assembly required to prevent the compiler
-                * optimising this loop into a divmod call.
-                * See __iter_div_u64_rem() for another example of this.
-                */
-               asm("" : "+rm" (cfs_rq->load_period));
-               cfs_rq->load_period /= 2;
-               cfs_rq->load_avg /= 2;
-       }
-
-       if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
-               list_del_leaf_cfs_rq(cfs_rq);
-}
-
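The halving loop at the end of update_cfs_load() folds the averaging window: whenever load_period outgrows the shares window, both the window and the accumulated load are halved, so load_avg/load_period behaves like a decaying average without dividing by the elapsed time. A rough user-space sketch of that fold (10 ms window assumed):

#include <stdio.h>
#include <stdint.h>

typedef uint64_t u64;

int main(void)
{
    u64 window_ns   = 10000000ULL;   /* assumed shares window: 10 ms */
    u64 load_period = 0, load_avg = 0;
    int i;

    /* accumulate 25 ms of runtime at weight 1024, in 5 ms chunks */
    for (i = 0; i < 5; i++) {
        u64 delta = 5000000ULL;

        load_period += delta;
        load_avg    += delta * 1024;

        /* fold: halve window and load together once the window is full */
        while (load_period > window_ns) {
            load_period /= 2;
            load_avg    /= 2;
        }
    }

    /* prints ~1023: the running ratio tracks the entity's weight (1024),
     * while older history decays away with each fold */
    printf("load_avg / (load_period + 1) = %llu\n",
           (unsigned long long)(load_avg / (load_period + 1)));
    return 0;
}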
-static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
-{
-       long tg_weight;
-
-       /*
-        * Use this CPU's actual weight instead of the last load_contribution
-        * to gain a more accurate current total weight. See
-        * update_cfs_rq_load_contribution().
-        */
-       tg_weight = atomic_read(&tg->load_weight);
-       tg_weight -= cfs_rq->load_contribution;
-       tg_weight += cfs_rq->load.weight;
-
-       return tg_weight;
-}
-
-static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
-{
-       long tg_weight, load, shares;
-
-       tg_weight = calc_tg_weight(tg, cfs_rq);
-       load = cfs_rq->load.weight;
-
-       shares = (tg->shares * load);
-       if (tg_weight)
-               shares /= tg_weight;
-
-       if (shares < MIN_SHARES)
-               shares = MIN_SHARES;
-       if (shares > tg->shares)
-               shares = tg->shares;
-
-       return shares;
-}
-
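A standalone sketch of the share split above: a group's configured shares are divided across CPUs in proportion to each per-CPU runqueue's weight and clamped to [MIN_SHARES, tg->shares]. The MIN_SHARES value used here is only an assumption for the illustration:

#include <stdio.h>

#define EXAMPLE_MIN_SHARES 2L   /* assumed lower clamp */

/* shares_i = tg_shares * load_i / tg_weight, clamped to the valid range */
static long calc_shares(long tg_shares, long cfs_rq_load, long tg_weight)
{
    long shares = tg_shares * cfs_rq_load;

    if (tg_weight)
        shares /= tg_weight;
    if (shares < EXAMPLE_MIN_SHARES)
        shares = EXAMPLE_MIN_SHARES;
    if (shares > tg_shares)
        shares = tg_shares;
    return shares;
}

int main(void)
{
    /* a group with shares=1024 whose runnable load is split 3:1 across two CPUs */
    long tg_shares = 1024, load0 = 3072, load1 = 1024;
    long tg_weight = load0 + load1;

    printf("cpu0 entity weight: %ld\n", calc_shares(tg_shares, load0, tg_weight)); /* 768 */
    printf("cpu1 entity weight: %ld\n", calc_shares(tg_shares, load1, tg_weight)); /* 256 */
    return 0;
}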
-static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
-{
-       if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
-               update_cfs_load(cfs_rq, 0);
-               update_cfs_shares(cfs_rq);
-       }
-}
-# else /* CONFIG_SMP */
-static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
-{
-}
-
-static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
-{
-       return tg->shares;
-}
-
-static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
-{
-}
-# endif /* CONFIG_SMP */
-static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
-                           unsigned long weight)
-{
-       if (se->on_rq) {
-               /* commit outstanding execution time */
-               if (cfs_rq->curr == se)
-                       update_curr(cfs_rq);
-               account_entity_dequeue(cfs_rq, se);
-       }
-
-       update_load_set(&se->load, weight);
-
-       if (se->on_rq)
-               account_entity_enqueue(cfs_rq, se);
-}
-
-static void update_cfs_shares(struct cfs_rq *cfs_rq)
-{
-       struct task_group *tg;
-       struct sched_entity *se;
-       long shares;
-
-       tg = cfs_rq->tg;
-       se = tg->se[cpu_of(rq_of(cfs_rq))];
-       if (!se || throttled_hierarchy(cfs_rq))
-               return;
-#ifndef CONFIG_SMP
-       if (likely(se->load.weight == tg->shares))
-               return;
-#endif
-       shares = calc_cfs_shares(cfs_rq, tg);
-
-       reweight_entity(cfs_rq_of(se), se, shares);
-}
-#else /* CONFIG_FAIR_GROUP_SCHED */
-static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
-{
-}
-
-static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
-{
-}
-
-static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
-{
-}
-#endif /* CONFIG_FAIR_GROUP_SCHED */
-
-static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-#ifdef CONFIG_SCHEDSTATS
-       struct task_struct *tsk = NULL;
-
-       if (entity_is_task(se))
-               tsk = task_of(se);
-
-       if (se->statistics.sleep_start) {
-               u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;
-
-               if ((s64)delta < 0)
-                       delta = 0;
-
-               if (unlikely(delta > se->statistics.sleep_max))
-                       se->statistics.sleep_max = delta;
-
-               se->statistics.sleep_start = 0;
-               se->statistics.sum_sleep_runtime += delta;
-
-               if (tsk) {
-                       account_scheduler_latency(tsk, delta >> 10, 1);
-                       trace_sched_stat_sleep(tsk, delta);
-               }
-       }
-       if (se->statistics.block_start) {
-               u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;
-
-               if ((s64)delta < 0)
-                       delta = 0;
-
-               if (unlikely(delta > se->statistics.block_max))
-                       se->statistics.block_max = delta;
-
-               se->statistics.block_start = 0;
-               se->statistics.sum_sleep_runtime += delta;
-
-               if (tsk) {
-                       if (tsk->in_iowait) {
-                               se->statistics.iowait_sum += delta;
-                               se->statistics.iowait_count++;
-                               trace_sched_stat_iowait(tsk, delta);
-                       }
-
-                       /*
-                        * Blocking time is in units of nanosecs, so shift by
-                        * 20 to get a milliseconds-range estimation of the
-                        * amount of time that the task spent sleeping:
-                        */
-                       if (unlikely(prof_on == SLEEP_PROFILING)) {
-                               profile_hits(SLEEP_PROFILING,
-                                               (void *)get_wchan(tsk),
-                                               delta >> 20);
-                       }
-                       account_scheduler_latency(tsk, delta >> 10, 0);
-               }
-       }
-#endif
-}
-
-static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-#ifdef CONFIG_SCHED_DEBUG
-       s64 d = se->vruntime - cfs_rq->min_vruntime;
-
-       if (d < 0)
-               d = -d;
-
-       if (d > 3*sysctl_sched_latency)
-               schedstat_inc(cfs_rq, nr_spread_over);
-#endif
-}
-
-static void
-place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
-{
-       u64 vruntime = cfs_rq->min_vruntime;
-
-       /*
-        * The 'current' period is already promised to the current tasks,
-        * however the extra weight of the new task will slow them down a
-        * little, place the new task so that it fits in the slot that
-        * stays open at the end.
-        */
-       if (initial && sched_feat(START_DEBIT))
-               vruntime += sched_vslice(cfs_rq, se);
-
-       /* sleeps up to a single latency don't count. */
-       if (!initial) {
-               unsigned long thresh = sysctl_sched_latency;
-
-               /*
-                * Halve their sleep time's effect, to allow
-                * for a gentler effect of sleepers:
-                */
-               if (sched_feat(GENTLE_FAIR_SLEEPERS))
-                       thresh >>= 1;
-
-               vruntime -= thresh;
-       }
-
-       /* ensure we never gain time by being placed backwards. */
-       vruntime = max_vruntime(se->vruntime, vruntime);
-
-       se->vruntime = vruntime;
-}
-
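A simplified sketch of the placement policy above: a newly forked entity is pushed one vslice beyond min_vruntime (START_DEBIT), a waking sleeper is pulled back by at most half a latency period (GENTLE_FAIR_SLEEPERS), and neither is ever allowed to move backwards in virtual time. The vslice and latency values below are arbitrary example inputs:

#include <stdio.h>
#include <stdint.h>

typedef uint64_t u64;
typedef int64_t  s64;

static u64 max_vruntime(u64 a, u64 b)
{
    return ((s64)(b - a) > 0) ? b : a;
}

/* simplified place_entity(): feature flags and vslice are passed in directly */
static u64 place(u64 min_vruntime, u64 se_vruntime, int initial,
                 u64 vslice, u64 latency_ns, int gentle_sleepers)
{
    u64 vruntime = min_vruntime;

    if (initial)
        vruntime += vslice;                          /* START_DEBIT */
    else
        vruntime -= gentle_sleepers ? latency_ns / 2 : latency_ns;

    return max_vruntime(se_vruntime, vruntime);      /* never gain time */
}

int main(void)
{
    u64 minv = 100000000ULL, latency = 6000000ULL, vslice = 2000000ULL;

    printf("forked task placed at  : %llu\n",
           (unsigned long long)place(minv, 0, 1, vslice, latency, 1)); /* min + vslice    */
    printf("woken sleeper placed at: %llu\n",
           (unsigned long long)place(minv, 0, 0, vslice, latency, 1)); /* min - latency/2 */
    return 0;
}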
-static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
-
-static void
-enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
-{
-       /*
-        * Update the normalized vruntime before updating min_vruntime
-        * through calling update_curr().
-        */
-       if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
-               se->vruntime += cfs_rq->min_vruntime;
-
-       /*
-        * Update run-time statistics of the 'current'.
-        */
-       update_curr(cfs_rq);
-       update_cfs_load(cfs_rq, 0);
-       account_entity_enqueue(cfs_rq, se);
-       update_cfs_shares(cfs_rq);
-
-       if (flags & ENQUEUE_WAKEUP) {
-               place_entity(cfs_rq, se, 0);
-               enqueue_sleeper(cfs_rq, se);
-       }
-
-       update_stats_enqueue(cfs_rq, se);
-       check_spread(cfs_rq, se);
-       if (se != cfs_rq->curr)
-               __enqueue_entity(cfs_rq, se);
-       se->on_rq = 1;
-
-       if (cfs_rq->nr_running == 1) {
-               list_add_leaf_cfs_rq(cfs_rq);
-               check_enqueue_throttle(cfs_rq);
-       }
-}
-
-static void __clear_buddies_last(struct sched_entity *se)
-{
-       for_each_sched_entity(se) {
-               struct cfs_rq *cfs_rq = cfs_rq_of(se);
-               if (cfs_rq->last == se)
-                       cfs_rq->last = NULL;
-               else
-                       break;
-       }
-}
-
-static void __clear_buddies_next(struct sched_entity *se)
-{
-       for_each_sched_entity(se) {
-               struct cfs_rq *cfs_rq = cfs_rq_of(se);
-               if (cfs_rq->next == se)
-                       cfs_rq->next = NULL;
-               else
-                       break;
-       }
-}
-
-static void __clear_buddies_skip(struct sched_entity *se)
-{
-       for_each_sched_entity(se) {
-               struct cfs_rq *cfs_rq = cfs_rq_of(se);
-               if (cfs_rq->skip == se)
-                       cfs_rq->skip = NULL;
-               else
-                       break;
-       }
-}
-
-static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-       if (cfs_rq->last == se)
-               __clear_buddies_last(se);
-
-       if (cfs_rq->next == se)
-               __clear_buddies_next(se);
-
-       if (cfs_rq->skip == se)
-               __clear_buddies_skip(se);
-}
-
-static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
-
-static void
-dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
-{
-       /*
-        * Update run-time statistics of the 'current'.
-        */
-       update_curr(cfs_rq);
-
-       update_stats_dequeue(cfs_rq, se);
-       if (flags & DEQUEUE_SLEEP) {
-#ifdef CONFIG_SCHEDSTATS
-               if (entity_is_task(se)) {
-                       struct task_struct *tsk = task_of(se);
-
-                       if (tsk->state & TASK_INTERRUPTIBLE)
-                               se->statistics.sleep_start = rq_of(cfs_rq)->clock;
-                       if (tsk->state & TASK_UNINTERRUPTIBLE)
-                               se->statistics.block_start = rq_of(cfs_rq)->clock;
-               }
-#endif
-       }
-
-       clear_buddies(cfs_rq, se);
-
-       if (se != cfs_rq->curr)
-               __dequeue_entity(cfs_rq, se);
-       se->on_rq = 0;
-       update_cfs_load(cfs_rq, 0);
-       account_entity_dequeue(cfs_rq, se);
-
-       /*
-        * Normalize the entity after updating the min_vruntime because the
-        * update can refer to the ->curr item and we need to reflect this
-        * movement in our normalized position.
-        */
-       if (!(flags & DEQUEUE_SLEEP))
-               se->vruntime -= cfs_rq->min_vruntime;
-
-       /* return excess runtime on last dequeue */
-       return_cfs_rq_runtime(cfs_rq);
-
-       update_min_vruntime(cfs_rq);
-       update_cfs_shares(cfs_rq);
-}
-
-/*
- * Preempt the current task with a newly woken task if needed:
- */
-static void
-check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
-{
-       unsigned long ideal_runtime, delta_exec;
-       struct sched_entity *se;
-       s64 delta;
-
-       ideal_runtime = sched_slice(cfs_rq, curr);
-       delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
-       if (delta_exec > ideal_runtime) {
-               resched_task(rq_of(cfs_rq)->curr);
-               /*
-                * The current task ran long enough, ensure it doesn't get
-                * re-elected due to buddy favours.
-                */
-               clear_buddies(cfs_rq, curr);
-               return;
-       }
-
-       /*
-        * Ensure that a task that missed wakeup preemption by a
-        * narrow margin doesn't have to wait for a full slice.
-        * This also mitigates buddy induced latencies under load.
-        */
-       if (delta_exec < sysctl_sched_min_granularity)
-               return;
-
-       se = __pick_first_entity(cfs_rq);
-       delta = curr->vruntime - se->vruntime;
-
-       if (delta < 0)
-               return;
-
-       if (delta > ideal_runtime)
-               resched_task(rq_of(cfs_rq)->curr);
-}
-
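In short, the tick test above preempts when the running entity has used up its wall-clock slice, never before it has run for at least the minimum granularity, and also when its vruntime has drifted more than one slice ahead of the leftmost waiter. A compact sketch of that decision with made-up numbers:

#include <stdio.h>
#include <stdint.h>

typedef uint64_t u64;
typedef int64_t  s64;

/* returns 1 when the current entity should be preempted at this tick */
static int should_preempt(u64 delta_exec, u64 ideal_runtime, u64 min_granularity,
                          u64 curr_vruntime, u64 leftmost_vruntime)
{
    s64 vdelta;

    if (delta_exec > ideal_runtime)
        return 1;                        /* slice fully consumed */
    if (delta_exec < min_granularity)
        return 0;                        /* ran too briefly to bother */

    vdelta = (s64)(curr_vruntime - leftmost_vruntime);
    return vdelta > (s64)ideal_runtime;  /* far ahead of the leftmost waiter */
}

int main(void)
{
    /* 2 ms slice, 0.75 ms granularity; current ran only 1 ms of wall time
     * but its vruntime is already 3 ms ahead of the leftmost entity */
    printf("preempt: %d\n",
           should_preempt(1000000, 2000000, 750000, 13000000, 10000000)); /* 1 */
    return 0;
}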
-static void
-set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-       /* 'current' is not kept within the tree. */
-       if (se->on_rq) {
-               /*
-                * Any task has to be enqueued before it gets to execute on
-                * a CPU. So account for the time it spent waiting on the
-                * runqueue.
-                */
-               update_stats_wait_end(cfs_rq, se);
-               __dequeue_entity(cfs_rq, se);
-       }
-
-       update_stats_curr_start(cfs_rq, se);
-       cfs_rq->curr = se;
-#ifdef CONFIG_SCHEDSTATS
-       /*
-        * Track our maximum slice length, if the CPU's load is at
-        * least twice that of our own weight (i.e. don't track it
-        * when there are only lesser-weight tasks around):
-        */
-       if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
-               se->statistics.slice_max = max(se->statistics.slice_max,
-                       se->sum_exec_runtime - se->prev_sum_exec_runtime);
-       }
-#endif
-       se->prev_sum_exec_runtime = se->sum_exec_runtime;
-}
-
-static int
-wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
-
-/*
- * Pick the next process, keeping these things in mind, in this order:
- * 1) keep things fair between processes/task groups
- * 2) pick the "next" process, since someone really wants that to run
- * 3) pick the "last" process, for cache locality
- * 4) do not run the "skip" process, if something else is available
- */
-static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
-{
-       struct sched_entity *se = __pick_first_entity(cfs_rq);
-       struct sched_entity *left = se;
-
-       /*
-        * Avoid running the skip buddy, if running something else can
-        * be done without getting too unfair.
-        */
-       if (cfs_rq->skip == se) {
-               struct sched_entity *second = __pick_next_entity(se);
-               if (second && wakeup_preempt_entity(second, left) < 1)
-                       se = second;
-       }
-
-       /*
-        * Prefer last buddy, try to return the CPU to a preempted task.
-        */
-       if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
-               se = cfs_rq->last;
-
-       /*
-        * Someone really wants this to run. If it's not unfair, run it.
-        */
-       if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
-               se = cfs_rq->next;
-
-       clear_buddies(cfs_rq, se);
-
-       return se;
-}
-
-static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
-
-static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
-{
-       /*
-        * If still on the runqueue then deactivate_task()
-        * was not called and update_curr() has to be done:
-        */
-       if (prev->on_rq)
-               update_curr(cfs_rq);
-
-       /* throttle cfs_rqs exceeding runtime */
-       check_cfs_rq_runtime(cfs_rq);
-
-       check_spread(cfs_rq, prev);
-       if (prev->on_rq) {
-               update_stats_wait_start(cfs_rq, prev);
-               /* Put 'current' back into the tree. */
-               __enqueue_entity(cfs_rq, prev);
-       }
-       cfs_rq->curr = NULL;
-}
-
-static void
-entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
-{
-       /*
-        * Update run-time statistics of the 'current'.
-        */
-       update_curr(cfs_rq);
-
-       /*
-        * Update share accounting for long-running entities.
-        */
-       update_entity_shares_tick(cfs_rq);
-
-#ifdef CONFIG_SCHED_HRTICK
-       /*
-        * queued ticks are scheduled to match the slice, so don't bother
-        * validating it and just reschedule.
-        */
-       if (queued) {
-               resched_task(rq_of(cfs_rq)->curr);
-               return;
-       }
-       /*
-        * don't let the period tick interfere with the hrtick preemption
-        */
-       if (!sched_feat(DOUBLE_TICK) &&
-                       hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
-               return;
-#endif
-
-       if (cfs_rq->nr_running > 1)
-               check_preempt_tick(cfs_rq, curr);
-}
-
-
-/**************************************************
- * CFS bandwidth control machinery
- */
-
-#ifdef CONFIG_CFS_BANDWIDTH
-/*
- * default period for cfs group bandwidth.
- * default: 0.1s, units: nanoseconds
- */
-static inline u64 default_cfs_period(void)
-{
-       return 100000000ULL;
-}
-
-static inline u64 sched_cfs_bandwidth_slice(void)
-{
-       return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
-}
-
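To put the constants in context: with the 100 ms default period above, an example 25 ms quota caps the group at a quarter of one CPU, and runqueues draw that quota from the global pool one bandwidth slice at a time. A tiny sketch of that arithmetic (the 5 ms slice is an assumed value of sysctl_sched_cfs_bandwidth_slice):

#include <stdio.h>
#include <stdint.h>

typedef uint64_t u64;

int main(void)
{
    u64 period_ns = 100000000ULL;  /* default_cfs_period(): 100 ms          */
    u64 slice_ns  =   5000000ULL;  /* assumed per-cfs_rq refill slice: 5 ms */
    u64 quota_ns  =  25000000ULL;  /* example group quota: 25 ms per period */

    /* the group may consume at most quota/period worth of one CPU */
    printf("cpu cap        : %.0f%%\n", 100.0 * (double)quota_ns / (double)period_ns);
    /* and its runqueues can pull at most this many full slices per period */
    printf("slices / period: %llu\n", (unsigned long long)(quota_ns / slice_ns));
    return 0;
}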
-/*
- * Replenish runtime according to assigned quota and update expiration time.
- * We use sched_clock_cpu directly instead of rq->clock to avoid adding
- * additional synchronization around rq->lock.
- *
- * requires cfs_b->lock
- */
-static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
-{
-       u64 now;
-
-       if (cfs_b->quota == RUNTIME_INF)
-               return;
-
-       now = sched_clock_cpu(smp_processor_id());
-       cfs_b->runtime = cfs_b->quota;
-       cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
-}
-
-/* returns 0 on failure to allocate runtime */
-static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
-{
-       struct task_group *tg = cfs_rq->tg;
-       struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
-       u64 amount = 0, min_amount, expires;
-
-       /* note: this is a positive sum as runtime_remaining <= 0 */
-       min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
-
-       raw_spin_lock(&cfs_b->lock);
-       if (cfs_b->quota == RUNTIME_INF)
-               amount = min_amount;
-       else {
-               /*
-                * If the bandwidth pool has become inactive, then at least one
-                * period must have elapsed since the last consumption.
-                * Refresh the global state and ensure bandwidth timer becomes
-                * active.
-                */
-               if (!cfs_b->timer_active) {
-                       __refill_cfs_bandwidth_runtime(cfs_b);
-                       __start_cfs_bandwidth(cfs_b);
-               }
-
-               if (cfs_b->runtime > 0) {
-                       amount = min(cfs_b->runtime, min_amount);
-                       cfs_b->runtime -= amount;
-                       cfs_b->idle = 0;
-               }
-       }
-       expires = cfs_b->runtime_expires;
-       raw_spin_unlock(&cfs_b->lock);
-
-       cfs_rq->runtime_remaining += amount;
-       /*
-        * we may have advanced our local expiration to account for allowed
-        * spread between our sched_clock and the one on which runtime was
-        * issued.
-        */
-       if ((s64)(expires - cfs_rq->runtime_expires) > 0)
-               cfs_rq->runtime_expires = expires;
-
-       return cfs_rq->runtime_remaining > 0;
-}
-
-/*
- * Note: This depends on the synchronization provided by sched_clock and the
- * fact that rq->clock snapshots this value.
- */
-static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
-{
-       struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
-       struct rq *rq = rq_of(cfs_rq);
-
-       /* if the deadline is ahead of our clock, nothing to do */
-       if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0))
-               return;
-
-       if (cfs_rq->runtime_remaining < 0)
-               return;
-
-       /*
-        * If the local deadline has passed we have to consider the
-        * possibility that our sched_clock is 'fast' and the global deadline
-        * has not truly expired.
-        *
-        * Fortunately we can determine whether this is the case by checking
-        * whether the global deadline has advanced.
-        */
-
-       if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
-               /* extend local deadline, drift is bounded above by 2 ticks */
-               cfs_rq->runtime_expires += TICK_NSEC;
-       } else {
-               /* global deadline is ahead, expiration has passed */
-               cfs_rq->runtime_remaining = 0;
-       }
-}
-
-static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
-                                    unsigned long delta_exec)
-{
-       /* dock delta_exec before expiring quota (as it could span periods) */
-       cfs_rq->runtime_remaining -= delta_exec;
-       expire_cfs_rq_runtime(cfs_rq);
-
-       if (likely(cfs_rq->runtime_remaining > 0))
-               return;
-
-       /*
-        * if we're unable to extend our runtime we resched so that the active
-        * hierarchy can be throttled
-        */
-       if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
-               resched_task(rq_of(cfs_rq)->curr);
-}
-
-static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
-                                                  unsigned long delta_exec)
-{
-       if (!cfs_rq->runtime_enabled)
-               return;
-
-       __account_cfs_rq_runtime(cfs_rq, delta_exec);
-}
-
-static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
-{
-       return cfs_rq->throttled;
-}
-
-/* check whether cfs_rq, or any parent, is throttled */
-static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
-{
-       return cfs_rq->throttle_count;
-}
-
-/*
- * Ensure that neither of the group entities corresponding to src_cpu or
- * dest_cpu are members of a throttled hierarchy when performing group
- * load-balance operations.
- */
-static inline int throttled_lb_pair(struct task_group *tg,
-                                   int src_cpu, int dest_cpu)
-{
-       struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
-
-       src_cfs_rq = tg->cfs_rq[src_cpu];
-       dest_cfs_rq = tg->cfs_rq[dest_cpu];
-
-       return throttled_hierarchy(src_cfs_rq) ||
-              throttled_hierarchy(dest_cfs_rq);
-}
-
-/* updated child weight may affect parent so we have to do this bottom up */
-static int tg_unthrottle_up(struct task_group *tg, void *data)
-{
-       struct rq *rq = data;
-       struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
-
-       cfs_rq->throttle_count--;
-#ifdef CONFIG_SMP
-       if (!cfs_rq->throttle_count) {
-               u64 delta = rq->clock_task - cfs_rq->load_stamp;
-
-               /* leaving throttled state, advance shares averaging windows */
-               cfs_rq->load_stamp += delta;
-               cfs_rq->load_last += delta;
-
-               /* update entity weight now that we are on_rq again */
-               update_cfs_shares(cfs_rq);
-       }
-#endif
-
-       return 0;
-}
-
-static int tg_throttle_down(struct task_group *tg, void *data)
-{
-       struct rq *rq = data;
-       struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
-
-       /* group is entering throttled state, record last load */
-       if (!cfs_rq->throttle_count)
-               update_cfs_load(cfs_rq, 0);
-       cfs_rq->throttle_count++;
-
-       return 0;
-}
-
-static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
-{
-       struct rq *rq = rq_of(cfs_rq);
-       struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
-       struct sched_entity *se;
-       long task_delta, dequeue = 1;
-
-       se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
-
-       /* account load preceding throttle */
-       rcu_read_lock();
-       walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
-       rcu_read_unlock();
-
-       task_delta = cfs_rq->h_nr_running;
-       for_each_sched_entity(se) {
-               struct cfs_rq *qcfs_rq = cfs_rq_of(se);
-               /* throttled entity or throttle-on-deactivate */
-               if (!se->on_rq)
-                       break;
-
-               if (dequeue)
-                       dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
-               qcfs_rq->h_nr_running -= task_delta;
-
-               if (qcfs_rq->load.weight)
-                       dequeue = 0;
-       }
-
-       if (!se)
-               rq->nr_running -= task_delta;
-
-       cfs_rq->throttled = 1;
-       cfs_rq->throttled_timestamp = rq->clock;
-       raw_spin_lock(&cfs_b->lock);
-       list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
-       raw_spin_unlock(&cfs_b->lock);
-}
-
-static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
-{
-       struct rq *rq = rq_of(cfs_rq);
-       struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
-       struct sched_entity *se;
-       int enqueue = 1;
-       long task_delta;
-
-       se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
-
-       cfs_rq->throttled = 0;
-       raw_spin_lock(&cfs_b->lock);
-       cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp;
-       list_del_rcu(&cfs_rq->throttled_list);
-       raw_spin_unlock(&cfs_b->lock);
-       cfs_rq->throttled_timestamp = 0;
-
-       update_rq_clock(rq);
-       /* update hierarchical throttle state */
-       walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
-
-       if (!cfs_rq->load.weight)
-               return;
-
-       task_delta = cfs_rq->h_nr_running;
-       for_each_sched_entity(se) {
-               if (se->on_rq)
-                       enqueue = 0;
-
-               cfs_rq = cfs_rq_of(se);
-               if (enqueue)
-                       enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
-               cfs_rq->h_nr_running += task_delta;
-
-               if (cfs_rq_throttled(cfs_rq))
-                       break;
-       }
-
-       if (!se)
-               rq->nr_running += task_delta;
-
-       /* determine whether we need to wake up potentially idle cpu */
-       if (rq->curr == rq->idle && rq->cfs.nr_running)
-               resched_task(rq->curr);
-}
-
-static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
-               u64 remaining, u64 expires)
-{
-       struct cfs_rq *cfs_rq;
-       u64 runtime = remaining;
-
-       rcu_read_lock();
-       list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
-                               throttled_list) {
-               struct rq *rq = rq_of(cfs_rq);
-
-               raw_spin_lock(&rq->lock);
-               if (!cfs_rq_throttled(cfs_rq))
-                       goto next;
-
-               runtime = -cfs_rq->runtime_remaining + 1;
-               if (runtime > remaining)
-                       runtime = remaining;
-               remaining -= runtime;
-
-               cfs_rq->runtime_remaining += runtime;
-               cfs_rq->runtime_expires = expires;
-
-               /* we check whether we're throttled above */
-               if (cfs_rq->runtime_remaining > 0)
-                       unthrottle_cfs_rq(cfs_rq);
-
-next:
-               raw_spin_unlock(&rq->lock);
-
-               if (!remaining)
-                       break;
-       }
-       rcu_read_unlock();
-
-       return remaining;
-}
-
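A user-space sketch of the distribution loop above: every throttled runqueue is topped up to a runtime_remaining of exactly +1 ns (just enough to unthrottle it), until the refilled pool runs dry. The deficits and pool size are made-up numbers:

#include <stdio.h>
#include <stdint.h>

typedef uint64_t u64;
typedef int64_t  s64;

int main(void)
{
    /* runtime_remaining of three throttled cfs_rqs (<= 0 by definition) */
    s64 deficit[3] = { -3000000, -1000000, -6000000 };
    u64 remaining  = 5000000ULL;   /* freshly refilled pool: 5 ms */
    int i;

    for (i = 0; i < 3 && remaining; i++) {
        /* hand out just enough to reach +1, capped by what is left */
        u64 runtime = (u64)(-deficit[i]) + 1;

        if (runtime > remaining)
            runtime = remaining;
        remaining  -= runtime;
        deficit[i] += runtime;

        printf("cfs_rq %d gets %7llu ns -> remaining %8lld (%s)\n",
               i, (unsigned long long)runtime, (long long)deficit[i],
               deficit[i] > 0 ? "unthrottled" : "still throttled");
    }
    printf("pool left: %llu ns\n", (unsigned long long)remaining);
    return 0;
}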
-/*
- * Responsible for refilling a task_group's bandwidth and unthrottling its
- * cfs_rqs as appropriate. If there has been no activity within the last
- * period the timer is deactivated until scheduling resumes; cfs_b->idle is
- * used to track this state.
- */
-static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
-{
-       u64 runtime, runtime_expires;
-       int idle = 1, throttled;
-
-       raw_spin_lock(&cfs_b->lock);
-       /* no need to continue the timer with no bandwidth constraint */
-       if (cfs_b->quota == RUNTIME_INF)
-               goto out_unlock;
-
-       throttled = !list_empty(&cfs_b->throttled_cfs_rq);
-       /* idle depends on !throttled (for the case of a large deficit) */
-       idle = cfs_b->idle && !throttled;
-       cfs_b->nr_periods += overrun;
-
-       /* if we're going inactive then everything else can be deferred */
-       if (idle)
-               goto out_unlock;
-
-       __refill_cfs_bandwidth_runtime(cfs_b);
-
-       if (!throttled) {
-               /* mark as potentially idle for the upcoming period */
-               cfs_b->idle = 1;
-               goto out_unlock;
-       }
-
-       /* account preceding periods in which throttling occurred */
-       cfs_b->nr_throttled += overrun;
-
-       /*
-        * There are throttled entities so we must first use the new bandwidth
-        * to unthrottle them before making it generally available.  This
-        * ensures that all existing debts will be paid before a new cfs_rq is
-        * allowed to run.
-        */
-       runtime = cfs_b->runtime;
-       runtime_expires = cfs_b->runtime_expires;
-       cfs_b->runtime = 0;
-
-       /*
-        * This check is repeated as we are holding onto the new bandwidth
-        * while we unthrottle.  This can potentially race with an unthrottled
-        * group trying to acquire new bandwidth from the global pool.
-        */
-       while (throttled && runtime > 0) {
-               raw_spin_unlock(&cfs_b->lock);
-               /* we can't nest cfs_b->lock while distributing bandwidth */
-               runtime = distribute_cfs_runtime(cfs_b, runtime,
-                                                runtime_expires);
-               raw_spin_lock(&cfs_b->lock);
-
-               throttled = !list_empty(&cfs_b->throttled_cfs_rq);
-       }
-
-       /* return (any) remaining runtime */
-       cfs_b->runtime = runtime;
-       /*
-        * While we are ensured activity in the period following an
-        * unthrottle, this also covers the case in which the new bandwidth is
-        * insufficient to cover the existing bandwidth deficit.  (Forcing the
-        * timer to remain active while there are any throttled entities.)
-        */
-       cfs_b->idle = 0;
-out_unlock:
-       if (idle)
-               cfs_b->timer_active = 0;
-       raw_spin_unlock(&cfs_b->lock);
-
-       return idle;
-}
-
-/* a cfs_rq won't donate quota below this amount */
-static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
-/* minimum remaining period time to redistribute slack quota */
-static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
-/* how long we wait to gather additional slack before distributing */
-static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
-
-/* are we near the end of the current quota period? */
-static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
-{
-       struct hrtimer *refresh_timer = &cfs_b->period_timer;
-       u64 remaining;
-
-       /* if the callback is running, a quota refresh is already occurring */
-       if (hrtimer_callback_running(refresh_timer))
-               return 1;
-
-       /* is a quota refresh about to occur? */
-       remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
-       if (remaining < min_expire)
-               return 1;
-
-       return 0;
-}
-
-static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
-{
-       u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
-
-       /* if there's a quota refresh soon don't bother with slack */
-       if (runtime_refresh_within(cfs_b, min_left))
-               return;
-
-       start_bandwidth_timer(&cfs_b->slack_timer,
-                               ns_to_ktime(cfs_bandwidth_slack_period));
-}
-
-/* we know any runtime found here is valid as update_curr() precedes return */
-static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
-{
-       struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
-       s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
-
-       if (slack_runtime <= 0)
-               return;
-
-       raw_spin_lock(&cfs_b->lock);
-       if (cfs_b->quota != RUNTIME_INF &&
-           cfs_rq->runtime_expires == cfs_b->runtime_expires) {
-               cfs_b->runtime += slack_runtime;
-
-               /* we are under rq->lock, defer unthrottling using a timer */
-               if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
-                   !list_empty(&cfs_b->throttled_cfs_rq))
-                       start_cfs_slack_bandwidth(cfs_b);
-       }
-       raw_spin_unlock(&cfs_b->lock);
-
-       /* even if it's not valid for return we don't want to try again */
-       cfs_rq->runtime_remaining -= slack_runtime;
-}
-
-static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
-{
-       if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
-               return;
-
-       __return_cfs_rq_runtime(cfs_rq);
-}
-
-/*
- * This is done with a timer (instead of inline with bandwidth return) since
- * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
- */
-static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
-{
-       u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
-       u64 expires;
-
-       /* confirm we're still not at a refresh boundary */
-       if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
-               return;
-
-       raw_spin_lock(&cfs_b->lock);
-       if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
-               runtime = cfs_b->runtime;
-               cfs_b->runtime = 0;
-       }
-       expires = cfs_b->runtime_expires;
-       raw_spin_unlock(&cfs_b->lock);
-
-       if (!runtime)
-               return;
-
-       runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
-
-       raw_spin_lock(&cfs_b->lock);
-       if (expires == cfs_b->runtime_expires)
-               cfs_b->runtime = runtime;
-       raw_spin_unlock(&cfs_b->lock);
-}
-
-/*
- * When a group wakes up we want to make sure that its quota is not already
- * expired/exceeded, otherwise it may be allowed to steal additional ticks of
- * runtime as update_curr() throttling cannot trigger until it's on-rq.
- */
-static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
-{
-       /* an active group must be handled by the update_curr()->put() path */
-       if (!cfs_rq->runtime_enabled || cfs_rq->curr)
-               return;
-
-       /* ensure the group is not already throttled */
-       if (cfs_rq_throttled(cfs_rq))
-               return;
-
-       /* update runtime allocation */
-       account_cfs_rq_runtime(cfs_rq, 0);
-       if (cfs_rq->runtime_remaining <= 0)
-               throttle_cfs_rq(cfs_rq);
-}
-
-/* conditionally throttle active cfs_rq's from put_prev_entity() */
-static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
-{
-       if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
-               return;
-
-       /*
-        * it's possible for a throttled entity to be forced into a running
-        * state (e.g. set_curr_task); in this case we're finished.
-        */
-       if (cfs_rq_throttled(cfs_rq))
-               return;
-
-       throttle_cfs_rq(cfs_rq);
-}
-#else
-static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
-                                    unsigned long delta_exec) {}
-static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
-static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
-static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
-
-static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
-{
-       return 0;
-}
-
-static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
-{
-       return 0;
-}
-
-static inline int throttled_lb_pair(struct task_group *tg,
-                                   int src_cpu, int dest_cpu)
-{
-       return 0;
-}
-#endif
-
-/**************************************************
- * CFS operations on tasks:
- */
-
-#ifdef CONFIG_SCHED_HRTICK
-static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
-{
-       struct sched_entity *se = &p->se;
-       struct cfs_rq *cfs_rq = cfs_rq_of(se);
-
-       WARN_ON(task_rq(p) != rq);
-
-       if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) {
-               u64 slice = sched_slice(cfs_rq, se);
-               u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
-               s64 delta = slice - ran;
-
-               if (delta < 0) {
-                       if (rq->curr == p)
-                               resched_task(p);
-                       return;
-               }
-
-               /*
-                * Don't schedule slices shorter than 10000ns; that just
-                * doesn't make sense. Rely on vruntime for fairness.
-                */
-               if (rq->curr != p)
-                       delta = max_t(s64, 10000LL, delta);
-
-               hrtick_start(rq, delta);
-       }
-}
-
-/*
- * called from enqueue/dequeue and updates the hrtick when the
- * current task is from our class and nr_running is low enough
- * to matter.
- */
-static void hrtick_update(struct rq *rq)
-{
-       struct task_struct *curr = rq->curr;
-
-       if (curr->sched_class != &fair_sched_class)
-               return;
-
-       if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
-               hrtick_start_fair(rq, curr);
-}
-#else /* !CONFIG_SCHED_HRTICK */
-static inline void
-hrtick_start_fair(struct rq *rq, struct task_struct *p)
-{
-}
-
-static inline void hrtick_update(struct rq *rq)
-{
-}
-#endif
-
-/*
- * The enqueue_task method is called before nr_running is
- * increased. Here we update the fair scheduling stats and
- * then put the task into the rbtree:
- */
-static void
-enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
-{
-       struct cfs_rq *cfs_rq;
-       struct sched_entity *se = &p->se;
-
-       for_each_sched_entity(se) {
-               if (se->on_rq)
-                       break;
-               cfs_rq = cfs_rq_of(se);
-               enqueue_entity(cfs_rq, se, flags);
-
-               /*
-                * end evaluation on encountering a throttled cfs_rq
-                *
-                * note: in the case of encountering a throttled cfs_rq we will
-                * post the final h_nr_running increment below.
-               */
-               if (cfs_rq_throttled(cfs_rq))
-                       break;
-               cfs_rq->h_nr_running++;
-
-               flags = ENQUEUE_WAKEUP;
-       }
-
-       for_each_sched_entity(se) {
-               cfs_rq = cfs_rq_of(se);
-               cfs_rq->h_nr_running++;
-
-               if (cfs_rq_throttled(cfs_rq))
-                       break;
-
-               update_cfs_load(cfs_rq, 0);
-               update_cfs_shares(cfs_rq);
-       }
-
-       if (!se)
-               inc_nr_running(rq);
-       hrtick_update(rq);
-}
-
-static void set_next_buddy(struct sched_entity *se);
-
-/*
- * The dequeue_task method is called before nr_running is
- * decreased. We remove the task from the rbtree and
- * update the fair scheduling stats:
- */
-static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
-{
-       struct cfs_rq *cfs_rq;
-       struct sched_entity *se = &p->se;
-       int task_sleep = flags & DEQUEUE_SLEEP;
-
-       for_each_sched_entity(se) {
-               cfs_rq = cfs_rq_of(se);
-               dequeue_entity(cfs_rq, se, flags);
-
-               /*
-                * end evaluation on encountering a throttled cfs_rq
-                *
-                * note: in the case of encountering a throttled cfs_rq we will
-                * post the final h_nr_running decrement below.
-               */
-               if (cfs_rq_throttled(cfs_rq))
-                       break;
-               cfs_rq->h_nr_running--;
-
-               /* Don't dequeue parent if it has other entities besides us */
-               if (cfs_rq->load.weight) {
-                       /*
-                        * Bias pick_next to pick a task from this cfs_rq, as
-                        * p is sleeping when it is within its sched_slice.
-                        */
-                       if (task_sleep && parent_entity(se))
-                               set_next_buddy(parent_entity(se));
-
-                       /* avoid re-evaluating load for this entity */
-                       se = parent_entity(se);
-                       break;
-               }
-               flags |= DEQUEUE_SLEEP;
-       }
-
-       for_each_sched_entity(se) {
-               cfs_rq = cfs_rq_of(se);
-               cfs_rq->h_nr_running--;
-
-               if (cfs_rq_throttled(cfs_rq))
-                       break;
-
-               update_cfs_load(cfs_rq, 0);
-               update_cfs_shares(cfs_rq);
-       }
-
-       if (!se)
-               dec_nr_running(rq);
-       hrtick_update(rq);
-}
-
-#ifdef CONFIG_SMP
-
-static void task_waking_fair(struct task_struct *p)
-{
-       struct sched_entity *se = &p->se;
-       struct cfs_rq *cfs_rq = cfs_rq_of(se);
-       u64 min_vruntime;
-
-#ifndef CONFIG_64BIT
-       u64 min_vruntime_copy;
-
-       do {
-               min_vruntime_copy = cfs_rq->min_vruntime_copy;
-               smp_rmb();
-               min_vruntime = cfs_rq->min_vruntime;
-       } while (min_vruntime != min_vruntime_copy);
-#else
-       min_vruntime = cfs_rq->min_vruntime;
-#endif
-
-       se->vruntime -= min_vruntime;
-}
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-/*
- * effective_load() calculates the load change as seen from the root_task_group
- *
- * Adding load to a group doesn't make a group heavier, but can cause movement
- * of group shares between cpus. Assuming the shares were perfectly aligned one
- * can calculate the shift in shares.
- *
- * Calculate the effective load difference if @wl is added (subtracted) to @tg
- * on this @cpu and results in a total addition (subtraction) of @wg to the
- * total group weight.
- *
- * Given a runqueue weight distribution (rw_i) we can compute a shares
- * distribution (s_i) using:
- *
- *   s_i = rw_i / \Sum rw_j                                            (1)
- *
- * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
- * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
- * shares distribution (s_i):
- *
- *   rw_i = {   2,   4,   1,   0 }
- *   s_i  = { 2/7, 4/7, 1/7,   0 }
- *
- * As per wake_affine() we're interested in the load of two CPUs (the CPU the
- * task used to run on and the CPU the waker is running on), we need to
- * compute the effect of waking a task on either CPU and, in case of a sync
- * wakeup, compute the effect of the current task going to sleep.
- *
- * So for a change of @wl to the local @cpu with an overall group weight change
- * of @wl we can compute the new shares distribution (s'_i) using:
- *
- *   s'_i = (rw_i + @wl) / (@wg + \Sum rw_j)                           (2)
- *
- * Suppose we're interested in CPUs 0 and 1, and want to compute the load
- * differences in waking a task to CPU 0. The additional task changes the
- * weight and shares distributions like:
- *
- *   rw'_i = {   3,   4,   1,   0 }
- *   s'_i  = { 3/8, 4/8, 1/8,   0 }
- *
- * We can then compute the difference in effective weight by using:
- *
- *   dw_i = S * (s'_i - s_i)                                           (3)
- *
- * Where 'S' is the group weight as seen by its parent.
- *
- * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
- * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
- * 4/7) times the weight of the group.
- */
-static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
-{
-       struct sched_entity *se = tg->se[cpu];
-
-       if (!tg->parent)        /* the trivial, non-cgroup case */
-               return wl;
-
-       for_each_sched_entity(se) {
-               long w, W;
-
-               tg = se->my_q->tg;
-
-               /*
-                * W = @wg + \Sum rw_j
-                */
-               W = wg + calc_tg_weight(tg, se->my_q);
-
-               /*
-                * w = rw_i + @wl
-                */
-               w = se->my_q->load.weight + wl;
-
-               /*
-                * wl = S * s'_i; see (2)
-                */
-               if (W > 0 && w < W)
-                       wl = (w * tg->shares) / W;
-               else
-                       wl = tg->shares;
-
-               /*
-                * Per the above, wl is the new se->load.weight value; since
-                * those are clipped to [MIN_SHARES, ...) do so now. See
-                * calc_cfs_shares().
-                */
-               if (wl < MIN_SHARES)
-                       wl = MIN_SHARES;
-
-               /*
-                * wl = dw_i = S * (s'_i - s_i); see (3)
-                */
-               wl -= se->load.weight;
-
-               /*
-                * Recursively apply this logic to all parent groups to compute
-                * the final effective load change on the root group. Since
-                * only the @tg group gets extra weight, all parent groups can
-                * only redistribute existing shares. @wl is the shift in shares
-                * resulting from this level per the above.
-                */
-               wg = 0;
-       }
-
-       return wl;
-}
-#else
-
-static inline unsigned long effective_load(struct task_group *tg, int cpu,
-               unsigned long wl, unsigned long wg)
-{
-       return wl;
-}
-
-#endif
-
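The worked example in the effective_load() comment block above can be reproduced directly. A floating-point sketch for the rw = {2, 4, 1, 0} distribution, with the parent-visible group weight S set to 1 so that the answers come out as plain fractions:

#include <stdio.h>

int main(void)
{
    double rw[4] = { 2, 4, 1, 0 };                 /* per-cpu runqueue weights */
    double sum   = rw[0] + rw[1] + rw[2] + rw[3];  /* 7 */
    double S     = 1.0;                            /* group weight seen by the parent */
    double wl    = 1.0;                            /* one extra task added on cpu 0 */

    /* s_i = rw_i / sum;  s'_i = (rw_i + wl_i) / (sum + wl);  dw_i = S*(s'_i - s_i) */
    double dw0 = S * ((rw[0] + wl) / (sum + wl) - rw[0] / sum);
    double dw1 = S * ( rw[1]       / (sum + wl) - rw[1] / sum);

    printf("dw_0 = %+.6f (expected  5/56 = %+.6f)\n", dw0,  5.0 / 56.0);
    printf("dw_1 = %+.6f (expected -4/56 = %+.6f)\n", dw1, -4.0 / 56.0);
    return 0;
}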
-static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
-{
-       s64 this_load, load;
-       int idx, this_cpu, prev_cpu;
-       unsigned long tl_per_task;
-       struct task_group *tg;
-       unsigned long weight;
-       int balanced;
-
-       idx       = sd->wake_idx;
-       this_cpu  = smp_processor_id();
-       prev_cpu  = task_cpu(p);
-       load      = source_load(prev_cpu, idx);
-       this_load = target_load(this_cpu, idx);
-
-       /*
-        * If sync wakeup then subtract the (maximum possible)
-        * effect of the currently running task from the load
-        * of the current CPU:
-        */
-       if (sync) {
-               tg = task_group(current);
-               weight = current->se.load.weight;
-
-               this_load += effective_load(tg, this_cpu, -weight, -weight);
-               load += effective_load(tg, prev_cpu, 0, -weight);
-       }
-
-       tg = task_group(p);
-       weight = p->se.load.weight;
-
-       /*
-        * In low-load situations, where prev_cpu is idle and this_cpu is idle
-        * due to the sync adjustment above having dropped this_load to 0, we'll
-        * always have an imbalance, but there's really nothing you can do
-        * about that, so that's good too.
-        *
-        * Otherwise check whether the two CPUs are close enough in load to
-        * allow this task to be woken on this_cpu.
-        */
-       if (this_load > 0) {
-               s64 this_eff_load, prev_eff_load;
-
-               this_eff_load = 100;
-               this_eff_load *= power_of(prev_cpu);
-               this_eff_load *= this_load +
-                       effective_load(tg, this_cpu, weight, weight);
-
-               prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
-               prev_eff_load *= power_of(this_cpu);
-               prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
-
-               balanced = this_eff_load <= prev_eff_load;
-       } else
-               balanced = true;
-
-       /*
-        * If the currently running task will sleep within
-        * a reasonable amount of time then attract this newly
-        * woken task:
-        */
-       if (sync && balanced)
-               return 1;
-
-       schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
-       tl_per_task = cpu_avg_load_per_task(this_cpu);
-
-       if (balanced ||
-           (this_load <= load &&
-            this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
-               /*
-                * This domain has SD_WAKE_AFFINE and
-                * p is cache cold in this domain, and
-                * there is no bad imbalance.
-                */
-               schedstat_inc(sd, ttwu_move_affine);
-               schedstat_inc(p, se.statistics.nr_wakeups_affine);
-
-               return 1;
-       }
-       return 0;
-}
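
wake_affine() above compares loads scaled by cpu power, and the previous cpu's side is padded by half of the domain's imbalance_pct. A small sketch with made-up loads (the effective_load() adjustments already folded in), equal cpu power on both sides, and imbalance_pct assumed to be 125, i.e. a 12% margin:

#include <stdio.h>

static int balanced(long long this_load, long long prev_load,
                    long long this_power, long long prev_power,
                    int imbalance_pct)
{
        long long this_eff_load = 100 * prev_power * this_load;
        long long prev_eff_load = (100 + (imbalance_pct - 100) / 2) *
                                  this_power * prev_load;

        return this_eff_load <= prev_eff_load;
}

int main(void)
{
        /* the waking cpu is slightly busier, but inside the 12% margin */
        printf("balanced = %d\n", balanced(2048, 1900, 1024, 1024, 125));
        return 0;
}

Here 100 * 2048 <= 112 * 1900, so the wakeup still counts as balanced and may be pulled to the waking cpu.
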
-
-/*
- * find_idlest_group finds and returns the least busy CPU group within the
- * domain.
- */
-static struct sched_group *
-find_idlest_group(struct sched_domain *sd, struct task_struct *p,
-                 int this_cpu, int load_idx)
-{
-       struct sched_group *idlest = NULL, *group = sd->groups;
-       unsigned long min_load = ULONG_MAX, this_load = 0;
-       int imbalance = 100 + (sd->imbalance_pct-100)/2;
-
-       do {
-               unsigned long load, avg_load;
-               int local_group;
-               int i;
-
-               /* Skip over this group if it has no CPUs allowed */
-               if (!cpumask_intersects(sched_group_cpus(group),
-                                       tsk_cpus_allowed(p)))
-                       continue;
-
-               local_group = cpumask_test_cpu(this_cpu,
-                                              sched_group_cpus(group));
-
-               /* Tally up the load of all CPUs in the group */
-               avg_load = 0;
-
-               for_each_cpu(i, sched_group_cpus(group)) {
-                       /* Bias balancing toward cpus of our domain */
-                       if (local_group)
-                               load = source_load(i, load_idx);
-                       else
-                               load = target_load(i, load_idx);
-
-                       avg_load += load;
-               }
-
-               /* Adjust by relative CPU power of the group */
-               avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
-
-               if (local_group) {
-                       this_load = avg_load;
-               } else if (avg_load < min_load) {
-                       min_load = avg_load;
-                       idlest = group;
-               }
-       } while (group = group->next, group != sd->groups);
-
-       if (!idlest || 100*this_load < imbalance*min_load)
-               return NULL;
-       return idlest;
-}
-
-/*
- * find_idlest_cpu - find the idlest cpu among the cpus in group.
- */
-static int
-find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
-{
-       unsigned long load, min_load = ULONG_MAX;
-       int idlest = -1;
-       int i;
-
-       /* Traverse only the allowed CPUs */
-       for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
-               load = weighted_cpuload(i);
-
-               if (load < min_load || (load == min_load && i == this_cpu)) {
-                       min_load = load;
-                       idlest = i;
-               }
-       }
-
-       return idlest;
-}
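
find_idlest_group() only reports a remote group when it is clearly less loaded than the local one; otherwise the caller descends to a lower domain level. A sketch of that acceptance test, with imbalance_pct assumed to be 125:

#include <stdio.h>

static int use_remote_group(unsigned long this_load, unsigned long min_load,
                            unsigned int imbalance_pct)
{
        unsigned int imbalance = 100 + (imbalance_pct - 100) / 2;

        /* mirrors: return NULL if 100*this_load < imbalance*min_load */
        return 100 * this_load >= imbalance * min_load;
}

int main(void)
{
        printf("%d\n", use_remote_group(1200, 1000, 125));      /* 1: spread out */
        printf("%d\n", use_remote_group(1200, 1100, 125));      /* 0: stay local */
        return 0;
}

With this_load at 1200, a remote group at 1000 clears the ~12% bar, while one at 1100 does not.
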
-
-/*
- * Try and locate an idle CPU in the sched_domain.
- */
-static int select_idle_sibling(struct task_struct *p, int target)
-{
-       int cpu = smp_processor_id();
-       int prev_cpu = task_cpu(p);
-       struct sched_domain *sd;
-       struct sched_group *sg;
-       int i, smt = 0;
-
-       /*
-        * If the task is going to be woken-up on this cpu and if it is
-        * already idle, then it is the right target.
-        */
-       if (target == cpu && idle_cpu(cpu))
-               return cpu;
-
-       /*
-        * If the task is going to be woken-up on the cpu where it previously
-        * ran and if it is currently idle, then it is the right target.
-        */
-       if (target == prev_cpu && idle_cpu(prev_cpu))
-               return prev_cpu;
-
-       /*
-        * Otherwise, iterate the domains and find an eligible idle cpu.
-        */
-       rcu_read_lock();
-again:
-       for_each_domain(target, sd) {
-               if (!smt && (sd->flags & SD_SHARE_CPUPOWER))
-                       continue;
-
-               if (smt && !(sd->flags & SD_SHARE_CPUPOWER))
-                       break;
-
-               if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
-                       break;
-
-               sg = sd->groups;
-               do {
-                       if (!cpumask_intersects(sched_group_cpus(sg),
-                                               tsk_cpus_allowed(p)))
-                               goto next;
-
-                       for_each_cpu(i, sched_group_cpus(sg)) {
-                               if (!idle_cpu(i))
-                                       goto next;
-                       }
-
-                       target = cpumask_first_and(sched_group_cpus(sg),
-                                       tsk_cpus_allowed(p));
-                       goto done;
-next:
-                       sg = sg->next;
-               } while (sg != sd->groups);
-       }
-       if (!smt) {
-               smt = 1;
-               goto again;
-       }
-done:
-       rcu_read_unlock();
-
-       return target;
-}
-
-/*
- * sched_balance_self: balance the current task (running on cpu) in domains
- * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
- * SD_BALANCE_EXEC.
- *
- * Balance, i.e. select the least loaded group.
- *
- * Returns the target CPU number, or the same CPU if no balancing is needed.
- *
- * preempt must be disabled.
- */
-static int
-select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
-{
-       struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
-       int cpu = smp_processor_id();
-       int prev_cpu = task_cpu(p);
-       int new_cpu = cpu;
-       int want_affine = 0;
-       int want_sd = 1;
-       int sync = wake_flags & WF_SYNC;
-
-       if (sd_flag & SD_BALANCE_WAKE) {
-               if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
-                       want_affine = 1;
-               new_cpu = prev_cpu;
-       }
-
-       rcu_read_lock();
-       for_each_domain(cpu, tmp) {
-               if (!(tmp->flags & SD_LOAD_BALANCE))
-                       continue;
-
-               /*
-                * If power savings logic is enabled for a domain, see if we
-                * are not overloaded, if so, don't balance wider.
-                */
-               if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) {
-                       unsigned long power = 0;
-                       unsigned long nr_running = 0;
-                       unsigned long capacity;
-                       int i;
-
-                       for_each_cpu(i, sched_domain_span(tmp)) {
-                               power += power_of(i);
-                               nr_running += cpu_rq(i)->cfs.nr_running;
-                       }
-
-                       capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
-
-                       if (tmp->flags & SD_POWERSAVINGS_BALANCE)
-                               nr_running /= 2;
-
-                       if (nr_running < capacity)
-                               want_sd = 0;
-               }
-
-               /*
-                * If both cpu and prev_cpu are part of this domain,
-                * cpu is a valid SD_WAKE_AFFINE target.
-                */
-               if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
-                   cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
-                       affine_sd = tmp;
-                       want_affine = 0;
-               }
-
-               if (!want_sd && !want_affine)
-                       break;
-
-               if (!(tmp->flags & sd_flag))
-                       continue;
-
-               if (want_sd)
-                       sd = tmp;
-       }
-
-       if (affine_sd) {
-               if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
-                       prev_cpu = cpu;
-
-               new_cpu = select_idle_sibling(p, prev_cpu);
-               goto unlock;
-       }
-
-       while (sd) {
-               int load_idx = sd->forkexec_idx;
-               struct sched_group *group;
-               int weight;
-
-               if (!(sd->flags & sd_flag)) {
-                       sd = sd->child;
-                       continue;
-               }
-
-               if (sd_flag & SD_BALANCE_WAKE)
-                       load_idx = sd->wake_idx;
-
-               group = find_idlest_group(sd, p, cpu, load_idx);
-               if (!group) {
-                       sd = sd->child;
-                       continue;
-               }
-
-               new_cpu = find_idlest_cpu(group, p, cpu);
-               if (new_cpu == -1 || new_cpu == cpu) {
-                       /* Now try balancing at a lower domain level of cpu */
-                       sd = sd->child;
-                       continue;
-               }
-
-               /* Now try balancing at a lower domain level of new_cpu */
-               cpu = new_cpu;
-               weight = sd->span_weight;
-               sd = NULL;
-               for_each_domain(cpu, tmp) {
-                       if (weight <= tmp->span_weight)
-                               break;
-                       if (tmp->flags & sd_flag)
-                               sd = tmp;
-               }
-               /* while loop will break here if sd == NULL */
-       }
-unlock:
-       rcu_read_unlock();
-
-       return new_cpu;
-}
-#endif /* CONFIG_SMP */
-
-static unsigned long
-wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
-{
-       unsigned long gran = sysctl_sched_wakeup_granularity;
-
-       /*
-        * Since it's curr that is running now, convert the gran from
-        * real-time to virtual-time in its units.
-        *
-        * By using 'se' instead of 'curr' we penalize light tasks, so
-        * they get preempted more easily. That is, if 'se' < 'curr' then
-        * the resulting gran will be larger, therefore penalizing the
-        * lighter task; if, on the other hand, 'se' > 'curr' then the
-        * resulting gran will be smaller, again penalizing the lighter task.
-        *
-        * This is especially important for buddies when the leftmost
-        * task is higher priority than the buddy.
-        */
-       return calc_delta_fair(gran, se);
-}
-
-/*
- * Should 'se' preempt 'curr'.
- *
- *             |s1
- *        |s2
- *   |s3
- *         g
- *      |<--->|c
- *
- *  w(c, s1) = -1
- *  w(c, s2) =  0
- *  w(c, s3) =  1
- *
- */
-static int
-wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
-{
-       s64 gran, vdiff = curr->vruntime - se->vruntime;
-
-       if (vdiff <= 0)
-               return -1;
-
-       gran = wakeup_gran(curr, se);
-       if (vdiff > gran)
-               return 1;
-
-       return 0;
-}
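
wakeup_preempt_entity() compares curr's vruntime lead over the wakee with the wakeup granularity already converted to virtual time; -1 lets curr keep running, 1 requests preemption, and 0 is the in-between band. A sketch with the converted granularity assumed to be 1ms:

#include <stdio.h>

static int should_preempt(long long curr_vruntime, long long se_vruntime,
                          long long gran)
{
        long long vdiff = curr_vruntime - se_vruntime;

        if (vdiff <= 0)
                return -1;      /* curr's vruntime is not ahead of the wakee's: keep running */
        if (vdiff > gran)
                return 1;       /* wakee is far enough behind: preempt */
        return 0;
}

int main(void)
{
        long long gran = 1000000;       /* 1ms in ns */

        printf("%d\n", should_preempt(1500000, 0, gran));       /*  1 */
        printf("%d\n", should_preempt(500000, 0, gran));        /*  0 */
        printf("%d\n", should_preempt(0, 200000, gran));        /* -1 */
        return 0;
}
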
-
-static void set_last_buddy(struct sched_entity *se)
-{
-       if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
-               return;
-
-       for_each_sched_entity(se)
-               cfs_rq_of(se)->last = se;
-}
-
-static void set_next_buddy(struct sched_entity *se)
-{
-       if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
-               return;
-
-       for_each_sched_entity(se)
-               cfs_rq_of(se)->next = se;
-}
-
-static void set_skip_buddy(struct sched_entity *se)
-{
-       for_each_sched_entity(se)
-               cfs_rq_of(se)->skip = se;
-}
-
-/*
- * Preempt the current task with a newly woken task if needed:
- */
-static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
-{
-       struct task_struct *curr = rq->curr;
-       struct sched_entity *se = &curr->se, *pse = &p->se;
-       struct cfs_rq *cfs_rq = task_cfs_rq(curr);
-       int scale = cfs_rq->nr_running >= sched_nr_latency;
-       int next_buddy_marked = 0;
-
-       if (unlikely(se == pse))
-               return;
-
-       /*
-        * This is possible from callers such as pull_task(), in which we
-        * unconditionally check_preempt_curr() after an enqueue (which may have
-        * led to a throttle).  This both saves work and prevents false
-        * next-buddy nomination below.
-        */
-       if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
-               return;
-
-       if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
-               set_next_buddy(pse);
-               next_buddy_marked = 1;
-       }
-
-       /*
-        * We can come here with TIF_NEED_RESCHED already set from new task
-        * wake up path.
-        *
-        * Note: this also catches the edge-case of curr being in a throttled
-        * group (e.g. via set_curr_task), since update_curr() (in the
-        * enqueue of curr) will have resulted in resched being set.  This
-        * prevents us from potentially nominating it as a false LAST_BUDDY
-        * below.
-        */
-       if (test_tsk_need_resched(curr))
-               return;
-
-       /* Idle tasks are by definition preempted by non-idle tasks. */
-       if (unlikely(curr->policy == SCHED_IDLE) &&
-           likely(p->policy != SCHED_IDLE))
-               goto preempt;
-
-       /*
-        * Batch and idle tasks do not preempt non-idle tasks (their preemption
-        * is driven by the tick):
-        */
-       if (unlikely(p->policy != SCHED_NORMAL))
-               return;
-
-       find_matching_se(&se, &pse);
-       update_curr(cfs_rq_of(se));
-       BUG_ON(!pse);
-       if (wakeup_preempt_entity(se, pse) == 1) {
-               /*
-                * Bias pick_next to pick the sched entity that is
-                * triggering this preemption.
-                */
-               if (!next_buddy_marked)
-                       set_next_buddy(pse);
-               goto preempt;
-       }
-
-       return;
-
-preempt:
-       resched_task(curr);
-       /*
-        * Only set the backward buddy when the current task is still
-        * on the rq. This can happen when a wakeup gets interleaved
-        * with schedule on the ->pre_schedule() or idle_balance()
-        * point, either of which can drop the rq lock.
-        *
-        * Also, during early boot the idle thread is in the fair class,
-        * for obvious reasons it's a bad idea to schedule back to it.
-        */
-       if (unlikely(!se->on_rq || curr == rq->idle))
-               return;
-
-       if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
-               set_last_buddy(se);
-}
-
-static struct task_struct *pick_next_task_fair(struct rq *rq)
-{
-       struct task_struct *p;
-       struct cfs_rq *cfs_rq = &rq->cfs;
-       struct sched_entity *se;
-
-       if (!cfs_rq->nr_running)
-               return NULL;
-
-       do {
-               se = pick_next_entity(cfs_rq);
-               set_next_entity(cfs_rq, se);
-               cfs_rq = group_cfs_rq(se);
-       } while (cfs_rq);
-
-       p = task_of(se);
-       hrtick_start_fair(rq, p);
-
-       return p;
-}
-
-/*
- * Account for a descheduled task:
- */
-static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
-{
-       struct sched_entity *se = &prev->se;
-       struct cfs_rq *cfs_rq;
-
-       for_each_sched_entity(se) {
-               cfs_rq = cfs_rq_of(se);
-               put_prev_entity(cfs_rq, se);
-       }
-}
-
-/*
- * sched_yield() is very simple
- *
- * The magic of dealing with the ->skip buddy is in pick_next_entity.
- */
-static void yield_task_fair(struct rq *rq)
-{
-       struct task_struct *curr = rq->curr;
-       struct cfs_rq *cfs_rq = task_cfs_rq(curr);
-       struct sched_entity *se = &curr->se;
-
-       /*
-        * Are we the only task in the tree?
-        */
-       if (unlikely(rq->nr_running == 1))
-               return;
-
-       clear_buddies(cfs_rq, se);
-
-       if (curr->policy != SCHED_BATCH) {
-               update_rq_clock(rq);
-               /*
-                * Update run-time statistics of the 'current'.
-                */
-               update_curr(cfs_rq);
-       }
-
-       set_skip_buddy(se);
-}
-
-static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
-{
-       struct sched_entity *se = &p->se;
-
-       /* throttled hierarchies are not runnable */
-       if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
-               return false;
-
-       /* Tell the scheduler that we'd really like pse to run next. */
-       set_next_buddy(se);
-
-       yield_task_fair(rq);
-
-       return true;
-}
-
-#ifdef CONFIG_SMP
-/**************************************************
- * Fair scheduling class load-balancing methods:
- */
-
-/*
- * pull_task - move a task from a remote runqueue to the local runqueue.
- * Both runqueues must be locked.
- */
-static void pull_task(struct rq *src_rq, struct task_struct *p,
-                     struct rq *this_rq, int this_cpu)
-{
-       deactivate_task(src_rq, p, 0);
-       set_task_cpu(p, this_cpu);
-       activate_task(this_rq, p, 0);
-       check_preempt_curr(this_rq, p, 0);
-}
-
-/*
- * can_migrate_task - may task p, currently on runqueue rq, be migrated to this_cpu?
- */
-static
-int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
-                    struct sched_domain *sd, enum cpu_idle_type idle,
-                    int *all_pinned)
-{
-       int tsk_cache_hot = 0;
-       /*
-        * We do not migrate tasks that are:
-        * 1) running (obviously), or
-        * 2) cannot be migrated to this CPU due to cpus_allowed, or
-        * 3) are cache-hot on their current CPU.
-        */
-       if (!cpumask_test_cpu(this_cpu, tsk_cpus_allowed(p))) {
-               schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
-               return 0;
-       }
-       *all_pinned = 0;
-
-       if (task_running(rq, p)) {
-               schedstat_inc(p, se.statistics.nr_failed_migrations_running);
-               return 0;
-       }
-
-       /*
-        * Aggressive migration if:
-        * 1) task is cache cold, or
-        * 2) too many balance attempts have failed.
-        */
-
-       tsk_cache_hot = task_hot(p, rq->clock_task, sd);
-       if (!tsk_cache_hot ||
-               sd->nr_balance_failed > sd->cache_nice_tries) {
-#ifdef CONFIG_SCHEDSTATS
-               if (tsk_cache_hot) {
-                       schedstat_inc(sd, lb_hot_gained[idle]);
-                       schedstat_inc(p, se.statistics.nr_forced_migrations);
-               }
-#endif
-               return 1;
-       }
-
-       if (tsk_cache_hot) {
-               schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
-               return 0;
-       }
-       return 1;
-}
-
-/*
- * move_one_task tries to move exactly one task from busiest to this_rq, as
- * part of active balancing operations within "domain".
- * Returns 1 if successful and 0 otherwise.
- *
- * Called with both runqueues locked.
- */
-static int
-move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
-             struct sched_domain *sd, enum cpu_idle_type idle)
-{
-       struct task_struct *p, *n;
-       struct cfs_rq *cfs_rq;
-       int pinned = 0;
-
-       for_each_leaf_cfs_rq(busiest, cfs_rq) {
-               list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
-                       if (throttled_lb_pair(task_group(p),
-                                             busiest->cpu, this_cpu))
-                               break;
-
-                       if (!can_migrate_task(p, busiest, this_cpu,
-                                               sd, idle, &pinned))
-                               continue;
-
-                       pull_task(busiest, p, this_rq, this_cpu);
-                       /*
-                        * Right now, this is only the second place pull_task()
-                        * is called, so we can safely collect pull_task()
-                        * stats here rather than inside pull_task().
-                        */
-                       schedstat_inc(sd, lb_gained[idle]);
-                       return 1;
-               }
-       }
-
-       return 0;
-}
-
-static unsigned long
-balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
-             unsigned long max_load_move, struct sched_domain *sd,
-             enum cpu_idle_type idle, int *all_pinned,
-             struct cfs_rq *busiest_cfs_rq)
-{
-       int loops = 0, pulled = 0;
-       long rem_load_move = max_load_move;
-       struct task_struct *p, *n;
-
-       if (max_load_move == 0)
-               goto out;
-
-       list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
-               if (loops++ > sysctl_sched_nr_migrate)
-                       break;
-
-               if ((p->se.load.weight >> 1) > rem_load_move ||
-                   !can_migrate_task(p, busiest, this_cpu, sd, idle,
-                                     all_pinned))
-                       continue;
-
-               pull_task(busiest, p, this_rq, this_cpu);
-               pulled++;
-               rem_load_move -= p->se.load.weight;
-
-#ifdef CONFIG_PREEMPT
-               /*
-                * NEWIDLE balancing is a source of latency, so preemptible
-                * kernels will stop after the first task is pulled to minimize
-                * the critical section.
-                */
-               if (idle == CPU_NEWLY_IDLE)
-                       break;
-#endif
-
-               /*
-                * We only want to steal up to the prescribed amount of
-                * weighted load.
-                */
-               if (rem_load_move <= 0)
-                       break;
-       }
-out:
-       /*
-        * Right now, this is one of only two places pull_task() is called,
-        * so we can safely collect pull_task() stats here rather than
-        * inside pull_task().
-        */
-       schedstat_add(sd, lb_gained[idle], pulled);
-
-       return max_load_move - rem_load_move;
-}
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-/*
- * update tg->load_weight by folding this cpu's load_avg
- */
-static int update_shares_cpu(struct task_group *tg, int cpu)
-{
-       struct cfs_rq *cfs_rq;
-       unsigned long flags;
-       struct rq *rq;
-
-       if (!tg->se[cpu])
-               return 0;
-
-       rq = cpu_rq(cpu);
-       cfs_rq = tg->cfs_rq[cpu];
-
-       raw_spin_lock_irqsave(&rq->lock, flags);
-
-       update_rq_clock(rq);
-       update_cfs_load(cfs_rq, 1);
-
-       /*
-        * We need to update shares after updating tg->load_weight in
-        * order to adjust the weight of groups with long running tasks.
-        */
-       update_cfs_shares(cfs_rq);
-
-       raw_spin_unlock_irqrestore(&rq->lock, flags);
-
-       return 0;
-}
-
-static void update_shares(int cpu)
-{
-       struct cfs_rq *cfs_rq;
-       struct rq *rq = cpu_rq(cpu);
-
-       rcu_read_lock();
-       /*
-        * Iterates the task_group tree in a bottom up fashion, see
-        * list_add_leaf_cfs_rq() for details.
-        */
-       for_each_leaf_cfs_rq(rq, cfs_rq) {
-               /* throttled entities do not contribute to load */
-               if (throttled_hierarchy(cfs_rq))
-                       continue;
-
-               update_shares_cpu(cfs_rq->tg, cpu);
-       }
-       rcu_read_unlock();
-}
-
-/*
- * Compute the cpu's hierarchical load factor for each task group.
- * This needs to be done in a top-down fashion because the load of a child
- * group is a fraction of its parent's load.
- */
-static int tg_load_down(struct task_group *tg, void *data)
-{
-       unsigned long load;
-       long cpu = (long)data;
-
-       if (!tg->parent) {
-               load = cpu_rq(cpu)->load.weight;
-       } else {
-               load = tg->parent->cfs_rq[cpu]->h_load;
-               load *= tg->se[cpu]->load.weight;
-               load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
-       }
-
-       tg->cfs_rq[cpu]->h_load = load;
-
-       return 0;
-}
-
-static void update_h_load(long cpu)
-{
-       walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
-}
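
tg_load_down() walks the hierarchy top-down: each level scales the parent's h_load by this group's share of the parent's cfs_rq weight, with a +1 in the divisor to avoid dividing by zero. A sketch with made-up weights for a two-level hierarchy:

#include <stdio.h>

static unsigned long child_h_load(unsigned long parent_h_load,
                                  unsigned long se_weight,
                                  unsigned long parent_cfs_weight)
{
        return parent_h_load * se_weight / (parent_cfs_weight + 1);
}

int main(void)
{
        unsigned long root = 3072;                              /* cpu_rq(cpu)->load.weight */
        unsigned long a = child_h_load(root, 1024, 3072);       /* group A: ~1023 */
        unsigned long b = child_h_load(a, 512, 2048);           /* group B inside A: ~255 */

        printf("A=%lu B=%lu\n", a, b);
        return 0;
}
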
-
-static unsigned long
-load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
-                 unsigned long max_load_move,
-                 struct sched_domain *sd, enum cpu_idle_type idle,
-                 int *all_pinned)
-{
-       long rem_load_move = max_load_move;
-       struct cfs_rq *busiest_cfs_rq;
-
-       rcu_read_lock();
-       update_h_load(cpu_of(busiest));
-
-       for_each_leaf_cfs_rq(busiest, busiest_cfs_rq) {
-               unsigned long busiest_h_load = busiest_cfs_rq->h_load;
-               unsigned long busiest_weight = busiest_cfs_rq->load.weight;
-               u64 rem_load, moved_load;
-
-               /*
-                * empty group or part of a throttled hierarchy
-                */
-               if (!busiest_cfs_rq->task_weight ||
-                   throttled_lb_pair(busiest_cfs_rq->tg, cpu_of(busiest), this_cpu))
-                       continue;
-
-               rem_load = (u64)rem_load_move * busiest_weight;
-               rem_load = div_u64(rem_load, busiest_h_load + 1);
-
-               moved_load = balance_tasks(this_rq, this_cpu, busiest,
-                               rem_load, sd, idle, all_pinned,
-                               busiest_cfs_rq);
-
-               if (!moved_load)
-                       continue;
-
-               moved_load *= busiest_h_load;
-               moved_load = div_u64(moved_load, busiest_weight + 1);
-
-               rem_load_move -= moved_load;
-               if (rem_load_move < 0)
-                       break;
-       }
-       rcu_read_unlock();
-
-       return max_load_move - rem_load_move;
-}
-#else
-static inline void update_shares(int cpu)
-{
-}
-
-static unsigned long
-load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
-                 unsigned long max_load_move,
-                 struct sched_domain *sd, enum cpu_idle_type idle,
-                 int *all_pinned)
-{
-       return balance_tasks(this_rq, this_cpu, busiest,
-                       max_load_move, sd, idle, all_pinned,
-                       &busiest->cfs);
-}
-#endif
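
In the group-scheduling variant above, load_balance_fair() converts the remaining hierarchical load into each busiest cfs_rq's own weight units before calling balance_tasks(), then converts whatever was actually moved back; the !CONFIG_FAIR_GROUP_SCHED fallback just passes the raw value through. A sketch of that round trip with made-up weights:

#include <stdio.h>

int main(void)
{
        unsigned long long rem_load_move = 1024;        /* hierarchical load still to move */
        unsigned long busiest_weight = 2048;            /* busiest_cfs_rq->load.weight */
        unsigned long busiest_h_load = 512;             /* busiest_cfs_rq->h_load */

        /* translate into the cfs_rq's own weight units for balance_tasks() */
        unsigned long long rem_load =
                rem_load_move * busiest_weight / (busiest_h_load + 1);

        /* pretend balance_tasks() pulled a single task of weight 1024 */
        unsigned long long moved_load = 1024;

        /* translate the moved weight back into hierarchical units */
        moved_load = moved_load * busiest_h_load / (busiest_weight + 1);
        rem_load_move -= moved_load;

        printf("rem_load=%llu moved=%llu left=%llu\n",
               rem_load, moved_load, rem_load_move);    /* 4088 255 769 */
        return 0;
}
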
-
-/*
- * move_tasks tries to move up to max_load_move weighted load from busiest to
- * this_rq, as part of a balancing operation within domain "sd".
- * Returns 1 if successful and 0 otherwise.
- *
- * Called with both runqueues locked.
- */
-static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
-                     unsigned long max_load_move,
-                     struct sched_domain *sd, enum cpu_idle_type idle,
-                     int *all_pinned)
-{
-       unsigned long total_load_moved = 0, load_moved;
-
-       do {
-               load_moved = load_balance_fair(this_rq, this_cpu, busiest,
-                               max_load_move - total_load_moved,
-                               sd, idle, all_pinned);
-
-               total_load_moved += load_moved;
-
-#ifdef CONFIG_PREEMPT
-               /*
-                * NEWIDLE balancing is a source of latency, so preemptible
-                * kernels will stop after the first task is pulled to minimize
-                * the critical section.
-                */
-               if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
-                       break;
-
-               if (raw_spin_is_contended(&this_rq->lock) ||
-                               raw_spin_is_contended(&busiest->lock))
-                       break;
-#endif
-       } while (load_moved && max_load_move > total_load_moved);
-
-       return total_load_moved > 0;
-}
-
-/********** Helpers for find_busiest_group ************************/
-/*
- * sd_lb_stats - Structure to store the statistics of a sched_domain
- *             during load balancing.
- */
-struct sd_lb_stats {
-       struct sched_group *busiest; /* Busiest group in this sd */
-       struct sched_group *this;  /* Local group in this sd */
-       unsigned long total_load;  /* Total load of all groups in sd */
-       unsigned long total_pwr;   /*   Total power of all groups in sd */
-       unsigned long avg_load;    /* Average load across all groups in sd */
-
-       /** Statistics of this group */
-       unsigned long this_load;
-       unsigned long this_load_per_task;
-       unsigned long this_nr_running;
-       unsigned long this_has_capacity;
-       unsigned int  this_idle_cpus;
-
-       /* Statistics of the busiest group */
-       unsigned int  busiest_idle_cpus;
-       unsigned long max_load;
-       unsigned long busiest_load_per_task;
-       unsigned long busiest_nr_running;
-       unsigned long busiest_group_capacity;
-       unsigned long busiest_has_capacity;
-       unsigned int  busiest_group_weight;
-
-       int group_imb; /* Is there imbalance in this sd */
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-       int power_savings_balance; /* Is powersave balance needed for this sd */
-       struct sched_group *group_min; /* Least loaded group in sd */
-       struct sched_group *group_leader; /* Group which relieves group_min */
-       unsigned long min_load_per_task; /* load_per_task in group_min */
-       unsigned long leader_nr_running; /* Nr running of group_leader */
-       unsigned long min_nr_running; /* Nr running of group_min */
-#endif
-};
-
-/*
- * sg_lb_stats - stats of a sched_group required for load_balancing
- */
-struct sg_lb_stats {
-       unsigned long avg_load; /* Avg load across the CPUs of the group */
-       unsigned long group_load; /* Total load over the CPUs of the group */
-       unsigned long sum_nr_running; /* Nr tasks running in the group */
-       unsigned long sum_weighted_load; /* Weighted load of group's tasks */
-       unsigned long group_capacity;
-       unsigned long idle_cpus;
-       unsigned long group_weight;
-       int group_imb; /* Is there an imbalance in the group ? */
-       int group_has_capacity; /* Is there extra capacity in the group? */
-};
-
-/**
- * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
- * @group: The group whose first cpu is to be returned.
- */
-static inline unsigned int group_first_cpu(struct sched_group *group)
-{
-       return cpumask_first(sched_group_cpus(group));
-}
-
-/**
- * get_sd_load_idx - Obtain the load index for a given sched domain.
- * @sd: The sched_domain whose load_idx is to be obtained.
- * @idle: The idle status of the CPU for whose sd the load_idx is obtained.
- */
-static inline int get_sd_load_idx(struct sched_domain *sd,
-                                       enum cpu_idle_type idle)
-{
-       int load_idx;
-
-       switch (idle) {
-       case CPU_NOT_IDLE:
-               load_idx = sd->busy_idx;
-               break;
-
-       case CPU_NEWLY_IDLE:
-               load_idx = sd->newidle_idx;
-               break;
-       default:
-               load_idx = sd->idle_idx;
-               break;
-       }
-
-       return load_idx;
-}
-
-
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-/**
- * init_sd_power_savings_stats - Initialize power savings statistics for
- * the given sched_domain, during load balancing.
- *
- * @sd: Sched domain whose power-savings statistics are to be initialized.
- * @sds: Variable containing the statistics for sd.
- * @idle: Idle status of the CPU at which we're performing load-balancing.
- */
-static inline void init_sd_power_savings_stats(struct sched_domain *sd,
-       struct sd_lb_stats *sds, enum cpu_idle_type idle)
-{
-       /*
-        * Busy processors will not participate in power savings
-        * balance.
-        */
-       if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
-               sds->power_savings_balance = 0;
-       else {
-               sds->power_savings_balance = 1;
-               sds->min_nr_running = ULONG_MAX;
-               sds->leader_nr_running = 0;
-       }
-}
-
-/**
- * update_sd_power_savings_stats - Update the power saving stats for a
- * sched_domain while performing load balancing.
- *
- * @group: sched_group belonging to the sched_domain under consideration.
- * @sds: Variable containing the statistics of the sched_domain
- * @local_group: Does group contain the CPU for which we're performing
- *             load balancing ?
- * @sgs: Variable containing the statistics of the group.
- */
-static inline void update_sd_power_savings_stats(struct sched_group *group,
-       struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
-{
-
-       if (!sds->power_savings_balance)
-               return;
-
-       /*
-        * If the local group is idle or completely loaded
-        * no need to do power savings balance at this domain
-        */
-       if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
-                               !sds->this_nr_running))
-               sds->power_savings_balance = 0;
-
-       /*
-        * If a group is already running at full capacity or idle,
-        * don't include that group in power savings calculations
-        */
-       if (!sds->power_savings_balance ||
-               sgs->sum_nr_running >= sgs->group_capacity ||
-               !sgs->sum_nr_running)
-               return;
-
-       /*
-        * Calculate the group which has the least non-idle load.
-        * This is the group from where we need to pick up the load
-        * for saving power
-        */
-       if ((sgs->sum_nr_running < sds->min_nr_running) ||
-           (sgs->sum_nr_running == sds->min_nr_running &&
-            group_first_cpu(group) > group_first_cpu(sds->group_min))) {
-               sds->group_min = group;
-               sds->min_nr_running = sgs->sum_nr_running;
-               sds->min_load_per_task = sgs->sum_weighted_load /
-                                               sgs->sum_nr_running;
-       }
-
-       /*
-        * Calculate the group which is nearly at its
-        * capacity but still has some space to pick up some load
-        * from other group and save more power
-        */
-       if (sgs->sum_nr_running + 1 > sgs->group_capacity)
-               return;
-
-       if (sgs->sum_nr_running > sds->leader_nr_running ||
-           (sgs->sum_nr_running == sds->leader_nr_running &&
-            group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
-               sds->group_leader = group;
-               sds->leader_nr_running = sgs->sum_nr_running;
-       }
-}
-
-/**
- * check_power_save_busiest_group - see if there is potential for some power-savings balance
- * @sds: Variable containing the statistics of the sched_domain
- *     under consideration.
- * @this_cpu: Cpu at which we're currently performing load-balancing.
- * @imbalance: Variable to store the imbalance.
- *
- * Description:
- * Check if we have potential to perform some power-savings balance.
- * If yes, set the busiest group to be the least loaded group in the
- * sched_domain, so that its CPUs can be put to idle.
- *
- * Returns 1 if there is potential to perform power-savings balance.
- * Else returns 0.
- */
-static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
-                                       int this_cpu, unsigned long *imbalance)
-{
-       if (!sds->power_savings_balance)
-               return 0;
-
-       if (sds->this != sds->group_leader ||
-                       sds->group_leader == sds->group_min)
-               return 0;
-
-       *imbalance = sds->min_load_per_task;
-       sds->busiest = sds->group_min;
-
-       return 1;
-
-}
-#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
-static inline void init_sd_power_savings_stats(struct sched_domain *sd,
-       struct sd_lb_stats *sds, enum cpu_idle_type idle)
-{
-       return;
-}
-
-static inline void update_sd_power_savings_stats(struct sched_group *group,
-       struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
-{
-       return;
-}
-
-static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
-                                       int this_cpu, unsigned long *imbalance)
-{
-       return 0;
-}
-#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
-
-
-unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
-{
-       return SCHED_POWER_SCALE;
-}
-
-unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
-{
-       return default_scale_freq_power(sd, cpu);
-}
-
-unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
-{
-       unsigned long weight = sd->span_weight;
-       unsigned long smt_gain = sd->smt_gain;
-
-       smt_gain /= weight;
-
-       return smt_gain;
-}
-
-unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
-{
-       return default_scale_smt_power(sd, cpu);
-}
-
-unsigned long scale_rt_power(int cpu)
-{
-       struct rq *rq = cpu_rq(cpu);
-       u64 total, available;
-
-       total = sched_avg_period() + (rq->clock - rq->age_stamp);
-
-       if (unlikely(total < rq->rt_avg)) {
-               /* Ensures that power won't end up being negative */
-               available = 0;
-       } else {
-               available = total - rq->rt_avg;
-       }
-
-       if (unlikely((s64)total < SCHED_POWER_SCALE))
-               total = SCHED_POWER_SCALE;
-
-       total >>= SCHED_POWER_SHIFT;
-
-       return div_u64(available, total);
-}
-
-static void update_cpu_power(struct sched_domain *sd, int cpu)
-{
-       unsigned long weight = sd->span_weight;
-       unsigned long power = SCHED_POWER_SCALE;
-       struct sched_group *sdg = sd->groups;
-
-       if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
-               if (sched_feat(ARCH_POWER))
-                       power *= arch_scale_smt_power(sd, cpu);
-               else
-                       power *= default_scale_smt_power(sd, cpu);
-
-               power >>= SCHED_POWER_SHIFT;
-       }
-
-       sdg->sgp->power_orig = power;
-
-       if (sched_feat(ARCH_POWER))
-               power *= arch_scale_freq_power(sd, cpu);
-       else
-               power *= default_scale_freq_power(sd, cpu);
-
-       power >>= SCHED_POWER_SHIFT;
-
-       power *= scale_rt_power(cpu);
-       power >>= SCHED_POWER_SHIFT;
-
-       if (!power)
-               power = 1;
-
-       cpu_rq(cpu)->cpu_power = power;
-       sdg->sgp->power = power;
-}
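
update_cpu_power() starts from SCHED_POWER_SCALE and scales it in turn by the SMT, frequency and RT factors, shifting right by SCHED_POWER_SHIFT after each step. A user-space sketch of that pipeline, assuming an SMT pair with smt_gain 1178, the default (no-op) frequency scaling, and RT/irq work eating 25% of the cpu:

#include <stdio.h>

#define SCHED_POWER_SCALE 1024UL        /* assumed */
#define SCHED_POWER_SHIFT 10

int main(void)
{
        unsigned long power = SCHED_POWER_SCALE;
        unsigned long smt_power = 1178 / 2;             /* smt_gain assumed 1178, 2 threads */
        unsigned long freq_power = SCHED_POWER_SCALE;   /* default_scale_freq_power() */
        unsigned long rt_power = 768;                   /* scale_rt_power(): 75% left for CFS */

        power = (power * smt_power) >> SCHED_POWER_SHIFT;
        power = (power * freq_power) >> SCHED_POWER_SHIFT;
        power = (power * rt_power) >> SCHED_POWER_SHIFT;
        if (!power)
                power = 1;

        printf("cpu_power = %lu\n", power);
        return 0;
}

The SMT step leaves 589, and folding in the RT share brings cpu_power down to 441.
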
-
-static void update_group_power(struct sched_domain *sd, int cpu)
-{
-       struct sched_domain *child = sd->child;
-       struct sched_group *group, *sdg = sd->groups;
-       unsigned long power;
-
-       if (!child) {
-               update_cpu_power(sd, cpu);
-               return;
-       }
-
-       power = 0;
-
-       group = child->groups;
-       do {
-               power += group->sgp->power;
-               group = group->next;
-       } while (group != child->groups);
-
-       sdg->sgp->power = power;
-}
-
-/*
- * Try and fix up capacity for tiny siblings, this is needed when
- * things like SD_ASYM_PACKING need f_b_g to select another sibling
- * which on its own isn't powerful enough.
- *
- * See update_sd_pick_busiest() and check_asym_packing().
- */
-static inline int
-fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
-{
-       /*
-        * Only siblings can have significantly less than SCHED_POWER_SCALE
-        */
-       if (!(sd->flags & SD_SHARE_CPUPOWER))
-               return 0;
-
-       /*
-        * If ~90% of the cpu_power is still there, we're good.
-        */
-       if (group->sgp->power * 32 > group->sgp->power_orig * 29)
-               return 1;
-
-       return 0;
-}
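
The ~90% test above compares power against power_orig using the ratio 29/32. A tiny sketch with made-up power values:

#include <stdio.h>

static int keep_capacity(unsigned long power, unsigned long power_orig)
{
        return power * 32 > power_orig * 29;
}

int main(void)
{
        printf("%d\n", keep_capacity(540, 589));        /* 1: still mostly intact */
        printf("%d\n", keep_capacity(450, 589));        /* 0: too much power gone */
        return 0;
}
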
-
-/**
- * update_sg_lb_stats - Update sched_group's statistics for load balancing.
- * @sd: The sched_domain whose statistics are to be updated.
- * @group: sched_group whose statistics are to be updated.
- * @this_cpu: Cpu for which load balance is currently performed.
- * @idle: Idle status of this_cpu
- * @load_idx: Load index of sched_domain of this_cpu for load calc.
- * @local_group: Does group contain this_cpu.
- * @cpus: Set of cpus considered for load balancing.
- * @balance: Should we balance.
- * @sgs: variable to hold the statistics for this group.
- */
-static inline void update_sg_lb_stats(struct sched_domain *sd,
-                       struct sched_group *group, int this_cpu,
-                       enum cpu_idle_type idle, int load_idx,
-                       int local_group, const struct cpumask *cpus,
-                       int *balance, struct sg_lb_stats *sgs)
-{
-       unsigned long load, max_cpu_load, min_cpu_load, max_nr_running;
-       int i;
-       unsigned int balance_cpu = -1, first_idle_cpu = 0;
-       unsigned long avg_load_per_task = 0;
-
-       if (local_group)
-               balance_cpu = group_first_cpu(group);
-
-       /* Tally up the load of all CPUs in the group */
-       max_cpu_load = 0;
-       min_cpu_load = ~0UL;
-       max_nr_running = 0;
-
-       for_each_cpu_and(i, sched_group_cpus(group), cpus) {
-               struct rq *rq = cpu_rq(i);
-
-               /* Bias balancing toward cpus of our domain */
-               if (local_group) {
-                       if (idle_cpu(i) && !first_idle_cpu) {
-                               first_idle_cpu = 1;
-                               balance_cpu = i;
-                       }
-
-                       load = target_load(i, load_idx);
-               } else {
-                       load = source_load(i, load_idx);
-                       if (load > max_cpu_load) {
-                               max_cpu_load = load;
-                               max_nr_running = rq->nr_running;
-                       }
-                       if (min_cpu_load > load)
-                               min_cpu_load = load;
-               }
-
-               sgs->group_load += load;
-               sgs->sum_nr_running += rq->nr_running;
-               sgs->sum_weighted_load += weighted_cpuload(i);
-               if (idle_cpu(i))
-                       sgs->idle_cpus++;
-       }
-
-       /*
-        * The first idle cpu or the first cpu (busiest) in this sched group
-        * is eligible for doing load balancing at this and above
-        * domains. In the newly idle case, we will allow all the cpus
-        * to do the newly idle load balance.
-        */
-       if (idle != CPU_NEWLY_IDLE && local_group) {
-               if (balance_cpu != this_cpu) {
-                       *balance = 0;
-                       return;
-               }
-               update_group_power(sd, this_cpu);
-       }
-
-       /* Adjust by relative CPU power of the group */
-       sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power;
-
-       /*
-        * Consider the group unbalanced when the imbalance is larger
-        * than the average weight of a task.
-        *
-        * APZ: with cgroup the avg task weight can vary wildly and
-        *      might not be a suitable number - should we keep a
-        *      normalized nr_running number somewhere that negates
-        *      the hierarchy?
-        */
-       if (sgs->sum_nr_running)
-               avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
-
-       if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1)
-               sgs->group_imb = 1;
-
-       sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
-                                               SCHED_POWER_SCALE);
-       if (!sgs->group_capacity)
-               sgs->group_capacity = fix_small_capacity(sd, group);
-       sgs->group_weight = group->group_weight;
-
-       if (sgs->group_capacity > sgs->sum_nr_running)
-               sgs->group_has_capacity = 1;
-}
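
The group_imb flag set above fires when the spread between the group's most and least loaded cpus is at least one average task and the loaded cpu runs more than one task. A sketch with two made-up weight-1024 tasks stacked on one cpu of a two-cpu group:

#include <stdio.h>

static int group_imb(unsigned long max_cpu_load, unsigned long min_cpu_load,
                     unsigned long sum_weighted_load, unsigned long sum_nr_running,
                     unsigned long max_nr_running)
{
        unsigned long avg_load_per_task = 0;

        if (sum_nr_running)
                avg_load_per_task = sum_weighted_load / sum_nr_running;

        return (max_cpu_load - min_cpu_load) >= avg_load_per_task &&
               max_nr_running > 1;
}

int main(void)
{
        /* two weight-1024 tasks stacked on one cpu, the sibling cpu idle */
        printf("group_imb = %d\n", group_imb(2048, 0, 2048, 2, 2));
        return 0;
}
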
-
-/**
- * update_sd_pick_busiest - return 1 on busiest group
- * @sd: sched_domain whose statistics are to be checked
- * @sds: sched_domain statistics
- * @sg: sched_group candidate to be checked for being the busiest
- * @sgs: sched_group statistics
- * @this_cpu: the current cpu
- *
- * Determine if @sg is a busier group than the previously selected
- * busiest group.
- */
-static bool update_sd_pick_busiest(struct sched_domain *sd,
-                                  struct sd_lb_stats *sds,
-                                  struct sched_group *sg,
-                                  struct sg_lb_stats *sgs,
-                                  int this_cpu)
-{
-       if (sgs->avg_load <= sds->max_load)
-               return false;
-
-       if (sgs->sum_nr_running > sgs->group_capacity)
-               return true;
-
-       if (sgs->group_imb)
-               return true;
-
-       /*
-        * ASYM_PACKING needs to move all the work to the lowest
-        * numbered CPUs in the group, therefore mark all groups
-        * higher than ourself as busy.
-        */
-       if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
-           this_cpu < group_first_cpu(sg)) {
-               if (!sds->busiest)
-                       return true;
-
-               if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
-                       return true;
-       }
-
-       return false;
-}
-
-/**
- * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
- * @sd: sched_domain whose statistics are to be updated.
- * @this_cpu: Cpu for which load balance is currently performed.
- * @idle: Idle status of this_cpu
- * @cpus: Set of cpus considered for load balancing.
- * @balance: Should we balance.
- * @sds: variable to hold the statistics for this sched_domain.
- */
-static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
-                       enum cpu_idle_type idle, const struct cpumask *cpus,
-                       int *balance, struct sd_lb_stats *sds)
-{
-       struct sched_domain *child = sd->child;
-       struct sched_group *sg = sd->groups;
-       struct sg_lb_stats sgs;
-       int load_idx, prefer_sibling = 0;
-
-       if (child && child->flags & SD_PREFER_SIBLING)
-               prefer_sibling = 1;
-
-       init_sd_power_savings_stats(sd, sds, idle);
-       load_idx = get_sd_load_idx(sd, idle);
-
-       do {
-               int local_group;
-
-               local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
-               memset(&sgs, 0, sizeof(sgs));
-               update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx,
-                               local_group, cpus, balance, &sgs);
-
-               if (local_group && !(*balance))
-                       return;
-
-               sds->total_load += sgs.group_load;
-               sds->total_pwr += sg->sgp->power;
-
-               /*
-                * In case the child domain prefers tasks go to siblings
-                * first, lower the sg capacity to one so that we'll try
-                * and move all the excess tasks away. We lower the capacity
-                * of a group only if the local group has the capacity to fit
-                * these excess tasks, i.e. nr_running < group_capacity. The
-                * extra check prevents the case where you always pull from the
-                * heaviest group when it is already under-utilized (possible
-                * when a large-weight task outweighs the other tasks on the system).
-                */
-               if (prefer_sibling && !local_group && sds->this_has_capacity)
-                       sgs.group_capacity = min(sgs.group_capacity, 1UL);
-
-               if (local_group) {
-                       sds->this_load = sgs.avg_load;
-                       sds->this = sg;
-                       sds->this_nr_running = sgs.sum_nr_running;
-                       sds->this_load_per_task = sgs.sum_weighted_load;
-                       sds->this_has_capacity = sgs.group_has_capacity;
-                       sds->this_idle_cpus = sgs.idle_cpus;
-               } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
-                       sds->max_load = sgs.avg_load;
-                       sds->busiest = sg;
-                       sds->busiest_nr_running = sgs.sum_nr_running;
-                       sds->busiest_idle_cpus = sgs.idle_cpus;
-                       sds->busiest_group_capacity = sgs.group_capacity;
-                       sds->busiest_load_per_task = sgs.sum_weighted_load;
-                       sds->busiest_has_capacity = sgs.group_has_capacity;
-                       sds->busiest_group_weight = sgs.group_weight;
-                       sds->group_imb = sgs.group_imb;
-               }
-
-               update_sd_power_savings_stats(sg, sds, local_group, &sgs);
-               sg = sg->next;
-       } while (sg != sd->groups);
-}
-
-int __weak arch_sd_sibling_asym_packing(void)
-{
-       return 0*SD_ASYM_PACKING;
-}
-
-/**
- * check_asym_packing - Check to see if the group is packed into the
- *                     sched domain.
- *
- * This is primarily intended to be used at the sibling level.  Some
- * cores like POWER7 prefer to use lower numbered SMT threads.  In the
- * case of POWER7, it can move to lower SMT modes only when higher
- * threads are idle.  When in lower SMT modes, the threads will
- * perform better since they share less core resources.  Hence when we
- * have idle threads, we want them to be the higher ones.
- *
- * This packing function is run on idle threads.  It checks to see if
- * the busiest CPU in this domain (core in the P7 case) has a higher
- * CPU number than the packing function is being run on.  Here we are
- * assuming a lower CPU number will be equivalent to a lower SMT thread
- * number.
- *
- * Returns 1 when packing is required and a task should be moved to
- * this CPU.  The amount of the imbalance is returned in *imbalance.
- *
- * @sd: The sched_domain whose packing is to be checked.
- * @sds: Statistics of the sched_domain which is to be packed
- * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
- * @imbalance: returns the amount of imbalance due to packing.
- */
-static int check_asym_packing(struct sched_domain *sd,
-                             struct sd_lb_stats *sds,
-                             int this_cpu, unsigned long *imbalance)
-{
-       int busiest_cpu;
-
-       if (!(sd->flags & SD_ASYM_PACKING))
-               return 0;
-
-       if (!sds->busiest)
-               return 0;
-
-       busiest_cpu = group_first_cpu(sds->busiest);
-       if (this_cpu > busiest_cpu)
-               return 0;
-
-       *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power,
-                                      SCHED_POWER_SCALE);
-       return 1;
-}
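
The imbalance returned above is simply the busiest group's scaled load converted back into weight units through its cpu power. A sketch with made-up numbers (SCHED_POWER_SCALE assumed to be 1024):

#include <stdio.h>

#define SCHED_POWER_SCALE 1024UL        /* assumed */
#define DIV_ROUND_CLOSEST(x, d) (((x) + (d) / 2) / (d))

int main(void)
{
        unsigned long max_load = 1536;          /* sds->max_load, in scaled units */
        unsigned long busiest_power = 589;      /* sds->busiest->sgp->power */

        printf("imbalance = %lu\n",
               DIV_ROUND_CLOSEST(max_load * busiest_power, SCHED_POWER_SCALE));
        return 0;
}
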
-
-/**
- * fix_small_imbalance - Calculate the minor imbalance that exists
- *                     amongst the groups of a sched_domain, during
- *                     load balancing.
- * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
- * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
- * @imbalance: Variable to store the imbalance.
- */
-static inline void fix_small_imbalance(struct sd_lb_stats *sds,
-                               int this_cpu, unsigned long *imbalance)
-{
-       unsigned long tmp, pwr_now = 0, pwr_move = 0;
-       unsigned int imbn = 2;
-       unsigned long scaled_busy_load_per_task;
-
-       if (sds->this_nr_running) {
-               sds->this_load_per_task /= sds->this_nr_running;
-               if (sds->busiest_load_per_task >
-                               sds->this_load_per_task)
-                       imbn = 1;
-       } else
-               sds->this_load_per_task =
-                       cpu_avg_load_per_task(this_cpu);
-
-       scaled_busy_load_per_task = sds->busiest_load_per_task
-                                        * SCHED_POWER_SCALE;
-       scaled_busy_load_per_task /= sds->busiest->sgp->power;
-
-       if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
-                       (scaled_busy_load_per_task * imbn)) {
-               *imbalance = sds->busiest_load_per_task;
-               return;
-       }
-
-       /*
-        * OK, we don't have enough imbalance to justify moving tasks,
-        * however we may be able to increase total CPU power used by
-        * moving them.
-        */
-
-       pwr_now += sds->busiest->sgp->power *
-                       min(sds->busiest_load_per_task, sds->max_load);
-       pwr_now += sds->this->sgp->power *
-                       min(sds->this_load_per_task, sds->this_load);
-       pwr_now /= SCHED_POWER_SCALE;
-
-       /* Amount of load we'd subtract */
-       tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
-               sds->busiest->sgp->power;
-       if (sds->max_load > tmp)
-               pwr_move += sds->busiest->sgp->power *
-                       min(sds->busiest_load_per_task, sds->max_load - tmp);
-
-       /* Amount of load we'd add */
-       if (sds->max_load * sds->busiest->sgp->power <
-               sds->busiest_load_per_task * SCHED_POWER_SCALE)
-               tmp = (sds->max_load * sds->busiest->sgp->power) /
-                       sds->this->sgp->power;
-       else
-               tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
-                       sds->this->sgp->power;
-       pwr_move += sds->this->sgp->power *
-                       min(sds->this_load_per_task, sds->this_load + tmp);
-       pwr_move /= SCHED_POWER_SCALE;
-
-       /* Move if we gain throughput */
-       if (pwr_move > pwr_now)
-               *imbalance = sds->busiest_load_per_task;
-}
-
-/**
- * calculate_imbalance - Calculate the amount of imbalance present within the
- *                      groups of a given sched_domain during load balance.
- * @sds: statistics of the sched_domain whose imbalance is to be calculated.
- * @this_cpu: Cpu for which currently load balance is being performed.
- * @imbalance: The variable to store the imbalance.
- */
-static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
-               unsigned long *imbalance)
-{
-       unsigned long max_pull, load_above_capacity = ~0UL;
-
-       sds->busiest_load_per_task /= sds->busiest_nr_running;
-       if (sds->group_imb) {
-               sds->busiest_load_per_task =
-                       min(sds->busiest_load_per_task, sds->avg_load);
-       }
-
-       /*
-        * In the presence of smp nice balancing, certain scenarios can have
-        * max load less than avg load (as we skip the groups at or below
-        * its cpu_power while calculating max_load).
-        */
-       if (sds->max_load < sds->avg_load) {
-               *imbalance = 0;
-               return fix_small_imbalance(sds, this_cpu, imbalance);
-       }
-
-       if (!sds->group_imb) {
-               /*
-                * Don't want to pull so many tasks that a group would go idle.
-                */
-               load_above_capacity = (sds->busiest_nr_running -
-                                               sds->busiest_group_capacity);
-
-               load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
-
-               load_above_capacity /= sds->busiest->sgp->power;
-       }
-
-       /*
-        * We're trying to get all the cpus to the average_load, so we don't
-        * want to push ourselves above the average load, nor do we wish to
-        * reduce the max loaded cpu below the average load. At the same time,
-        * we also don't want to reduce the group load below the group capacity
-        * (so that we can implement power-savings policies etc). Thus we look
-        * for the minimum possible imbalance.
-        * Be careful of negative numbers as they'll appear as very large values
-        * with unsigned longs.
-        */
-       max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
-
-       /* How much load to actually move to equalise the imbalance */
-       *imbalance = min(max_pull * sds->busiest->sgp->power,
-               (sds->avg_load - sds->this_load) * sds->this->sgp->power)
-                       / SCHED_POWER_SCALE;
-
-       /*
-        * If *imbalance is less than the average load per runnable task,
-        * there is no guarantee that any tasks will be moved, so we have
-        * a think about bumping its value to force at least one task to
-        * be moved.
-        */
-       if (*imbalance < sds->busiest_load_per_task)
-               return fix_small_imbalance(sds, this_cpu, imbalance);
-
-}
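
calculate_imbalance() caps the amount to move at both how far the busiest group sits above the domain average and how far the local group sits below it, and it never pulls a group under its capacity. A sketch with made-up group statistics, equal cpu power, and both scale constants assumed to be 1024:

#include <stdio.h>

#define SCHED_POWER_SCALE 1024UL        /* assumed */
#define SCHED_LOAD_SCALE  1024UL        /* assumed */

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

int main(void)
{
        /* made-up domain statistics, equal cpu power everywhere */
        unsigned long max_load = 1400, avg_load = 1100, this_load = 900;
        unsigned long busiest_power = 1024, this_power = 1024;
        unsigned long busiest_nr_running = 3, busiest_capacity = 1;

        unsigned long load_above_capacity =
                (busiest_nr_running - busiest_capacity) *
                SCHED_LOAD_SCALE * SCHED_POWER_SCALE / busiest_power;
        unsigned long max_pull = min_ul(max_load - avg_load, load_above_capacity);
        unsigned long imbalance = min_ul(max_pull * busiest_power,
                                         (avg_load - this_load) * this_power) /
                                  SCHED_POWER_SCALE;

        printf("imbalance = %lu\n", imbalance);         /* 200 on these inputs */
        return 0;
}
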
-
-/******* find_busiest_group() helpers end here *********************/
-
-/**
- * find_busiest_group - Returns the busiest group within the sched_domain
- * if there is an imbalance. If there isn't an imbalance, and
- * the user has opted for power-savings, it returns a group whose
- * CPUs can be put to idle by rebalancing those tasks elsewhere, if
- * such a group exists.
- *
- * Also calculates the amount of weighted load which should be moved
- * to restore balance.
- *
- * @sd: The sched_domain whose busiest group is to be returned.
- * @this_cpu: The cpu for which load balancing is currently being performed.
- * @imbalance: Variable which stores amount of weighted load which should
- *             be moved to restore balance/put a group to idle.
- * @idle: The idle status of this_cpu.
- * @cpus: The set of CPUs under consideration for load-balancing.
- * @balance: Pointer to a variable indicating if this_cpu
- *     is the appropriate cpu to perform load balancing at this level.
- *
- * Returns:    - the busiest group if imbalance exists.
- *             - If no imbalance and user has opted for power-savings balance,
- *                return the least loaded group whose CPUs can be
- *                put to idle by rebalancing its tasks onto our group.
- */
-static struct sched_group *
-find_busiest_group(struct sched_domain *sd, int this_cpu,
-                  unsigned long *imbalance, enum cpu_idle_type idle,
-                  const struct cpumask *cpus, int *balance)
-{
-       struct sd_lb_stats sds;
-
-       memset(&sds, 0, sizeof(sds));
-
-       /*
-        * Compute the various statistics relevant for load balancing at
-        * this level.
-        */
-       update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds);
-
-       /*
-        * this_cpu is not the appropriate cpu to perform load balancing at
-        * this level.
-        */
-       if (!(*balance))
-               goto ret;
-
-       if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) &&
-           check_asym_packing(sd, &sds, this_cpu, imbalance))
-               return sds.busiest;
-
-       /* There is no busy sibling group to pull tasks from */
-       if (!sds.busiest || sds.busiest_nr_running == 0)
-               goto out_balanced;
-
-       sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
-
-       /*
-        * If the busiest group is imbalanced, the checks below don't
-        * work because they assume all things are equal, which typically
-        * isn't true due to cpus_allowed constraints and the like.
-        */
-       if (sds.group_imb)
-               goto force_balance;
-
-       /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
-       if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
-                       !sds.busiest_has_capacity)
-               goto force_balance;
-
-       /*
-        * If the local group is more busy than the selected busiest group
-        * don't try and pull any tasks.
-        */
-       if (sds.this_load >= sds.max_load)
-               goto out_balanced;
-
-       /*
-        * Don't pull any tasks if this group is already above the domain
-        * average load.
-        */
-       if (sds.this_load >= sds.avg_load)
-               goto out_balanced;
-
-       if (idle == CPU_IDLE) {
-               /*
-                * This cpu is idle. If the busiest group doesn't have
-                * more tasks than the number of available cpus and
-                * there is no imbalance between this group and the busiest
-                * group with respect to idle cpus, it is balanced.
-                */
-               if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
-                   sds.busiest_nr_running <= sds.busiest_group_weight)
-                       goto out_balanced;
-       } else {
-               /*
-                * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
-                * imbalance_pct to be conservative.
-                */
-               if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
-                       goto out_balanced;
-       }
-
-force_balance:
-       /* Looks like there is an imbalance. Compute it */
-       calculate_imbalance(&sds, this_cpu, imbalance);
-       return sds.busiest;
-
-out_balanced:
-       /*
-        * There is no obvious imbalance. But check if we can do some balancing
-        * to save power.
-        */
-       if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
-               return sds.busiest;
-ret:
-       *imbalance = 0;
-       return NULL;
-}
-
-/*
- * find_busiest_queue - find the busiest runqueue among the cpus in group.
- */
-static struct rq *
-find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
-                  enum cpu_idle_type idle, unsigned long imbalance,
-                  const struct cpumask *cpus)
-{
-       struct rq *busiest = NULL, *rq;
-       unsigned long max_load = 0;
-       int i;
-
-       for_each_cpu(i, sched_group_cpus(group)) {
-               unsigned long power = power_of(i);
-               unsigned long capacity = DIV_ROUND_CLOSEST(power,
-                                                          SCHED_POWER_SCALE);
-               unsigned long wl;
-
-               if (!capacity)
-                       capacity = fix_small_capacity(sd, group);
-
-               if (!cpumask_test_cpu(i, cpus))
-                       continue;
-
-               rq = cpu_rq(i);
-               wl = weighted_cpuload(i);
-
-               /*
-                * When comparing with imbalance, use weighted_cpuload()
-                * which is not scaled with the cpu power.
-                */
-               if (capacity && rq->nr_running == 1 && wl > imbalance)
-                       continue;
-
-               /*
-                * For the load comparisons with the other cpus, consider
-                * the weighted_cpuload() scaled with the cpu power, so that
-                * the load can be moved away from the cpu that is potentially
-                * running at a lower capacity.
-                */
-               wl = (wl * SCHED_POWER_SCALE) / power;
-
-               if (wl > max_load) {
-                       max_load = wl;
-                       busiest = rq;
-               }
-       }
-
-       return busiest;
-}
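
A minimal sketch of the power scaling used in find_busiest_queue() above; the CPU power values and the 1024 scale are assumed for illustration. The same raw weighted load looks heavier on a lower-power CPU, so tasks are pulled away from it first:

#include <stdio.h>

#define POWER_SCALE 1024UL      /* stands in for SCHED_POWER_SCALE */

int main(void)
{
        unsigned long wl = 2048;         /* same raw weighted load on both */
        unsigned long full_power = 1024; /* "normal" CPU                   */
        unsigned long half_power = 512;  /* e.g. a busy SMT sibling        */

        printf("scaled load, full-power cpu: %lu\n",
               wl * POWER_SCALE / full_power);  /* 2048 */
        printf("scaled load, half-power cpu: %lu\n",
               wl * POWER_SCALE / half_power);  /* 4096 */
        return 0;
}
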
-
-/*
- * Max backoff if we encounter pinned tasks. The value is fairly arbitrary;
- * it just needs to be large enough.
- */
-#define MAX_PINNED_INTERVAL    512
-
-/* Working cpumask for load_balance and load_balance_newidle. */
-static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
-
-static int need_active_balance(struct sched_domain *sd, int idle,
-                              int busiest_cpu, int this_cpu)
-{
-       if (idle == CPU_NEWLY_IDLE) {
-
-               /*
-                * ASYM_PACKING needs to force migrate tasks from busy but
-                * higher numbered CPUs in order to pack all tasks in the
-                * lowest numbered CPUs.
-                */
-               if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu)
-                       return 1;
-
-               /*
-                * The only task running on a non-idle cpu can be moved to this
-                * cpu in an attempt to completely free up the other CPU
-                * package.
-                *
-                * The package power saving logic comes from
-                * find_busiest_group(). If there is no imbalance, then
-                * f_b_g() will return NULL. However, when sched_mc={1,2},
-                * f_b_g() will select a group from which a running task may be
-                * pulled to this cpu in order to make the other package idle.
-                * If there is no opportunity to make a package idle and
-                * there is no imbalance, then f_b_g() will return NULL and no
-                * action will be taken in load_balance_newidle().
-                *
-                * Under normal task pull operation due to imbalance, there
-                * will be more than one task in the source run queue and
-                * move_tasks() will succeed.  ld_moved will be true and this
-                * active balance code will not be triggered.
-                */
-               if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
-                       return 0;
-       }
-
-       return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
-}
-
-static int active_load_balance_cpu_stop(void *data);
-
-/*
- * Check this_cpu to ensure it is balanced within domain. Attempt to move
- * tasks if there is an imbalance.
- */
-static int load_balance(int this_cpu, struct rq *this_rq,
-                       struct sched_domain *sd, enum cpu_idle_type idle,
-                       int *balance)
-{
-       int ld_moved, all_pinned = 0, active_balance = 0;
-       struct sched_group *group;
-       unsigned long imbalance;
-       struct rq *busiest;
-       unsigned long flags;
-       struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
-
-       cpumask_copy(cpus, cpu_active_mask);
-
-       schedstat_inc(sd, lb_count[idle]);
-
-redo:
-       group = find_busiest_group(sd, this_cpu, &imbalance, idle,
-                                  cpus, balance);
-
-       if (*balance == 0)
-               goto out_balanced;
-
-       if (!group) {
-               schedstat_inc(sd, lb_nobusyg[idle]);
-               goto out_balanced;
-       }
-
-       busiest = find_busiest_queue(sd, group, idle, imbalance, cpus);
-       if (!busiest) {
-               schedstat_inc(sd, lb_nobusyq[idle]);
-               goto out_balanced;
-       }
-
-       BUG_ON(busiest == this_rq);
-
-       schedstat_add(sd, lb_imbalance[idle], imbalance);
-
-       ld_moved = 0;
-       if (busiest->nr_running > 1) {
-               /*
-                * Attempt to move tasks. If find_busiest_group has found
-                * an imbalance but busiest->nr_running <= 1, the group is
-                * still unbalanced. ld_moved simply stays zero, so it is
-                * correctly treated as an imbalance.
-                */
-               all_pinned = 1;
-               local_irq_save(flags);
-               double_rq_lock(this_rq, busiest);
-               ld_moved = move_tasks(this_rq, this_cpu, busiest,
-                                     imbalance, sd, idle, &all_pinned);
-               double_rq_unlock(this_rq, busiest);
-               local_irq_restore(flags);
-
-               /*
-                * some other cpu did the load balance for us.
-                */
-               if (ld_moved && this_cpu != smp_processor_id())
-                       resched_cpu(this_cpu);
-
-               /* All tasks on this runqueue were pinned by CPU affinity */
-               if (unlikely(all_pinned)) {
-                       cpumask_clear_cpu(cpu_of(busiest), cpus);
-                       if (!cpumask_empty(cpus))
-                               goto redo;
-                       goto out_balanced;
-               }
-       }
-
-       if (!ld_moved) {
-               schedstat_inc(sd, lb_failed[idle]);
-               /*
-                * Increment the failure counter only on periodic balance.
-                * We do not want newidle balance, which can be very
-                * frequent, pollute the failure counter causing
-                * excessive cache_hot migrations and active balances.
-                */
-               if (idle != CPU_NEWLY_IDLE)
-                       sd->nr_balance_failed++;
-
-               if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) {
-                       raw_spin_lock_irqsave(&busiest->lock, flags);
-
-                       /* don't kick active_load_balance_cpu_stop
-                        * if the curr task on the busiest cpu can't be
-                        * moved to this_cpu
-                        */
-                       if (!cpumask_test_cpu(this_cpu,
-                                       tsk_cpus_allowed(busiest->curr))) {
-                               raw_spin_unlock_irqrestore(&busiest->lock,
-                                                           flags);
-                               all_pinned = 1;
-                               goto out_one_pinned;
-                       }
-
-                       /*
-                        * ->active_balance synchronizes accesses to
-                        * ->active_balance_work.  Once set, it's cleared
-                        * only after active load balance is finished.
-                        */
-                       if (!busiest->active_balance) {
-                               busiest->active_balance = 1;
-                               busiest->push_cpu = this_cpu;
-                               active_balance = 1;
-                       }
-                       raw_spin_unlock_irqrestore(&busiest->lock, flags);
-
-                       if (active_balance)
-                               stop_one_cpu_nowait(cpu_of(busiest),
-                                       active_load_balance_cpu_stop, busiest,
-                                       &busiest->active_balance_work);
-
-                       /*
-                        * We've kicked active balancing, reset the failure
-                        * counter.
-                        */
-                       sd->nr_balance_failed = sd->cache_nice_tries+1;
-               }
-       } else
-               sd->nr_balance_failed = 0;
-
-       if (likely(!active_balance)) {
-               /* We were unbalanced, so reset the balancing interval */
-               sd->balance_interval = sd->min_interval;
-       } else {
-               /*
-                * If we've begun active balancing, start to back off. This
-                * case may not be covered by the all_pinned logic if there
-                * is only 1 task on the busy runqueue (because we don't call
-                * move_tasks).
-                */
-               if (sd->balance_interval < sd->max_interval)
-                       sd->balance_interval *= 2;
-       }
-
-       goto out;
-
-out_balanced:
-       schedstat_inc(sd, lb_balanced[idle]);
-
-       sd->nr_balance_failed = 0;
-
-out_one_pinned:
-       /* tune up the balancing interval */
-       if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
-                       (sd->balance_interval < sd->max_interval))
-               sd->balance_interval *= 2;
-
-       ld_moved = 0;
-out:
-       return ld_moved;
-}
-
-/*
- * idle_balance is called by schedule() if this_cpu is about to become
- * idle. Attempts to pull tasks from other CPUs.
- */
-static void idle_balance(int this_cpu, struct rq *this_rq)
-{
-       struct sched_domain *sd;
-       int pulled_task = 0;
-       unsigned long next_balance = jiffies + HZ;
-
-       this_rq->idle_stamp = this_rq->clock;
-
-       if (this_rq->avg_idle < sysctl_sched_migration_cost)
-               return;
-
-       /*
-        * Drop the rq->lock, but keep IRQ/preempt disabled.
-        */
-       raw_spin_unlock(&this_rq->lock);
-
-       update_shares(this_cpu);
-       rcu_read_lock();
-       for_each_domain(this_cpu, sd) {
-               unsigned long interval;
-               int balance = 1;
-
-               if (!(sd->flags & SD_LOAD_BALANCE))
-                       continue;
-
-               if (sd->flags & SD_BALANCE_NEWIDLE) {
-                       /* If we've pulled tasks over stop searching: */
-                       pulled_task = load_balance(this_cpu, this_rq,
-                                                  sd, CPU_NEWLY_IDLE, &balance);
-               }
-
-               interval = msecs_to_jiffies(sd->balance_interval);
-               if (time_after(next_balance, sd->last_balance + interval))
-                       next_balance = sd->last_balance + interval;
-               if (pulled_task) {
-                       this_rq->idle_stamp = 0;
-                       break;
-               }
-       }
-       rcu_read_unlock();
-
-       raw_spin_lock(&this_rq->lock);
-
-       if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
-               /*
-                * We are going idle. next_balance may be set based on
-                * a busy processor. So reset next_balance.
-                */
-               this_rq->next_balance = next_balance;
-       }
-}
-
-/*
- * active_load_balance_cpu_stop is run by cpu stopper. It pushes
- * running tasks off the busiest CPU onto idle CPUs. It requires at
- * least 1 task to be running on each physical CPU where possible, and
- * avoids physical / logical imbalances.
- */
-static int active_load_balance_cpu_stop(void *data)
-{
-       struct rq *busiest_rq = data;
-       int busiest_cpu = cpu_of(busiest_rq);
-       int target_cpu = busiest_rq->push_cpu;
-       struct rq *target_rq = cpu_rq(target_cpu);
-       struct sched_domain *sd;
-
-       raw_spin_lock_irq(&busiest_rq->lock);
-
-       /* make sure the requested cpu hasn't gone down in the meantime */
-       if (unlikely(busiest_cpu != smp_processor_id() ||
-                    !busiest_rq->active_balance))
-               goto out_unlock;
-
-       /* Is there any task to move? */
-       if (busiest_rq->nr_running <= 1)
-               goto out_unlock;
-
-       /*
-        * This condition is "impossible"; if it occurs
-        * we need to fix it. Originally reported by
-        * Bjorn Helgaas on a 128-cpu setup.
-        */
-       BUG_ON(busiest_rq == target_rq);
-
-       /* move a task from busiest_rq to target_rq */
-       double_lock_balance(busiest_rq, target_rq);
-
-       /* Search for an sd spanning us and the target CPU. */
-       rcu_read_lock();
-       for_each_domain(target_cpu, sd) {
-               if ((sd->flags & SD_LOAD_BALANCE) &&
-                   cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
-                               break;
-       }
-
-       if (likely(sd)) {
-               schedstat_inc(sd, alb_count);
-
-               if (move_one_task(target_rq, target_cpu, busiest_rq,
-                                 sd, CPU_IDLE))
-                       schedstat_inc(sd, alb_pushed);
-               else
-                       schedstat_inc(sd, alb_failed);
-       }
-       rcu_read_unlock();
-       double_unlock_balance(busiest_rq, target_rq);
-out_unlock:
-       busiest_rq->active_balance = 0;
-       raw_spin_unlock_irq(&busiest_rq->lock);
-       return 0;
-}
-
-#ifdef CONFIG_NO_HZ
-/*
- * idle load balancing details
- * - One of the idle CPUs nominates itself as idle load_balancer, while
- *   entering idle.
- * - This idle load balancer CPU will also go into tickless mode when
- *   it is idle, just like all other idle CPUs
- * - When one of the busy CPUs notices that idle rebalancing may be
- *   needed, it will kick the idle load balancer, which then does idle
- *   load balancing for all the idle CPUs.
- */
-static struct {
-       atomic_t load_balancer;
-       atomic_t first_pick_cpu;
-       atomic_t second_pick_cpu;
-       cpumask_var_t idle_cpus_mask;
-       cpumask_var_t grp_idle_mask;
-       unsigned long next_balance;     /* in jiffy units */
-} nohz ____cacheline_aligned;
-
-int get_nohz_load_balancer(void)
-{
-       return atomic_read(&nohz.load_balancer);
-}
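
A minimal sketch, using C11 atomics and a hypothetical try_claim_balancer() helper, of the compare-and-exchange nomination the nohz description above relies on; here -1 stands in for nr_cpu_ids (slot unclaimed):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int load_balancer = -1;   /* -1 == no ilb owner */

static int try_claim_balancer(int cpu)
{
        int expected = -1;

        /* Succeeds for exactly one caller while the slot is free. */
        return atomic_compare_exchange_strong(&load_balancer, &expected, cpu);
}

int main(void)
{
        printf("cpu 2 claims: %d\n", try_claim_balancer(2)); /* 1 */
        printf("cpu 5 claims: %d\n", try_claim_balancer(5)); /* 0 */
        return 0;
}
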
-
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-/**
- * lowest_flag_domain - Return lowest sched_domain containing flag.
- * @cpu:       The cpu whose lowest level of sched domain is to
- *             be returned.
- * @flag:      The flag to check for the lowest sched_domain
- *             for the given cpu.
- *
- * Returns the lowest sched_domain of a cpu which contains the given flag.
- */
-static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
-{
-       struct sched_domain *sd;
-
-       for_each_domain(cpu, sd)
-               if (sd->flags & flag)
-                       break;
-
-       return sd;
-}
-
-/**
- * for_each_flag_domain - Iterates over sched_domains containing the flag.
- * @cpu:       The cpu whose domains we're iterating over.
- * @sd:                variable holding the value of the power_savings_sd
- *             for cpu.
- * @flag:      The flag to filter the sched_domains to be iterated.
- *
- * Iterates over all the scheduler domains for a given cpu that has the 'flag'
- * set, starting from the lowest sched_domain to the highest.
- */
-#define for_each_flag_domain(cpu, sd, flag) \
-       for (sd = lowest_flag_domain(cpu, flag); \
-               (sd && (sd->flags & flag)); sd = sd->parent)
-
-/**
- * is_semi_idle_group - Checks if the given sched_group is semi-idle.
- * @ilb_group: group to be checked for semi-idleness
- *
- * Returns:    1 if the group is semi-idle. 0 otherwise.
- *
- * We define a sched_group to be semi-idle if it has at least one idle CPU
- * and at least one non-idle CPU. This helper function checks if the given
- * sched_group is semi-idle or not.
- */
-static inline int is_semi_idle_group(struct sched_group *ilb_group)
-{
-       cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask,
-                                       sched_group_cpus(ilb_group));
-
-       /*
-        * A sched_group is semi-idle when it has at least one busy cpu
-        * and at least one idle cpu.
-        */
-       if (cpumask_empty(nohz.grp_idle_mask))
-               return 0;
-
-       if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group)))
-               return 0;
-
-       return 1;
-}
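
A minimal sketch of the semi-idle test above using plain bitmasks instead of cpumasks; the 8-bit group and the idle masks are assumed values:

#include <stdio.h>

/* Semi-idle: the group intersected with the idle mask is neither empty
 * nor the whole group. */
static int is_semi_idle(unsigned int group, unsigned int idle)
{
        unsigned int idle_in_group = group & idle;

        return idle_in_group != 0 && idle_in_group != group;
}

int main(void)
{
        unsigned int group = 0x0f;                 /* CPUs 0-3          */

        printf("%d\n", is_semi_idle(group, 0x00)); /* all busy -> 0     */
        printf("%d\n", is_semi_idle(group, 0x0f)); /* all idle -> 0     */
        printf("%d\n", is_semi_idle(group, 0x03)); /* mixed    -> 1     */
        return 0;
}
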
-/**
- * find_new_ilb - Finds the optimum idle load balancer for nomination.
- * @cpu:       The cpu which is nominating a new idle_load_balancer.
- *
- * Returns:    The id of the idle load balancer if one exists,
- *             else a value >= nr_cpu_ids.
- *
- * This algorithm picks the idle load balancer such that it belongs to a
- * semi-idle powersavings sched_domain. The idea is to try to avoid
- * completely idle packages/cores just for the purpose of idle load balancing
- * when there are other idle cpus which are better suited for that job.
- */
-static int find_new_ilb(int cpu)
-{
-       struct sched_domain *sd;
-       struct sched_group *ilb_group;
-       int ilb = nr_cpu_ids;
-
-       /*
-        * Have idle load balancer selection from semi-idle packages only
-        * when power-aware load balancing is enabled
-        */
-       if (!(sched_smt_power_savings || sched_mc_power_savings))
-               goto out_done;
-
-       /*
-        * Optimize for the case when we have no idle CPUs or only one
-        * idle CPU. Don't walk the sched_domain hierarchy in such cases
-        */
-       if (cpumask_weight(nohz.idle_cpus_mask) < 2)
-               goto out_done;
-
-       rcu_read_lock();
-       for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
-               ilb_group = sd->groups;
-
-               do {
-                       if (is_semi_idle_group(ilb_group)) {
-                               ilb = cpumask_first(nohz.grp_idle_mask);
-                               goto unlock;
-                       }
-
-                       ilb_group = ilb_group->next;
-
-               } while (ilb_group != sd->groups);
-       }
-unlock:
-       rcu_read_unlock();
-
-out_done:
-       return ilb;
-}
-#else /*  (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
-static inline int find_new_ilb(int call_cpu)
-{
-       return nr_cpu_ids;
-}
-#endif
-
-/*
- * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
- * nohz_load_balancer CPU (if there is one), otherwise fall back to any idle
- * CPU (if there is one).
- */
-static void nohz_balancer_kick(int cpu)
-{
-       int ilb_cpu;
-
-       nohz.next_balance++;
-
-       ilb_cpu = get_nohz_load_balancer();
-
-       if (ilb_cpu >= nr_cpu_ids) {
-               ilb_cpu = cpumask_first(nohz.idle_cpus_mask);
-               if (ilb_cpu >= nr_cpu_ids)
-                       return;
-       }
-
-       if (!cpu_rq(ilb_cpu)->nohz_balance_kick) {
-               cpu_rq(ilb_cpu)->nohz_balance_kick = 1;
-
-               smp_mb();
-               /*
-                * Use smp_send_reschedule() instead of resched_cpu().
-                * This way we generate a sched IPI on the target cpu which
-                * is idle. And the softirq performing nohz idle load balance
-                * will be run before returning from the IPI.
-                */
-               smp_send_reschedule(ilb_cpu);
-       }
-       return;
-}
-
-/*
- * This routine will try to nominate the ilb (idle load balancing)
- * owner among the cpus whose ticks are stopped. The ilb owner will do the idle
- * load balancing on behalf of all those cpus.
- *
- * When the ilb owner becomes busy, we will not have new ilb owner until some
- * idle CPU wakes up and goes back to idle or some busy CPU tries to kick
- * idle load balancing by kicking one of the idle CPUs.
- *
- * Ticks are stopped for the ilb owner as well, with a busy CPU kicking this
- * ilb owner CPU in the future (when there is a need for idle load balancing on
- * behalf of all idle CPUs).
- */
-void select_nohz_load_balancer(int stop_tick)
-{
-       int cpu = smp_processor_id();
-
-       if (stop_tick) {
-               if (!cpu_active(cpu)) {
-                       if (atomic_read(&nohz.load_balancer) != cpu)
-                               return;
-
-                       /*
-                        * If we are going offline and still the leader,
-                        * give up!
-                        */
-                       if (atomic_cmpxchg(&nohz.load_balancer, cpu,
-                                          nr_cpu_ids) != cpu)
-                               BUG();
-
-                       return;
-               }
-
-               cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
-
-               if (atomic_read(&nohz.first_pick_cpu) == cpu)
-                       atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids);
-               if (atomic_read(&nohz.second_pick_cpu) == cpu)
-                       atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
-
-               if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) {
-                       int new_ilb;
-
-                       /* make me the ilb owner */
-                       if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids,
-                                          cpu) != nr_cpu_ids)
-                               return;
-
-                       /*
-                        * Check to see if there is a more power-efficient
-                        * ilb.
-                        */
-                       new_ilb = find_new_ilb(cpu);
-                       if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
-                               atomic_set(&nohz.load_balancer, nr_cpu_ids);
-                               resched_cpu(new_ilb);
-                               return;
-                       }
-                       return;
-               }
-       } else {
-               if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
-                       return;
-
-               cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
-
-               if (atomic_read(&nohz.load_balancer) == cpu)
-                       if (atomic_cmpxchg(&nohz.load_balancer, cpu,
-                                          nr_cpu_ids) != cpu)
-                               BUG();
-       }
-       return;
-}
-#endif
-
-static DEFINE_SPINLOCK(balancing);
-
-static unsigned long __read_mostly max_load_balance_interval = HZ/10;
-
-/*
- * Scale the max load_balance interval with the number of CPUs in the system.
- * This trades load-balance latency on larger machines for less cross talk.
- */
-static void update_max_interval(void)
-{
-       max_load_balance_interval = HZ*num_online_cpus()/10;
-}
-
-/*
- * It checks each scheduling domain to see if it is due to be balanced,
- * and initiates a balancing operation if so.
- *
- * Balancing parameters are set up in arch_init_sched_domains.
- */
-static void rebalance_domains(int cpu, enum cpu_idle_type idle)
-{
-       int balance = 1;
-       struct rq *rq = cpu_rq(cpu);
-       unsigned long interval;
-       struct sched_domain *sd;
-       /* Earliest time when we have to do rebalance again */
-       unsigned long next_balance = jiffies + 60*HZ;
-       int update_next_balance = 0;
-       int need_serialize;
-
-       update_shares(cpu);
-
-       rcu_read_lock();
-       for_each_domain(cpu, sd) {
-               if (!(sd->flags & SD_LOAD_BALANCE))
-                       continue;
-
-               interval = sd->balance_interval;
-               if (idle != CPU_IDLE)
-                       interval *= sd->busy_factor;
-
-               /* scale ms to jiffies */
-               interval = msecs_to_jiffies(interval);
-               interval = clamp(interval, 1UL, max_load_balance_interval);
-
-               need_serialize = sd->flags & SD_SERIALIZE;
-
-               if (need_serialize) {
-                       if (!spin_trylock(&balancing))
-                               goto out;
-               }
-
-               if (time_after_eq(jiffies, sd->last_balance + interval)) {
-                       if (load_balance(cpu, rq, sd, idle, &balance)) {
-                               /*
-                                * We've pulled tasks over, so we're no
-                                * longer idle.
-                                */
-                               idle = CPU_NOT_IDLE;
-                       }
-                       sd->last_balance = jiffies;
-               }
-               if (need_serialize)
-                       spin_unlock(&balancing);
-out:
-               if (time_after(next_balance, sd->last_balance + interval)) {
-                       next_balance = sd->last_balance + interval;
-                       update_next_balance = 1;
-               }
-
-               /*
-                * Stop the load balance at this level. There is another
-                * CPU in our sched group which is doing load balancing more
-                * actively.
-                */
-               if (!balance)
-                       break;
-       }
-       rcu_read_unlock();
-
-       /*
-        * next_balance will be updated only when there is a need.
-        * When the cpu is attached to a null domain, for example, it will not be
-        * updated.
-        */
-       if (likely(update_next_balance))
-               rq->next_balance = next_balance;
-}
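
A minimal sketch of the interval arithmetic in rebalance_domains() above; HZ, the busy factor and the CPU count are assumed values, and clamp_ul() is a stand-in for the kernel's clamp():

#include <stdio.h>

#define HZ 1000UL

static unsigned long clamp_ul(unsigned long v, unsigned long lo,
                              unsigned long hi)
{
        return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
        unsigned long balance_interval = 64;    /* ms                        */
        unsigned long busy_factor = 32;
        unsigned long online_cpus = 4;
        unsigned long max_interval = HZ * online_cpus / 10; /* 400 jiffies   */

        unsigned long interval = balance_interval * busy_factor; /* ms       */

        interval = interval * HZ / 1000;        /* ms -> jiffies             */
        interval = clamp_ul(interval, 1UL, max_interval);

        printf("interval = %lu jiffies (max %lu)\n", interval, max_interval);
        return 0;                               /* prints 400 (max 400)      */
}
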
-
-#ifdef CONFIG_NO_HZ
-/*
- * In CONFIG_NO_HZ case, the idle balance kickee will do the
- * rebalancing for all the cpus for whom scheduler ticks are stopped.
- */
-static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
-{
-       struct rq *this_rq = cpu_rq(this_cpu);
-       struct rq *rq;
-       int balance_cpu;
-
-       if (idle != CPU_IDLE || !this_rq->nohz_balance_kick)
-               return;
-
-       for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
-               if (balance_cpu == this_cpu)
-                       continue;
-
-               /*
-                * If this cpu gets work to do, stop the load balancing
-                * work being done for other cpus. Next load
-                * balancing owner will pick it up.
-                */
-               if (need_resched()) {
-                       this_rq->nohz_balance_kick = 0;
-                       break;
-               }
-
-               raw_spin_lock_irq(&this_rq->lock);
-               update_rq_clock(this_rq);
-               update_cpu_load(this_rq);
-               raw_spin_unlock_irq(&this_rq->lock);
-
-               rebalance_domains(balance_cpu, CPU_IDLE);
-
-               rq = cpu_rq(balance_cpu);
-               if (time_after(this_rq->next_balance, rq->next_balance))
-                       this_rq->next_balance = rq->next_balance;
-       }
-       nohz.next_balance = this_rq->next_balance;
-       this_rq->nohz_balance_kick = 0;
-}
-
-/*
- * Current heuristic for kicking the idle load balancer
- * - first_pick_cpu is one of the busy CPUs. It will kick the
- *   idle load balancer when it has more than one process active. This
- *   eliminates the need for idle load balancing altogether when we have
- *   only one running process in the system (common case).
- * - If there is more than one busy CPU, the idle load balancer may have
- *   to run for active_load_balance to happen (i.e., two busy CPUs are
- *   SMT or core siblings and can run better if they move to different
- *   physical CPUs). So, second_pick_cpu is the second of the busy CPUs,
- *   which will kick the idle load balancer as soon as it has any load.
- */
-static inline int nohz_kick_needed(struct rq *rq, int cpu)
-{
-       unsigned long now = jiffies;
-       int ret;
-       int first_pick_cpu, second_pick_cpu;
-
-       if (time_before(now, nohz.next_balance))
-               return 0;
-
-       if (idle_cpu(cpu))
-               return 0;
-
-       first_pick_cpu = atomic_read(&nohz.first_pick_cpu);
-       second_pick_cpu = atomic_read(&nohz.second_pick_cpu);
-
-       if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu &&
-           second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu)
-               return 0;
-
-       ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu);
-       if (ret == nr_cpu_ids || ret == cpu) {
-               atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
-               if (rq->nr_running > 1)
-                       return 1;
-       } else {
-               ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu);
-               if (ret == nr_cpu_ids || ret == cpu) {
-                       if (rq->nr_running)
-                               return 1;
-               }
-       }
-       return 0;
-}
-#else
-static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
-#endif
-
-/*
- * run_rebalance_domains is triggered when needed from the scheduler tick.
- * Also triggered for nohz idle balancing (with nohz_balance_kick set).
- */
-static void run_rebalance_domains(struct softirq_action *h)
-{
-       int this_cpu = smp_processor_id();
-       struct rq *this_rq = cpu_rq(this_cpu);
-       enum cpu_idle_type idle = this_rq->idle_balance ?
-                                               CPU_IDLE : CPU_NOT_IDLE;
-
-       rebalance_domains(this_cpu, idle);
-
-       /*
-        * If this cpu has a pending nohz_balance_kick, then do the
-        * balancing on behalf of the other idle cpus whose ticks are
-        * stopped.
-        */
-       nohz_idle_balance(this_cpu, idle);
-}
-
-static inline int on_null_domain(int cpu)
-{
-       return !rcu_dereference_sched(cpu_rq(cpu)->sd);
-}
-
-/*
- * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
- */
-static inline void trigger_load_balance(struct rq *rq, int cpu)
-{
-       /* Don't need to rebalance while attached to NULL domain */
-       if (time_after_eq(jiffies, rq->next_balance) &&
-           likely(!on_null_domain(cpu)))
-               raise_softirq(SCHED_SOFTIRQ);
-#ifdef CONFIG_NO_HZ
-       else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
-               nohz_balancer_kick(cpu);
-#endif
-}
-
-static void rq_online_fair(struct rq *rq)
-{
-       update_sysctl();
-}
-
-static void rq_offline_fair(struct rq *rq)
-{
-       update_sysctl();
-}
-
-#else  /* CONFIG_SMP */
-
-/*
- * on UP we do not need to balance between CPUs:
- */
-static inline void idle_balance(int cpu, struct rq *rq)
-{
-}
-
-#endif /* CONFIG_SMP */
-
-/*
- * scheduler tick hitting a task of our scheduling class:
- */
-static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
-{
-       struct cfs_rq *cfs_rq;
-       struct sched_entity *se = &curr->se;
-
-       for_each_sched_entity(se) {
-               cfs_rq = cfs_rq_of(se);
-               entity_tick(cfs_rq, se, queued);
-       }
-}
-
-/*
- * called on fork with the child task as argument from the parent's context
- *  - child not yet on the tasklist
- *  - preemption disabled
- */
-static void task_fork_fair(struct task_struct *p)
-{
-       struct cfs_rq *cfs_rq = task_cfs_rq(current);
-       struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
-       int this_cpu = smp_processor_id();
-       struct rq *rq = this_rq();
-       unsigned long flags;
-
-       raw_spin_lock_irqsave(&rq->lock, flags);
-
-       update_rq_clock(rq);
-
-       if (unlikely(task_cpu(p) != this_cpu)) {
-               rcu_read_lock();
-               __set_task_cpu(p, this_cpu);
-               rcu_read_unlock();
-       }
-
-       update_curr(cfs_rq);
-
-       if (curr)
-               se->vruntime = curr->vruntime;
-       place_entity(cfs_rq, se, 1);
-
-       if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
-               /*
-                * Upon rescheduling, sched_class::put_prev_task() will place
-                * 'current' within the tree based on its new key value.
-                */
-               swap(curr->vruntime, se->vruntime);
-               resched_task(rq->curr);
-       }
-
-       se->vruntime -= cfs_rq->min_vruntime;
-
-       raw_spin_unlock_irqrestore(&rq->lock, flags);
-}
-
-/*
- * Priority of the task has changed. Check to see if we preempt
- * the current task.
- */
-static void
-prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
-{
-       if (!p->se.on_rq)
-               return;
-
-       /*
-        * Reschedule if we are currently running on this runqueue and
-        * our priority decreased, or if we are not currently running on
-        * this runqueue and our priority is higher than the current's
-        */
-       if (rq->curr == p) {
-               if (p->prio > oldprio)
-                       resched_task(rq->curr);
-       } else
-               check_preempt_curr(rq, p, 0);
-}
-
-static void switched_from_fair(struct rq *rq, struct task_struct *p)
-{
-       struct sched_entity *se = &p->se;
-       struct cfs_rq *cfs_rq = cfs_rq_of(se);
-
-       /*
-        * Ensure the task's vruntime is normalized, so that when it's
-        * switched back to the fair class the enqueue_entity(.flags=0) will
-        * do the right thing.
-        *
-        * If it was on_rq, then the dequeue_entity(.flags=0) will already
-        * have normalized the vruntime; if it was !on_rq, then only when
-        * the task is sleeping will it still have non-normalized vruntime.
-        */
-       if (!se->on_rq && p->state != TASK_RUNNING) {
-               /*
-                * Fix up our vruntime so that the current sleep doesn't
-                * cause 'unlimited' sleep bonus.
-                */
-               place_entity(cfs_rq, se, 0);
-               se->vruntime -= cfs_rq->min_vruntime;
-       }
-}
-
-/*
- * We switched to the sched_fair class.
- */
-static void switched_to_fair(struct rq *rq, struct task_struct *p)
-{
-       if (!p->se.on_rq)
-               return;
-
-       /*
-        * We were most likely switched from sched_rt, so
-        * kick off the schedule if running, otherwise just see
-        * if we can still preempt the current task.
-        */
-       if (rq->curr == p)
-               resched_task(rq->curr);
-       else
-               check_preempt_curr(rq, p, 0);
-}
-
-/* Account for a task changing its policy or group.
- *
- * This routine is mostly called to set cfs_rq->curr field when a task
- * migrates between groups/classes.
- */
-static void set_curr_task_fair(struct rq *rq)
-{
-       struct sched_entity *se = &rq->curr->se;
-
-       for_each_sched_entity(se) {
-               struct cfs_rq *cfs_rq = cfs_rq_of(se);
-
-               set_next_entity(cfs_rq, se);
-               /* ensure bandwidth has been allocated on our new cfs_rq */
-               account_cfs_rq_runtime(cfs_rq, 0);
-       }
-}
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static void task_move_group_fair(struct task_struct *p, int on_rq)
-{
-       /*
-        * If the task was not on the rq at the time of this cgroup movement
-        * it must have been asleep; sleeping tasks keep their ->vruntime
-        * absolute on their old rq until wakeup (needed for the fair sleeper
-        * bonus in place_entity()).
-        *
-        * If it was on the rq, we've just 'preempted' it, which does convert
-        * ->vruntime to a relative base.
-        *
-        * Make sure both cases convert their relative position when migrating
-        * to another cgroup's rq. This does somewhat interfere with the
-        * fair sleeper stuff for the first placement, but who cares.
-        */
-       if (!on_rq)
-               p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
-       set_task_rq(p, task_cpu(p));
-       if (!on_rq)
-               p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
-}
-#endif
-
-static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
-{
-       struct sched_entity *se = &task->se;
-       unsigned int rr_interval = 0;
-
-       /*
-        * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
-        * idle runqueue:
-        */
-       if (rq->cfs.load.weight)
-               rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
-
-       return rr_interval;
-}
-
-/*
- * All the scheduling class methods:
- */
-static const struct sched_class fair_sched_class = {
-       .next                   = &idle_sched_class,
-       .enqueue_task           = enqueue_task_fair,
-       .dequeue_task           = dequeue_task_fair,
-       .yield_task             = yield_task_fair,
-       .yield_to_task          = yield_to_task_fair,
-
-       .check_preempt_curr     = check_preempt_wakeup,
-
-       .pick_next_task         = pick_next_task_fair,
-       .put_prev_task          = put_prev_task_fair,
-
-#ifdef CONFIG_SMP
-       .select_task_rq         = select_task_rq_fair,
-
-       .rq_online              = rq_online_fair,
-       .rq_offline             = rq_offline_fair,
-
-       .task_waking            = task_waking_fair,
-#endif
-
-       .set_curr_task          = set_curr_task_fair,
-       .task_tick              = task_tick_fair,
-       .task_fork              = task_fork_fair,
-
-       .prio_changed           = prio_changed_fair,
-       .switched_from          = switched_from_fair,
-       .switched_to            = switched_to_fair,
-
-       .get_rr_interval        = get_rr_interval_fair,
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-       .task_move_group        = task_move_group_fair,
-#endif
-};
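
A minimal sketch of the const method-table pattern fair_sched_class uses above, with hypothetical demo types rather than the kernel's sched_class:

#include <stdio.h>

struct demo_class {
        void (*enqueue)(int id);
        void (*dequeue)(int id);
};

static void demo_enqueue(int id) { printf("enqueue %d\n", id); }
static void demo_dequeue(int id) { printf("dequeue %d\n", id); }

/* Designated initializers; unset methods stay NULL. */
static const struct demo_class demo_sched_class = {
        .enqueue        = demo_enqueue,
        .dequeue        = demo_dequeue,
};

int main(void)
{
        demo_sched_class.enqueue(1);
        demo_sched_class.dequeue(1);
        return 0;
}
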
-
-#ifdef CONFIG_SCHED_DEBUG
-static void print_cfs_stats(struct seq_file *m, int cpu)
-{
-       struct cfs_rq *cfs_rq;
-
-       rcu_read_lock();
-       for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
-               print_cfs_rq(m, cpu, cfs_rq);
-       rcu_read_unlock();
-}
-#endif
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
deleted file mode 100644 (file)
index 8480224..0000000
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Only give sleepers 50% of their service deficit. This allows
- * them to run sooner, but does not allow tons of sleepers to
- * rip the spread apart.
- */
-SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1)
-
-/*
- * Place new tasks ahead so that they do not starve already running
- * tasks
- */
-SCHED_FEAT(START_DEBIT, 1)
-
-/*
- * Based on load and program behaviour, see if it makes sense to place
- * a newly woken task on the same cpu as the task that woke it --
- * improve cache locality. Typically used with SYNC wakeups as
- * generated by pipes and the like, see also SYNC_WAKEUPS.
- */
-SCHED_FEAT(AFFINE_WAKEUPS, 1)
-
-/*
- * Prefer to schedule the task we woke last (assuming it failed
- * wakeup-preemption), since it's likely going to consume data we
- * touched, which increases cache locality.
- */
-SCHED_FEAT(NEXT_BUDDY, 0)
-
-/*
- * Prefer to schedule the task that ran last (when we did
- * wake-preempt) as that will likely touch the same data, which increases
- * cache locality.
- */
-SCHED_FEAT(LAST_BUDDY, 1)
-
-/*
- * Consider buddies to be cache hot; this decreases the likelihood of a
- * cache buddy being migrated away and increases cache locality.
- */
-SCHED_FEAT(CACHE_HOT_BUDDY, 1)
-
-/*
- * Use arch dependent cpu power functions
- */
-SCHED_FEAT(ARCH_POWER, 0)
-
-SCHED_FEAT(HRTICK, 0)
-SCHED_FEAT(DOUBLE_TICK, 0)
-SCHED_FEAT(LB_BIAS, 1)
-
-/*
- * Spin-wait on mutex acquisition when the mutex owner is running on
- * another cpu -- assumes that when the owner is running, it will soon
- * release the lock. Decreases scheduling overhead.
- */
-SCHED_FEAT(OWNER_SPIN, 1)
-
-/*
- * Decrement CPU power based on time not spent running tasks
- */
-SCHED_FEAT(NONTASK_POWER, 1)
-
-/*
- * Queue remote wakeups on the target CPU and process them
- * using the scheduler IPI. Reduces rq->lock contention/bounces.
- */
-SCHED_FEAT(TTWU_QUEUE, 1)
-
-SCHED_FEAT(FORCE_SD_OVERLAP, 0)
-SCHED_FEAT(RT_RUNTIME_SHARE, 1)
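
The SCHED_FEAT() entries above are typically consumed as an x-macro list, with the header included under different definitions of the macro; a minimal sketch with a hypothetical two-entry list (the names and defaults are not the kernel's):

#include <stdio.h>

#define DEMO_FEATURES(F)        \
        F(GENTLE_SLEEPERS, 1)   \
        F(START_DEBIT, 1)

/* One expansion builds an enum of feature indices ... */
#define F_ENUM(name, enabled)   DEMO_FEAT_##name,
enum { DEMO_FEATURES(F_ENUM) DEMO_FEAT_NR };

/* ... another the printable names ... */
#define F_NAME(name, enabled)   #name,
static const char *demo_feat_names[] = { DEMO_FEATURES(F_NAME) };

/* ... and another the default enable bitmask. */
#define F_DEFAULT(name, enabled) (enabled << DEMO_FEAT_##name) |
static const unsigned int demo_feat_default = DEMO_FEATURES(F_DEFAULT) 0;

int main(void)
{
        for (int i = 0; i < DEMO_FEAT_NR; i++)
                printf("%s=%d\n", demo_feat_names[i],
                       !!(demo_feat_default & (1u << i)));
        return 0;
}
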
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
deleted file mode 100644 (file)
index 0a51882..0000000
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * idle-task scheduling class.
- *
- * (NOTE: these are not related to SCHED_IDLE tasks which are
- *  handled in sched_fair.c)
- */
-
-#ifdef CONFIG_SMP
-static int
-select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
-{
-       return task_cpu(p); /* IDLE tasks are never migrated */
-}
-#endif /* CONFIG_SMP */
-/*
- * Idle tasks are unconditionally rescheduled:
- */
-static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
-{
-       resched_task(rq->idle);
-}
-
-static struct task_struct *pick_next_task_idle(struct rq *rq)
-{
-       schedstat_inc(rq, sched_goidle);
-       calc_load_account_idle(rq);
-       return rq->idle;
-}
-
-/*
- * It is not legal to sleep in the idle task - print a warning
- * message if some code attempts to do it:
- */
-static void
-dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
-{
-       raw_spin_unlock_irq(&rq->lock);
-       printk(KERN_ERR "bad: scheduling from the idle thread!\n");
-       dump_stack();
-       raw_spin_lock_irq(&rq->lock);
-}
-
-static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
-{
-}
-
-static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
-{
-}
-
-static void set_curr_task_idle(struct rq *rq)
-{
-}
-
-static void switched_to_idle(struct rq *rq, struct task_struct *p)
-{
-       BUG();
-}
-
-static void
-prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
-{
-       BUG();
-}
-
-static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
-{
-       return 0;
-}
-
-/*
- * Simple, special scheduling class for the per-CPU idle tasks:
- */
-static const struct sched_class idle_sched_class = {
-       /* .next is NULL */
-       /* no enqueue/yield_task for idle tasks */
-
-       /* dequeue is not valid, we print a debug message there: */
-       .dequeue_task           = dequeue_task_idle,
-
-       .check_preempt_curr     = check_preempt_curr_idle,
-
-       .pick_next_task         = pick_next_task_idle,
-       .put_prev_task          = put_prev_task_idle,
-
-#ifdef CONFIG_SMP
-       .select_task_rq         = select_task_rq_idle,
-#endif
-
-       .set_curr_task          = set_curr_task_idle,
-       .task_tick              = task_tick_idle,
-
-       .get_rr_interval        = get_rr_interval_idle,
-
-       .prio_changed           = prio_changed_idle,
-       .switched_to            = switched_to_idle,
-};
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
deleted file mode 100644 (file)
index 583a136..0000000
+++ /dev/null
@@ -1,1848 +0,0 @@
-/*
- * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
- * policies)
- */
-
-#ifdef CONFIG_RT_GROUP_SCHED
-
-#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
-
-static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
-{
-#ifdef CONFIG_SCHED_DEBUG
-       WARN_ON_ONCE(!rt_entity_is_task(rt_se));
-#endif
-       return container_of(rt_se, struct task_struct, rt);
-}
-
-static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
-{
-       return rt_rq->rq;
-}
-
-static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
-{
-       return rt_se->rt_rq;
-}
-
-#else /* CONFIG_RT_GROUP_SCHED */
-
-#define rt_entity_is_task(rt_se) (1)
-
-static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
-{
-       return container_of(rt_se, struct task_struct, rt);
-}
-
-static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
-{
-       return container_of(rt_rq, struct rq, rt);
-}
-
-static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
-{
-       struct task_struct *p = rt_task_of(rt_se);
-       struct rq *rq = task_rq(p);
-
-       return &rq->rt;
-}
-
-#endif /* CONFIG_RT_GROUP_SCHED */
-
-#ifdef CONFIG_SMP
-
-static inline int rt_overloaded(struct rq *rq)
-{
-       return atomic_read(&rq->rd->rto_count);
-}
-
-static inline void rt_set_overload(struct rq *rq)
-{
-       if (!rq->online)
-               return;
-
-       cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
-       /*
-        * Make sure the mask is visible before we set
-        * the overload count. That is checked to determine
-        * if we should look at the mask. It would be a shame
-        * if we looked at the mask, but the mask was not
-        * updated yet.
-        */
-       wmb();
-       atomic_inc(&rq->rd->rto_count);
-}
-
-static inline void rt_clear_overload(struct rq *rq)
-{
-       if (!rq->online)
-               return;
-
-       /* the order here really doesn't matter */
-       atomic_dec(&rq->rd->rto_count);
-       cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
-}
-
-static void update_rt_migration(struct rt_rq *rt_rq)
-{
-       if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
-               if (!rt_rq->overloaded) {
-                       rt_set_overload(rq_of_rt_rq(rt_rq));
-                       rt_rq->overloaded = 1;
-               }
-       } else if (rt_rq->overloaded) {
-               rt_clear_overload(rq_of_rt_rq(rt_rq));
-               rt_rq->overloaded = 0;
-       }
-}
-
-static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
-       if (!rt_entity_is_task(rt_se))
-               return;
-
-       rt_rq = &rq_of_rt_rq(rt_rq)->rt;
-
-       rt_rq->rt_nr_total++;
-       if (rt_se->nr_cpus_allowed > 1)
-               rt_rq->rt_nr_migratory++;
-
-       update_rt_migration(rt_rq);
-}
-
-static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
-       if (!rt_entity_is_task(rt_se))
-               return;
-
-       rt_rq = &rq_of_rt_rq(rt_rq)->rt;
-
-       rt_rq->rt_nr_total--;
-       if (rt_se->nr_cpus_allowed > 1)
-               rt_rq->rt_nr_migratory--;
-
-       update_rt_migration(rt_rq);
-}
-
-static inline int has_pushable_tasks(struct rq *rq)
-{
-       return !plist_head_empty(&rq->rt.pushable_tasks);
-}
-
-static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
-{
-       plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
-       plist_node_init(&p->pushable_tasks, p->prio);
-       plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
-
-       /* Update the highest prio pushable task */
-       if (p->prio < rq->rt.highest_prio.next)
-               rq->rt.highest_prio.next = p->prio;
-}
-
-static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
-{
-       plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
-
-       /* Update the new highest prio pushable task */
-       if (has_pushable_tasks(rq)) {
-               p = plist_first_entry(&rq->rt.pushable_tasks,
-                                     struct task_struct, pushable_tasks);
-               rq->rt.highest_prio.next = p->prio;
-       } else
-               rq->rt.highest_prio.next = MAX_RT_PRIO;
-}
-
-#else
-
-static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
-{
-}
-
-static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
-{
-}
-
-static inline
-void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
-}
-
-static inline
-void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
-}
-
-#endif /* CONFIG_SMP */
-
-static inline int on_rt_rq(struct sched_rt_entity *rt_se)
-{
-       return !list_empty(&rt_se->run_list);
-}
-
-#ifdef CONFIG_RT_GROUP_SCHED
-
-static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
-{
-       if (!rt_rq->tg)
-               return RUNTIME_INF;
-
-       return rt_rq->rt_runtime;
-}
-
-static inline u64 sched_rt_period(struct rt_rq *rt_rq)
-{
-       return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
-}
-
-typedef struct task_group *rt_rq_iter_t;
-
-static inline struct task_group *next_task_group(struct task_group *tg)
-{
-       do {
-               tg = list_entry_rcu(tg->list.next,
-                       typeof(struct task_group), list);
-       } while (&tg->list != &task_groups && task_group_is_autogroup(tg));
-
-       if (&tg->list == &task_groups)
-               tg = NULL;
-
-       return tg;
-}
-
-#define for_each_rt_rq(rt_rq, iter, rq)                                        \
-       for (iter = container_of(&task_groups, typeof(*iter), list);    \
-               (iter = next_task_group(iter)) &&                       \
-               (rt_rq = iter->rt_rq[cpu_of(rq)]);)
-
-static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
-{
-       list_add_rcu(&rt_rq->leaf_rt_rq_list,
-                       &rq_of_rt_rq(rt_rq)->leaf_rt_rq_list);
-}
-
-static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
-{
-       list_del_rcu(&rt_rq->leaf_rt_rq_list);
-}
-
-#define for_each_leaf_rt_rq(rt_rq, rq) \
-       list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
-
-#define for_each_sched_rt_entity(rt_se) \
-       for (; rt_se; rt_se = rt_se->parent)
-
-static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
-{
-       return rt_se->my_q;
-}
-
-static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head);
-static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
-
-static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
-{
-       struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
-       struct sched_rt_entity *rt_se;
-
-       int cpu = cpu_of(rq_of_rt_rq(rt_rq));
-
-       rt_se = rt_rq->tg->rt_se[cpu];
-
-       if (rt_rq->rt_nr_running) {
-               if (rt_se && !on_rt_rq(rt_se))
-                       enqueue_rt_entity(rt_se, false);
-               if (rt_rq->highest_prio.curr < curr->prio)
-                       resched_task(curr);
-       }
-}
-
-static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
-{
-       struct sched_rt_entity *rt_se;
-       int cpu = cpu_of(rq_of_rt_rq(rt_rq));
-
-       rt_se = rt_rq->tg->rt_se[cpu];
-
-       if (rt_se && on_rt_rq(rt_se))
-               dequeue_rt_entity(rt_se);
-}
-
-static inline int rt_rq_throttled(struct rt_rq *rt_rq)
-{
-       return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
-}
-
-static int rt_se_boosted(struct sched_rt_entity *rt_se)
-{
-       struct rt_rq *rt_rq = group_rt_rq(rt_se);
-       struct task_struct *p;
-
-       if (rt_rq)
-               return !!rt_rq->rt_nr_boosted;
-
-       p = rt_task_of(rt_se);
-       return p->prio != p->normal_prio;
-}
-
-#ifdef CONFIG_SMP
-static inline const struct cpumask *sched_rt_period_mask(void)
-{
-       return cpu_rq(smp_processor_id())->rd->span;
-}
-#else
-static inline const struct cpumask *sched_rt_period_mask(void)
-{
-       return cpu_online_mask;
-}
-#endif
-
-static inline
-struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
-{
-       return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
-}
-
-static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
-{
-       return &rt_rq->tg->rt_bandwidth;
-}
-
-#else /* !CONFIG_RT_GROUP_SCHED */
-
-static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
-{
-       return rt_rq->rt_runtime;
-}
-
-static inline u64 sched_rt_period(struct rt_rq *rt_rq)
-{
-       return ktime_to_ns(def_rt_bandwidth.rt_period);
-}
-
-typedef struct rt_rq *rt_rq_iter_t;
-
-#define for_each_rt_rq(rt_rq, iter, rq) \
-       for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
-
-static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
-{
-}
-
-static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
-{
-}
-
-#define for_each_leaf_rt_rq(rt_rq, rq) \
-       for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
-
-#define for_each_sched_rt_entity(rt_se) \
-       for (; rt_se; rt_se = NULL)
-
-static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
-{
-       return NULL;
-}
-
-static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
-{
-       if (rt_rq->rt_nr_running)
-               resched_task(rq_of_rt_rq(rt_rq)->curr);
-}
-
-static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
-{
-}
-
-static inline int rt_rq_throttled(struct rt_rq *rt_rq)
-{
-       return rt_rq->rt_throttled;
-}
-
-static inline const struct cpumask *sched_rt_period_mask(void)
-{
-       return cpu_online_mask;
-}
-
-static inline
-struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
-{
-       return &cpu_rq(cpu)->rt;
-}
-
-static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
-{
-       return &def_rt_bandwidth;
-}
-
-#endif /* CONFIG_RT_GROUP_SCHED */
-
-#ifdef CONFIG_SMP
-/*
- * We ran out of runtime; see if we can borrow some from our neighbours.
- */
-static int do_balance_runtime(struct rt_rq *rt_rq)
-{
-       struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
-       struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
-       int i, weight, more = 0;
-       u64 rt_period;
-
-       weight = cpumask_weight(rd->span);
-
-       raw_spin_lock(&rt_b->rt_runtime_lock);
-       rt_period = ktime_to_ns(rt_b->rt_period);
-       for_each_cpu(i, rd->span) {
-               struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
-               s64 diff;
-
-               if (iter == rt_rq)
-                       continue;
-
-               raw_spin_lock(&iter->rt_runtime_lock);
-               /*
-                * Either all rqs have inf runtime and there's nothing to steal
-                * or __disable_runtime() below sets a specific rq to inf to
-                * indicate it's been disabled and disallow stealing.
-                */
-               if (iter->rt_runtime == RUNTIME_INF)
-                       goto next;
-
-               /*
-                * From runqueues with spare time, take 1/n part of their
-                * spare time, but no more than our period.
-                */
-               diff = iter->rt_runtime - iter->rt_time;
-               if (diff > 0) {
-                       diff = div_u64((u64)diff, weight);
-                       if (rt_rq->rt_runtime + diff > rt_period)
-                               diff = rt_period - rt_rq->rt_runtime;
-                       iter->rt_runtime -= diff;
-                       rt_rq->rt_runtime += diff;
-                       more = 1;
-                       if (rt_rq->rt_runtime == rt_period) {
-                               raw_spin_unlock(&iter->rt_runtime_lock);
-                               break;
-                       }
-               }
-next:
-               raw_spin_unlock(&iter->rt_runtime_lock);
-       }
-       raw_spin_unlock(&rt_b->rt_runtime_lock);
-
-       return more;
-}
-
-/*
- * Ensure this RQ takes back all the runtime it lent to its neighbours.
- */
-static void __disable_runtime(struct rq *rq)
-{
-       struct root_domain *rd = rq->rd;
-       rt_rq_iter_t iter;
-       struct rt_rq *rt_rq;
-
-       if (unlikely(!scheduler_running))
-               return;
-
-       for_each_rt_rq(rt_rq, iter, rq) {
-               struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
-               s64 want;
-               int i;
-
-               raw_spin_lock(&rt_b->rt_runtime_lock);
-               raw_spin_lock(&rt_rq->rt_runtime_lock);
-               /*
-                * Either we're all inf and nobody needs to borrow, or we're
-                * already disabled and thus have nothing to do, or we have
-                * exactly the right amount of runtime to take out.
-                */
-               if (rt_rq->rt_runtime == RUNTIME_INF ||
-                               rt_rq->rt_runtime == rt_b->rt_runtime)
-                       goto balanced;
-               raw_spin_unlock(&rt_rq->rt_runtime_lock);
-
-               /*
-                * Calculate the difference between what we started out with
-                * and what we currently have; that's the amount of runtime
-                * we lent out and now have to reclaim.
-                */
-               want = rt_b->rt_runtime - rt_rq->rt_runtime;
-
-               /*
-                * Greedy reclaim, take back as much as we can.
-                */
-               for_each_cpu(i, rd->span) {
-                       struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
-                       s64 diff;
-
-                       /*
-                        * Can't reclaim from ourselves or disabled runqueues.
-                        */
-                       if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
-                               continue;
-
-                       raw_spin_lock(&iter->rt_runtime_lock);
-                       if (want > 0) {
-                               diff = min_t(s64, iter->rt_runtime, want);
-                               iter->rt_runtime -= diff;
-                               want -= diff;
-                       } else {
-                               iter->rt_runtime -= want;
-                               want -= want;
-                       }
-                       raw_spin_unlock(&iter->rt_runtime_lock);
-
-                       if (!want)
-                               break;
-               }
-
-               raw_spin_lock(&rt_rq->rt_runtime_lock);
-               /*
-                * We cannot be left wanting - that would mean some runtime
-                * leaked out of the system.
-                */
-               BUG_ON(want);
-balanced:
-               /*
-                * Disable all the borrow logic by pretending we have inf
-                * runtime - in which case borrowing doesn't make sense.
-                */
-               rt_rq->rt_runtime = RUNTIME_INF;
-               raw_spin_unlock(&rt_rq->rt_runtime_lock);
-               raw_spin_unlock(&rt_b->rt_runtime_lock);
-       }
-}
-
-static void disable_runtime(struct rq *rq)
-{
-       unsigned long flags;
-
-       raw_spin_lock_irqsave(&rq->lock, flags);
-       __disable_runtime(rq);
-       raw_spin_unlock_irqrestore(&rq->lock, flags);
-}
-
-static void __enable_runtime(struct rq *rq)
-{
-       rt_rq_iter_t iter;
-       struct rt_rq *rt_rq;
-
-       if (unlikely(!scheduler_running))
-               return;
-
-       /*
-        * Reset each runqueue's bandwidth settings
-        */
-       for_each_rt_rq(rt_rq, iter, rq) {
-               struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
-
-               raw_spin_lock(&rt_b->rt_runtime_lock);
-               raw_spin_lock(&rt_rq->rt_runtime_lock);
-               rt_rq->rt_runtime = rt_b->rt_runtime;
-               rt_rq->rt_time = 0;
-               rt_rq->rt_throttled = 0;
-               raw_spin_unlock(&rt_rq->rt_runtime_lock);
-               raw_spin_unlock(&rt_b->rt_runtime_lock);
-       }
-}
-
-static void enable_runtime(struct rq *rq)
-{
-       unsigned long flags;
-
-       raw_spin_lock_irqsave(&rq->lock, flags);
-       __enable_runtime(rq);
-       raw_spin_unlock_irqrestore(&rq->lock, flags);
-}
-
-static int balance_runtime(struct rt_rq *rt_rq)
-{
-       int more = 0;
-
-       if (!sched_feat(RT_RUNTIME_SHARE))
-               return more;
-
-       if (rt_rq->rt_time > rt_rq->rt_runtime) {
-               raw_spin_unlock(&rt_rq->rt_runtime_lock);
-               more = do_balance_runtime(rt_rq);
-               raw_spin_lock(&rt_rq->rt_runtime_lock);
-       }
-
-       return more;
-}
-#else /* !CONFIG_SMP */
-static inline int balance_runtime(struct rt_rq *rt_rq)
-{
-       return 0;
-}
-#endif /* CONFIG_SMP */
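The 1/n borrowing rule in do_balance_runtime() above can be sketched in plain
user-space C.  This is only an illustration: struct fake_rt_rq, the constants
and the borrow() helper are invented stand-ins for the kernel's per-rq
bandwidth fields, not kernel API.

    #include <stdio.h>
    #include <stdint.h>

    /* Hypothetical stand-in for the per-rq bandwidth fields used above. */
    struct fake_rt_rq {
        int64_t rt_runtime;     /* runtime allotted per period, in ns */
        int64_t rt_time;        /* runtime consumed so far, in ns     */
    };

    /*
     * Borrow 1/weight of a neighbour's spare runtime, capped so that the
     * borrower never holds more than one full period; this mirrors the
     * diff > 0 branch of do_balance_runtime().
     */
    static int64_t borrow(struct fake_rt_rq *from, struct fake_rt_rq *to,
                          int weight, int64_t rt_period)
    {
        int64_t spare = from->rt_runtime - from->rt_time;

        if (spare <= 0)
            return 0;

        spare /= weight;
        if (to->rt_runtime + spare > rt_period)
            spare = rt_period - to->rt_runtime;

        from->rt_runtime -= spare;
        to->rt_runtime += spare;
        return spare;
    }

    int main(void)
    {
        struct fake_rt_rq busy = { .rt_runtime = 950000000, .rt_time = 950000000 };
        struct fake_rt_rq idle = { .rt_runtime = 950000000, .rt_time = 100000000 };

        /* 4 CPUs in the root domain, 1 s period (0.95 s / 1 s defaults). */
        printf("borrowed %lld ns\n",
               (long long)borrow(&idle, &busy, 4, 1000000000LL));
        return 0;
    }

With these numbers the starved runqueue gains only 50 ms, because the cap keeps
its rt_runtime at or below the period.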
-
-static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
-{
-       int i, idle = 1;
-       const struct cpumask *span;
-
-       if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
-               return 1;
-
-       span = sched_rt_period_mask();
-       for_each_cpu(i, span) {
-               int enqueue = 0;
-               struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
-               struct rq *rq = rq_of_rt_rq(rt_rq);
-
-               raw_spin_lock(&rq->lock);
-               if (rt_rq->rt_time) {
-                       u64 runtime;
-
-                       raw_spin_lock(&rt_rq->rt_runtime_lock);
-                       if (rt_rq->rt_throttled)
-                               balance_runtime(rt_rq);
-                       runtime = rt_rq->rt_runtime;
-                       rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
-                       if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
-                               rt_rq->rt_throttled = 0;
-                               enqueue = 1;
-
-                               /*
-                                * Force a clock update if the CPU was idle,
-                                * lest wakeup -> unthrottle time accumulate.
-                                */
-                               if (rt_rq->rt_nr_running && rq->curr == rq->idle)
-                                       rq->skip_clock_update = -1;
-                       }
-                       if (rt_rq->rt_time || rt_rq->rt_nr_running)
-                               idle = 0;
-                       raw_spin_unlock(&rt_rq->rt_runtime_lock);
-               } else if (rt_rq->rt_nr_running) {
-                       idle = 0;
-                       if (!rt_rq_throttled(rt_rq))
-                               enqueue = 1;
-               }
-
-               if (enqueue)
-                       sched_rt_rq_enqueue(rt_rq);
-               raw_spin_unlock(&rq->lock);
-       }
-
-       return idle;
-}
-
-static inline int rt_se_prio(struct sched_rt_entity *rt_se)
-{
-#ifdef CONFIG_RT_GROUP_SCHED
-       struct rt_rq *rt_rq = group_rt_rq(rt_se);
-
-       if (rt_rq)
-               return rt_rq->highest_prio.curr;
-#endif
-
-       return rt_task_of(rt_se)->prio;
-}
-
-static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
-{
-       u64 runtime = sched_rt_runtime(rt_rq);
-
-       if (rt_rq->rt_throttled)
-               return rt_rq_throttled(rt_rq);
-
-       if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq))
-               return 0;
-
-       balance_runtime(rt_rq);
-       runtime = sched_rt_runtime(rt_rq);
-       if (runtime == RUNTIME_INF)
-               return 0;
-
-       if (rt_rq->rt_time > runtime) {
-               rt_rq->rt_throttled = 1;
-               printk_once(KERN_WARNING "sched: RT throttling activated\n");
-               if (rt_rq_throttled(rt_rq)) {
-                       sched_rt_rq_dequeue(rt_rq);
-                       return 1;
-               }
-       }
-
-       return 0;
-}
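For a sense of scale, with the usual sysctl defaults the check above throttles
a runqueue once 950 ms of RT execution accumulate inside a 1 s period.  A tiny
sketch; the values below are the common kernel.sched_rt_period_us /
kernel.sched_rt_runtime_us defaults, assumed here rather than read from this
file.

    #include <stdio.h>

    int main(void)
    {
        /* Usual defaults of /proc/sys/kernel/sched_rt_{period,runtime}_us. */
        const unsigned long long period_us  = 1000000;
        const unsigned long long runtime_us =  950000;

        /* RT time already consumed in the current period. */
        unsigned long long rt_time_us = 960000;

        if (runtime_us < period_us && rt_time_us > runtime_us)
            printf("throttled; %llu us of the period remain for non-RT work\n",
                   period_us - rt_time_us);
        return 0;
    }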
-
-/*
- * Update the current task's runtime statistics. Skip current tasks that
- * are not in our scheduling class.
- */
-static void update_curr_rt(struct rq *rq)
-{
-       struct task_struct *curr = rq->curr;
-       struct sched_rt_entity *rt_se = &curr->rt;
-       struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
-       u64 delta_exec;
-
-       if (curr->sched_class != &rt_sched_class)
-               return;
-
-       delta_exec = rq->clock_task - curr->se.exec_start;
-       if (unlikely((s64)delta_exec < 0))
-               delta_exec = 0;
-
-       schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec));
-
-       curr->se.sum_exec_runtime += delta_exec;
-       account_group_exec_runtime(curr, delta_exec);
-
-       curr->se.exec_start = rq->clock_task;
-       cpuacct_charge(curr, delta_exec);
-
-       sched_rt_avg_update(rq, delta_exec);
-
-       if (!rt_bandwidth_enabled())
-               return;
-
-       for_each_sched_rt_entity(rt_se) {
-               rt_rq = rt_rq_of_se(rt_se);
-
-               if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
-                       raw_spin_lock(&rt_rq->rt_runtime_lock);
-                       rt_rq->rt_time += delta_exec;
-                       if (sched_rt_runtime_exceeded(rt_rq))
-                               resched_task(curr);
-                       raw_spin_unlock(&rt_rq->rt_runtime_lock);
-               }
-       }
-}
-
-#if defined CONFIG_SMP
-
-static void
-inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
-{
-       struct rq *rq = rq_of_rt_rq(rt_rq);
-
-       if (rq->online && prio < prev_prio)
-               cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
-}
-
-static void
-dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
-{
-       struct rq *rq = rq_of_rt_rq(rt_rq);
-
-       if (rq->online && rt_rq->highest_prio.curr != prev_prio)
-               cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
-}
-
-#else /* CONFIG_SMP */
-
-static inline
-void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
-static inline
-void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
-
-#endif /* CONFIG_SMP */
-
-#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-static void
-inc_rt_prio(struct rt_rq *rt_rq, int prio)
-{
-       int prev_prio = rt_rq->highest_prio.curr;
-
-       if (prio < prev_prio)
-               rt_rq->highest_prio.curr = prio;
-
-       inc_rt_prio_smp(rt_rq, prio, prev_prio);
-}
-
-static void
-dec_rt_prio(struct rt_rq *rt_rq, int prio)
-{
-       int prev_prio = rt_rq->highest_prio.curr;
-
-       if (rt_rq->rt_nr_running) {
-
-               WARN_ON(prio < prev_prio);
-
-               /*
-                * This may have been our highest task, and therefore
-                * we may have some recomputation to do
-                */
-               if (prio == prev_prio) {
-                       struct rt_prio_array *array = &rt_rq->active;
-
-                       rt_rq->highest_prio.curr =
-                               sched_find_first_bit(array->bitmap);
-               }
-
-       } else
-               rt_rq->highest_prio.curr = MAX_RT_PRIO;
-
-       dec_rt_prio_smp(rt_rq, prio, prev_prio);
-}
-
-#else
-
-static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
-static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
-
-#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
-
-#ifdef CONFIG_RT_GROUP_SCHED
-
-static void
-inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
-       if (rt_se_boosted(rt_se))
-               rt_rq->rt_nr_boosted++;
-
-       if (rt_rq->tg)
-               start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
-}
-
-static void
-dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
-       if (rt_se_boosted(rt_se))
-               rt_rq->rt_nr_boosted--;
-
-       WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
-}
-
-#else /* CONFIG_RT_GROUP_SCHED */
-
-static void
-inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
-       start_rt_bandwidth(&def_rt_bandwidth);
-}
-
-static inline
-void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
-
-#endif /* CONFIG_RT_GROUP_SCHED */
-
-static inline
-void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
-       int prio = rt_se_prio(rt_se);
-
-       WARN_ON(!rt_prio(prio));
-       rt_rq->rt_nr_running++;
-
-       inc_rt_prio(rt_rq, prio);
-       inc_rt_migration(rt_se, rt_rq);
-       inc_rt_group(rt_se, rt_rq);
-}
-
-static inline
-void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
-       WARN_ON(!rt_prio(rt_se_prio(rt_se)));
-       WARN_ON(!rt_rq->rt_nr_running);
-       rt_rq->rt_nr_running--;
-
-       dec_rt_prio(rt_rq, rt_se_prio(rt_se));
-       dec_rt_migration(rt_se, rt_rq);
-       dec_rt_group(rt_se, rt_rq);
-}
-
-static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
-{
-       struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
-       struct rt_prio_array *array = &rt_rq->active;
-       struct rt_rq *group_rq = group_rt_rq(rt_se);
-       struct list_head *queue = array->queue + rt_se_prio(rt_se);
-
-       /*
-        * Don't enqueue the group if it's throttled, or when it is empty.
-        * The latter is a consequence of the former when a child group
-        * gets throttled and the current group doesn't have any other
-        * active members.
-        */
-       if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
-               return;
-
-       if (!rt_rq->rt_nr_running)
-               list_add_leaf_rt_rq(rt_rq);
-
-       if (head)
-               list_add(&rt_se->run_list, queue);
-       else
-               list_add_tail(&rt_se->run_list, queue);
-       __set_bit(rt_se_prio(rt_se), array->bitmap);
-
-       inc_rt_tasks(rt_se, rt_rq);
-}
-
-static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
-{
-       struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
-       struct rt_prio_array *array = &rt_rq->active;
-
-       list_del_init(&rt_se->run_list);
-       if (list_empty(array->queue + rt_se_prio(rt_se)))
-               __clear_bit(rt_se_prio(rt_se), array->bitmap);
-
-       dec_rt_tasks(rt_se, rt_rq);
-       if (!rt_rq->rt_nr_running)
-               list_del_leaf_rt_rq(rt_rq);
-}
-
-/*
- * Because the prio of an upper entry depends on the lower
- * entries, we must remove entries top - down.
- */
-static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
-{
-       struct sched_rt_entity *back = NULL;
-
-       for_each_sched_rt_entity(rt_se) {
-               rt_se->back = back;
-               back = rt_se;
-       }
-
-       for (rt_se = back; rt_se; rt_se = rt_se->back) {
-               if (on_rt_rq(rt_se))
-                       __dequeue_rt_entity(rt_se);
-       }
-}
-
-static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
-{
-       dequeue_rt_stack(rt_se);
-       for_each_sched_rt_entity(rt_se)
-               __enqueue_rt_entity(rt_se, head);
-}
-
-static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
-{
-       dequeue_rt_stack(rt_se);
-
-       for_each_sched_rt_entity(rt_se) {
-               struct rt_rq *rt_rq = group_rt_rq(rt_se);
-
-               if (rt_rq && rt_rq->rt_nr_running)
-                       __enqueue_rt_entity(rt_se, false);
-       }
-}
-
-/*
- * Adding/removing a task to/from a priority array:
- */
-static void
-enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
-{
-       struct sched_rt_entity *rt_se = &p->rt;
-
-       if (flags & ENQUEUE_WAKEUP)
-               rt_se->timeout = 0;
-
-       enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
-
-       if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
-               enqueue_pushable_task(rq, p);
-
-       inc_nr_running(rq);
-}
-
-static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
-{
-       struct sched_rt_entity *rt_se = &p->rt;
-
-       update_curr_rt(rq);
-       dequeue_rt_entity(rt_se);
-
-       dequeue_pushable_task(rq, p);
-
-       dec_nr_running(rq);
-}
-
-/*
- * Put task to the end of the run list without the overhead of dequeue
- * followed by enqueue.
- */
-static void
-requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
-{
-       if (on_rt_rq(rt_se)) {
-               struct rt_prio_array *array = &rt_rq->active;
-               struct list_head *queue = array->queue + rt_se_prio(rt_se);
-
-               if (head)
-                       list_move(&rt_se->run_list, queue);
-               else
-                       list_move_tail(&rt_se->run_list, queue);
-       }
-}
-
-static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
-{
-       struct sched_rt_entity *rt_se = &p->rt;
-       struct rt_rq *rt_rq;
-
-       for_each_sched_rt_entity(rt_se) {
-               rt_rq = rt_rq_of_se(rt_se);
-               requeue_rt_entity(rt_rq, rt_se, head);
-       }
-}
-
-static void yield_task_rt(struct rq *rq)
-{
-       requeue_task_rt(rq, rq->curr, 0);
-}
-
-#ifdef CONFIG_SMP
-static int find_lowest_rq(struct task_struct *task);
-
-static int
-select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
-{
-       struct task_struct *curr;
-       struct rq *rq;
-       int cpu;
-
-       cpu = task_cpu(p);
-
-       /* For anything but wake ups, just return the task_cpu */
-       if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
-               goto out;
-
-       rq = cpu_rq(cpu);
-
-       rcu_read_lock();
-       curr = ACCESS_ONCE(rq->curr); /* unlocked access */
-
-       /*
-        * If the current task on @p's runqueue is an RT task, then
-        * try to see if we can wake this RT task up on another
-        * runqueue. Otherwise simply start this RT task
-        * on its current runqueue.
-        *
-        * We want to avoid overloading runqueues. If the woken
-        * task is of higher priority, then it will stay on this CPU
-        * and the lower prio task should be moved to another CPU.
-        * Even though this will probably make the lower prio task
-        * lose its cache, we do not want to bounce a higher priority
-        * task around just because it gave up its CPU, perhaps for a
-        * lock?
-        *
-        * For equal prio tasks, we just let the scheduler sort it out.
-        *
-        * Otherwise, just let it ride on the affined RQ and the
-        * post-schedule router will push the preempted task away
-        *
-        * This test is optimistic, if we get it wrong the load-balancer
-        * will have to sort it out.
-        */
-       if (curr && unlikely(rt_task(curr)) &&
-           (curr->rt.nr_cpus_allowed < 2 ||
-            curr->prio <= p->prio) &&
-           (p->rt.nr_cpus_allowed > 1)) {
-               int target = find_lowest_rq(p);
-
-               if (target != -1)
-                       cpu = target;
-       }
-       rcu_read_unlock();
-
-out:
-       return cpu;
-}
-
-static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
-{
-       if (rq->curr->rt.nr_cpus_allowed == 1)
-               return;
-
-       if (p->rt.nr_cpus_allowed != 1
-           && cpupri_find(&rq->rd->cpupri, p, NULL))
-               return;
-
-       if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
-               return;
-
-       /*
-        * There appear to be other CPUs that can accept
-        * current and none to run 'p', so let's reschedule
-        * to try to push current away:
-        */
-       requeue_task_rt(rq, p, 1);
-       resched_task(rq->curr);
-}
-
-#endif /* CONFIG_SMP */
-
-/*
- * Preempt the current task with a newly woken task if needed:
- */
-static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
-{
-       if (p->prio < rq->curr->prio) {
-               resched_task(rq->curr);
-               return;
-       }
-
-#ifdef CONFIG_SMP
-       /*
-        * If:
-        *
-        * - the newly woken task is of equal priority to the current task
-        * - the newly woken task is non-migratable while current is migratable
-        * - current will be preempted on the next reschedule
-        *
-        * we should check to see if current can readily move to a different
-        * cpu.  If so, we will reschedule to allow the push logic to try
-        * to move current somewhere else, making room for our non-migratable
-        * task.
-        */
-       if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
-               check_preempt_equal_prio(rq, p);
-#endif
-}
-
-static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
-                                                  struct rt_rq *rt_rq)
-{
-       struct rt_prio_array *array = &rt_rq->active;
-       struct sched_rt_entity *next = NULL;
-       struct list_head *queue;
-       int idx;
-
-       idx = sched_find_first_bit(array->bitmap);
-       BUG_ON(idx >= MAX_RT_PRIO);
-
-       queue = array->queue + idx;
-       next = list_entry(queue->next, struct sched_rt_entity, run_list);
-
-       return next;
-}
-
-static struct task_struct *_pick_next_task_rt(struct rq *rq)
-{
-       struct sched_rt_entity *rt_se;
-       struct task_struct *p;
-       struct rt_rq *rt_rq;
-
-       rt_rq = &rq->rt;
-
-       if (!rt_rq->rt_nr_running)
-               return NULL;
-
-       if (rt_rq_throttled(rt_rq))
-               return NULL;
-
-       do {
-               rt_se = pick_next_rt_entity(rq, rt_rq);
-               BUG_ON(!rt_se);
-               rt_rq = group_rt_rq(rt_se);
-       } while (rt_rq);
-
-       p = rt_task_of(rt_se);
-       p->se.exec_start = rq->clock_task;
-
-       return p;
-}
-
-static struct task_struct *pick_next_task_rt(struct rq *rq)
-{
-       struct task_struct *p = _pick_next_task_rt(rq);
-
-       /* The running task is never eligible for pushing */
-       if (p)
-               dequeue_pushable_task(rq, p);
-
-#ifdef CONFIG_SMP
-       /*
-        * We detect this state here so that we can avoid taking the RQ
-        * lock again later if there is no need to push
-        */
-       rq->post_schedule = has_pushable_tasks(rq);
-#endif
-
-       return p;
-}
-
-static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
-{
-       update_curr_rt(rq);
-
-       /*
-        * The previous task needs to be made eligible for pushing
-        * if it is still active
-        */
-       if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1)
-               enqueue_pushable_task(rq, p);
-}
-
-#ifdef CONFIG_SMP
-
-/* Only try algorithms three times */
-#define RT_MAX_TRIES 3
-
-static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
-
-static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
-{
-       if (!task_running(rq, p) &&
-           (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) &&
-           (p->rt.nr_cpus_allowed > 1))
-               return 1;
-       return 0;
-}
-
-/* Return the second highest RT task, NULL otherwise */
-static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
-{
-       struct task_struct *next = NULL;
-       struct sched_rt_entity *rt_se;
-       struct rt_prio_array *array;
-       struct rt_rq *rt_rq;
-       int idx;
-
-       for_each_leaf_rt_rq(rt_rq, rq) {
-               array = &rt_rq->active;
-               idx = sched_find_first_bit(array->bitmap);
-next_idx:
-               if (idx >= MAX_RT_PRIO)
-                       continue;
-               if (next && next->prio < idx)
-                       continue;
-               list_for_each_entry(rt_se, array->queue + idx, run_list) {
-                       struct task_struct *p;
-
-                       if (!rt_entity_is_task(rt_se))
-                               continue;
-
-                       p = rt_task_of(rt_se);
-                       if (pick_rt_task(rq, p, cpu)) {
-                               next = p;
-                               break;
-                       }
-               }
-               if (!next) {
-                       idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
-                       goto next_idx;
-               }
-       }
-
-       return next;
-}
-
-static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
-
-static int find_lowest_rq(struct task_struct *task)
-{
-       struct sched_domain *sd;
-       struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
-       int this_cpu = smp_processor_id();
-       int cpu      = task_cpu(task);
-
-       /* Make sure the mask is initialized first */
-       if (unlikely(!lowest_mask))
-               return -1;
-
-       if (task->rt.nr_cpus_allowed == 1)
-               return -1; /* No other targets possible */
-
-       if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
-               return -1; /* No targets found */
-
-       /*
-        * At this point we have built a mask of cpus representing the
-        * lowest priority tasks in the system.  Now we want to elect
-        * the best one based on our affinity and topology.
-        *
-        * We prioritize the last cpu that the task executed on since
-        * it is most likely cache-hot in that location.
-        */
-       if (cpumask_test_cpu(cpu, lowest_mask))
-               return cpu;
-
-       /*
-        * Otherwise, we consult the sched_domains span maps to figure
-        * out which cpu is logically closest to our hot cache data.
-        */
-       if (!cpumask_test_cpu(this_cpu, lowest_mask))
-               this_cpu = -1; /* Skip this_cpu opt if not among lowest */
-
-       rcu_read_lock();
-       for_each_domain(cpu, sd) {
-               if (sd->flags & SD_WAKE_AFFINE) {
-                       int best_cpu;
-
-                       /*
-                        * "this_cpu" is cheaper to preempt than a
-                        * remote processor.
-                        */
-                       if (this_cpu != -1 &&
-                           cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
-                               rcu_read_unlock();
-                               return this_cpu;
-                       }
-
-                       best_cpu = cpumask_first_and(lowest_mask,
-                                                    sched_domain_span(sd));
-                       if (best_cpu < nr_cpu_ids) {
-                               rcu_read_unlock();
-                               return best_cpu;
-                       }
-               }
-       }
-       rcu_read_unlock();
-
-       /*
-        * And finally, if there were no matches within the domains
-        * just give the caller *something* to work with from the compatible
-        * locations.
-        */
-       if (this_cpu != -1)
-               return this_cpu;
-
-       cpu = cpumask_any(lowest_mask);
-       if (cpu < nr_cpu_ids)
-               return cpu;
-       return -1;
-}
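The preference order implemented above (the task's last CPU first, then this
CPU, then the topologically closest candidate, then anything in the mask) can
be condensed into a user-space sketch.  A plain unsigned long stands in for
lowest_mask and there is no sched_domain walk, so treat it as an approximation
only.

    #include <stdio.h>

    /* Pick a CPU from 'mask' (bit i set == CPU i is a valid target). */
    static int pick_cpu(unsigned long mask, int task_cpu, int this_cpu)
    {
        if (mask & (1UL << task_cpu))   /* likely cache-hot: last CPU    */
            return task_cpu;
        if (mask & (1UL << this_cpu))   /* cheapest to preempt: this CPU */
            return this_cpu;
        for (int cpu = 0; cpu < (int)(8 * sizeof(mask)); cpu++)
            if (mask & (1UL << cpu))    /* otherwise: any candidate      */
                return cpu;
        return -1;                      /* no target found               */
    }

    int main(void)
    {
        /* CPUs 2 and 5 currently run the lowest-priority work. */
        unsigned long lowest_mask = (1UL << 2) | (1UL << 5);

        printf("target cpu: %d\n", pick_cpu(lowest_mask, 3, 5));
        return 0;
    }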
-
-/* Will lock the rq it finds */
-static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
-{
-       struct rq *lowest_rq = NULL;
-       int tries;
-       int cpu;
-
-       for (tries = 0; tries < RT_MAX_TRIES; tries++) {
-               cpu = find_lowest_rq(task);
-
-               if ((cpu == -1) || (cpu == rq->cpu))
-                       break;
-
-               lowest_rq = cpu_rq(cpu);
-
-               /* if the prio of this runqueue changed, try again */
-               if (double_lock_balance(rq, lowest_rq)) {
-                       /*
-                        * We had to unlock the run queue. In
-                        * the meantime, the task could have
-                        * migrated already or had its affinity changed.
-                        * Also make sure that it wasn't scheduled on its rq.
-                        */
-                       if (unlikely(task_rq(task) != rq ||
-                                    !cpumask_test_cpu(lowest_rq->cpu,
-                                                      tsk_cpus_allowed(task)) ||
-                                    task_running(rq, task) ||
-                                    !task->on_rq)) {
-
-                               raw_spin_unlock(&lowest_rq->lock);
-                               lowest_rq = NULL;
-                               break;
-                       }
-               }
-
-               /* If this rq is still suitable use it. */
-               if (lowest_rq->rt.highest_prio.curr > task->prio)
-                       break;
-
-               /* try again */
-               double_unlock_balance(rq, lowest_rq);
-               lowest_rq = NULL;
-       }
-
-       return lowest_rq;
-}
-
-static struct task_struct *pick_next_pushable_task(struct rq *rq)
-{
-       struct task_struct *p;
-
-       if (!has_pushable_tasks(rq))
-               return NULL;
-
-       p = plist_first_entry(&rq->rt.pushable_tasks,
-                             struct task_struct, pushable_tasks);
-
-       BUG_ON(rq->cpu != task_cpu(p));
-       BUG_ON(task_current(rq, p));
-       BUG_ON(p->rt.nr_cpus_allowed <= 1);
-
-       BUG_ON(!p->on_rq);
-       BUG_ON(!rt_task(p));
-
-       return p;
-}
-
-/*
- * If the current CPU has more than one RT task, see if the non-running
- * task can migrate over to a CPU that is running a task
- * of lesser priority.
- */
-static int push_rt_task(struct rq *rq)
-{
-       struct task_struct *next_task;
-       struct rq *lowest_rq;
-       int ret = 0;
-
-       if (!rq->rt.overloaded)
-               return 0;
-
-       next_task = pick_next_pushable_task(rq);
-       if (!next_task)
-               return 0;
-
-retry:
-       if (unlikely(next_task == rq->curr)) {
-               WARN_ON(1);
-               return 0;
-       }
-
-       /*
-        * It's possible that the next_task slipped in with a
-        * higher priority than current. If that's the case,
-        * just reschedule current.
-        */
-       if (unlikely(next_task->prio < rq->curr->prio)) {
-               resched_task(rq->curr);
-               return 0;
-       }
-
-       /* We might release rq lock */
-       get_task_struct(next_task);
-
-       /* find_lock_lowest_rq locks the rq if found */
-       lowest_rq = find_lock_lowest_rq(next_task, rq);
-       if (!lowest_rq) {
-               struct task_struct *task;
-               /*
-                * find_lock_lowest_rq releases rq->lock
-                * so it is possible that next_task has migrated.
-                *
-                * We need to make sure that the task is still on the same
-                * run-queue and is also still the next task eligible for
-                * pushing.
-                */
-               task = pick_next_pushable_task(rq);
-               if (task_cpu(next_task) == rq->cpu && task == next_task) {
-                       /*
-                        * The task hasn't migrated, and is still the next
-                        * eligible task, but we failed to find a run-queue
-                        * to push it to.  Do not retry in this case, since
-                        * other cpus will pull from us when ready.
-                        */
-                       goto out;
-               }
-
-               if (!task)
-                       /* No more tasks, just exit */
-                       goto out;
-
-               /*
-                * Something has shifted, try again.
-                */
-               put_task_struct(next_task);
-               next_task = task;
-               goto retry;
-       }
-
-       deactivate_task(rq, next_task, 0);
-       set_task_cpu(next_task, lowest_rq->cpu);
-       activate_task(lowest_rq, next_task, 0);
-       ret = 1;
-
-       resched_task(lowest_rq->curr);
-
-       double_unlock_balance(rq, lowest_rq);
-
-out:
-       put_task_struct(next_task);
-
-       return ret;
-}
-
-static void push_rt_tasks(struct rq *rq)
-{
-       /* push_rt_task will return true if it moved an RT */
-       while (push_rt_task(rq))
-               ;
-}
-
-static int pull_rt_task(struct rq *this_rq)
-{
-       int this_cpu = this_rq->cpu, ret = 0, cpu;
-       struct task_struct *p;
-       struct rq *src_rq;
-
-       if (likely(!rt_overloaded(this_rq)))
-               return 0;
-
-       for_each_cpu(cpu, this_rq->rd->rto_mask) {
-               if (this_cpu == cpu)
-                       continue;
-
-               src_rq = cpu_rq(cpu);
-
-               /*
-                * Don't bother taking the src_rq->lock if the next highest
-                * task is known to be lower-priority than our current task.
-                * This may look racy, but if this value is about to go
-                * logically higher, the src_rq will push this task away.
-                * And if it's going logically lower, we do not care.
-                */
-               if (src_rq->rt.highest_prio.next >=
-                   this_rq->rt.highest_prio.curr)
-                       continue;
-
-               /*
-                * We can potentially drop this_rq's lock in
-                * double_lock_balance, and another CPU could
-                * alter this_rq
-                */
-               double_lock_balance(this_rq, src_rq);
-
-               /*
-                * Are there still pullable RT tasks?
-                */
-               if (src_rq->rt.rt_nr_running <= 1)
-                       goto skip;
-
-               p = pick_next_highest_task_rt(src_rq, this_cpu);
-
-               /*
-                * Do we have an RT task that preempts
-                * the to-be-scheduled task?
-                */
-               if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
-                       WARN_ON(p == src_rq->curr);
-                       WARN_ON(!p->on_rq);
-
-                       /*
-                        * There's a chance that p is higher in priority
-                        * than what's currently running on its cpu.
-                        * This is just because p is waking up and hasn't
-                        * had a chance to schedule. We only pull
-                        * p if it is lower in priority than the
-                        * current task on the run queue
-                        */
-                       if (p->prio < src_rq->curr->prio)
-                               goto skip;
-
-                       ret = 1;
-
-                       deactivate_task(src_rq, p, 0);
-                       set_task_cpu(p, this_cpu);
-                       activate_task(this_rq, p, 0);
-                       /*
-                        * We continue with the search, just in
-                        * case there's an even higher prio task
-                        * in another runqueue. (low likelihood
-                        * but possible)
-                        */
-               }
-skip:
-               double_unlock_balance(this_rq, src_rq);
-       }
-
-       return ret;
-}
-
-static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
-{
-       /* Try to pull RT tasks here if we lower this rq's prio */
-       if (rq->rt.highest_prio.curr > prev->prio)
-               pull_rt_task(rq);
-}
-
-static void post_schedule_rt(struct rq *rq)
-{
-       push_rt_tasks(rq);
-}
-
-/*
- * If we are not running and we are not going to reschedule soon, we should
- * try to push tasks away now
- */
-static void task_woken_rt(struct rq *rq, struct task_struct *p)
-{
-       if (!task_running(rq, p) &&
-           !test_tsk_need_resched(rq->curr) &&
-           has_pushable_tasks(rq) &&
-           p->rt.nr_cpus_allowed > 1 &&
-           rt_task(rq->curr) &&
-           (rq->curr->rt.nr_cpus_allowed < 2 ||
-            rq->curr->prio <= p->prio))
-               push_rt_tasks(rq);
-}
-
-static void set_cpus_allowed_rt(struct task_struct *p,
-                               const struct cpumask *new_mask)
-{
-       int weight = cpumask_weight(new_mask);
-
-       BUG_ON(!rt_task(p));
-
-       /*
-        * Update the migration status of the RQ if we have an RT task
-        * which is running AND changing its weight value.
-        */
-       if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) {
-               struct rq *rq = task_rq(p);
-
-               if (!task_current(rq, p)) {
-                       /*
-                        * Make sure we dequeue this task from the pushable list
-                        * before going further.  It will either remain off of
-                        * the list because we are no longer pushable, or it
-                        * will be requeued.
-                        */
-                       if (p->rt.nr_cpus_allowed > 1)
-                               dequeue_pushable_task(rq, p);
-
-                       /*
-                        * Requeue if our weight is changing and still > 1
-                        */
-                       if (weight > 1)
-                               enqueue_pushable_task(rq, p);
-
-               }
-
-               if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) {
-                       rq->rt.rt_nr_migratory++;
-               } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) {
-                       BUG_ON(!rq->rt.rt_nr_migratory);
-                       rq->rt.rt_nr_migratory--;
-               }
-
-               update_rt_migration(&rq->rt);
-       }
-}
-
-/* Assumes rq->lock is held */
-static void rq_online_rt(struct rq *rq)
-{
-       if (rq->rt.overloaded)
-               rt_set_overload(rq);
-
-       __enable_runtime(rq);
-
-       cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
-}
-
-/* Assumes rq->lock is held */
-static void rq_offline_rt(struct rq *rq)
-{
-       if (rq->rt.overloaded)
-               rt_clear_overload(rq);
-
-       __disable_runtime(rq);
-
-       cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
-}
-
-/*
- * When switching away from the rt queue, we bring ourselves to a position
- * where we might want to pull RT tasks from other runqueues.
- */
-static void switched_from_rt(struct rq *rq, struct task_struct *p)
-{
-       /*
-        * If there are other RT tasks then we will reschedule
-        * and the scheduling of the other RT tasks will handle
-        * the balancing. But if we are the last RT task
-        * we may need to handle the pulling of RT tasks
-        * now.
-        */
-       if (p->on_rq && !rq->rt.rt_nr_running)
-               pull_rt_task(rq);
-}
-
-static inline void init_sched_rt_class(void)
-{
-       unsigned int i;
-
-       for_each_possible_cpu(i)
-               zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
-                                       GFP_KERNEL, cpu_to_node(i));
-}
-#endif /* CONFIG_SMP */
-
-/*
- * When switching a task to RT, we may overload the runqueue
- * with RT tasks. In this case we try to push them off to
- * other runqueues.
- */
-static void switched_to_rt(struct rq *rq, struct task_struct *p)
-{
-       int check_resched = 1;
-
-       /*
-        * If we are already running, then there's nothing
-        * that needs to be done. But if we are not running
-        * we may need to preempt the current running task.
-        * If that current running task is also an RT task
-        * then see if we can move to another run queue.
-        */
-       if (p->on_rq && rq->curr != p) {
-#ifdef CONFIG_SMP
-               if (rq->rt.overloaded && push_rt_task(rq) &&
-                   /* Don't resched if we changed runqueues */
-                   rq != task_rq(p))
-                       check_resched = 0;
-#endif /* CONFIG_SMP */
-               if (check_resched && p->prio < rq->curr->prio)
-                       resched_task(rq->curr);
-       }
-}
-
-/*
- * Priority of the task has changed. This may cause
- * us to initiate a push or pull.
- */
-static void
-prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
-{
-       if (!p->on_rq)
-               return;
-
-       if (rq->curr == p) {
-#ifdef CONFIG_SMP
-               /*
-                * If our priority decreases while running, we
-                * may need to pull tasks to this runqueue.
-                */
-               if (oldprio < p->prio)
-                       pull_rt_task(rq);
-               /*
-                * If there's a higher priority task waiting to run
-                * then reschedule. Note, the above pull_rt_task
-                * can release the rq lock and p could migrate.
-                * Only reschedule if p is still on the same runqueue.
-                */
-               if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
-                       resched_task(p);
-#else
-               /* For UP simply resched on drop of prio */
-               if (oldprio < p->prio)
-                       resched_task(p);
-#endif /* CONFIG_SMP */
-       } else {
-               /*
-                * This task is not running, but if it is
-                * greater than the current running task
-                * then reschedule.
-                */
-               if (p->prio < rq->curr->prio)
-                       resched_task(rq->curr);
-       }
-}
-
-static void watchdog(struct rq *rq, struct task_struct *p)
-{
-       unsigned long soft, hard;
-
-       /* max may change after cur was read; this will be fixed next tick */
-       soft = task_rlimit(p, RLIMIT_RTTIME);
-       hard = task_rlimit_max(p, RLIMIT_RTTIME);
-
-       if (soft != RLIM_INFINITY) {
-               unsigned long next;
-
-               p->rt.timeout++;
-               next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
-               if (p->rt.timeout > next)
-                       p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
-       }
-}
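The conversion in watchdog() turns an RLIMIT_RTTIME value (microseconds) into a
count of scheduler ticks.  A hedged example of the same arithmetic, assuming
HZ=1000 and made-up limits:

    #include <stdio.h>

    #define HZ              1000            /* assumed tick rate */
    #define USEC_PER_SEC    1000000UL
    #define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))

    int main(void)
    {
        unsigned long soft = 500000;    /* RLIMIT_RTTIME soft limit: 0.5 s */
        unsigned long hard = 800000;    /* RLIMIT_RTTIME hard limit: 0.8 s */
        unsigned long min_us = soft < hard ? soft : hard;

        /* Same conversion as watchdog(): microseconds -> scheduler ticks. */
        unsigned long limit_ticks = DIV_ROUND_UP(min_us, USEC_PER_SEC / HZ);

        printf("flag the task after %lu consecutive RT ticks\n", limit_ticks);
        return 0;
    }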
-
-static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
-{
-       update_curr_rt(rq);
-
-       watchdog(rq, p);
-
-       /*
-        * RR tasks need a special form of timeslice management.
-        * FIFO tasks have no timeslices.
-        */
-       if (p->policy != SCHED_RR)
-               return;
-
-       if (--p->rt.time_slice)
-               return;
-
-       p->rt.time_slice = DEF_TIMESLICE;
-
-       /*
-        * Requeue to the end of queue if we are not the only element
-        * on the queue:
-        */
-       if (p->rt.run_list.prev != p->rt.run_list.next) {
-               requeue_task_rt(rq, p, 0);
-               set_tsk_need_resched(p);
-       }
-}
-
-static void set_curr_task_rt(struct rq *rq)
-{
-       struct task_struct *p = rq->curr;
-
-       p->se.exec_start = rq->clock_task;
-
-       /* The running task is never eligible for pushing */
-       dequeue_pushable_task(rq, p);
-}
-
-static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
-{
-       /*
-        * Time slice is 0 for SCHED_FIFO tasks
-        */
-       if (task->policy == SCHED_RR)
-               return DEF_TIMESLICE;
-       else
-               return 0;
-}
-
-static const struct sched_class rt_sched_class = {
-       .next                   = &fair_sched_class,
-       .enqueue_task           = enqueue_task_rt,
-       .dequeue_task           = dequeue_task_rt,
-       .yield_task             = yield_task_rt,
-
-       .check_preempt_curr     = check_preempt_curr_rt,
-
-       .pick_next_task         = pick_next_task_rt,
-       .put_prev_task          = put_prev_task_rt,
-
-#ifdef CONFIG_SMP
-       .select_task_rq         = select_task_rq_rt,
-
-       .set_cpus_allowed       = set_cpus_allowed_rt,
-       .rq_online              = rq_online_rt,
-       .rq_offline             = rq_offline_rt,
-       .pre_schedule           = pre_schedule_rt,
-       .post_schedule          = post_schedule_rt,
-       .task_woken             = task_woken_rt,
-       .switched_from          = switched_from_rt,
-#endif
-
-       .set_curr_task          = set_curr_task_rt,
-       .task_tick              = task_tick_rt,
-
-       .get_rr_interval        = get_rr_interval_rt,
-
-       .prio_changed           = prio_changed_rt,
-       .switched_to            = switched_to_rt,
-};
-
-#ifdef CONFIG_SCHED_DEBUG
-extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
-
-static void print_rt_stats(struct seq_file *m, int cpu)
-{
-       rt_rq_iter_t iter;
-       struct rt_rq *rt_rq;
-
-       rcu_read_lock();
-       for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
-               print_rt_rq(m, cpu, rt_rq);
-       rcu_read_unlock();
-}
-#endif /* CONFIG_SCHED_DEBUG */
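From user space the RT policies this class implements are driven through the
standard POSIX scheduling calls.  A minimal sketch; switching policy needs root
or CAP_SYS_NICE, and the priority value 10 is arbitrary:

    #include <sched.h>
    #include <stdio.h>
    #include <time.h>

    int main(void)
    {
        struct sched_param sp = { .sched_priority = 10 };
        struct timespec ts;

        /* Become a round-robin RT task. */
        if (sched_setscheduler(0, SCHED_RR, &sp) == -1) {
            perror("sched_setscheduler");
            return 1;
        }

        /* Ask the kernel for our SCHED_RR timeslice. */
        if (sched_rr_get_interval(0, &ts) == 0)
            printf("RR timeslice: %ld.%09ld s\n",
                   (long)ts.tv_sec, ts.tv_nsec);
        return 0;
    }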
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
deleted file mode 100644 (file)
index 87f9e36..0000000
+++ /dev/null
@@ -1,336 +0,0 @@
-
-#ifdef CONFIG_SCHEDSTATS
-/*
- * bump this up when changing the output format or the meaning of an existing
- * format, so that tools can adapt (or abort)
- */
-#define SCHEDSTAT_VERSION 15
-
-static int show_schedstat(struct seq_file *seq, void *v)
-{
-       int cpu;
-       int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9;
-       char *mask_str = kmalloc(mask_len, GFP_KERNEL);
-
-       if (mask_str == NULL)
-               return -ENOMEM;
-
-       seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
-       seq_printf(seq, "timestamp %lu\n", jiffies);
-       for_each_online_cpu(cpu) {
-               struct rq *rq = cpu_rq(cpu);
-#ifdef CONFIG_SMP
-               struct sched_domain *sd;
-               int dcount = 0;
-#endif
-
-               /* runqueue-specific stats */
-               seq_printf(seq,
-                   "cpu%d %u %u %u %u %u %u %llu %llu %lu",
-                   cpu, rq->yld_count,
-                   rq->sched_switch, rq->sched_count, rq->sched_goidle,
-                   rq->ttwu_count, rq->ttwu_local,
-                   rq->rq_cpu_time,
-                   rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
-
-               seq_printf(seq, "\n");
-
-#ifdef CONFIG_SMP
-               /* domain-specific stats */
-               rcu_read_lock();
-               for_each_domain(cpu, sd) {
-                       enum cpu_idle_type itype;
-
-                       cpumask_scnprintf(mask_str, mask_len,
-                                         sched_domain_span(sd));
-                       seq_printf(seq, "domain%d %s", dcount++, mask_str);
-                       for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
-                                       itype++) {
-                               seq_printf(seq, " %u %u %u %u %u %u %u %u",
-                                   sd->lb_count[itype],
-                                   sd->lb_balanced[itype],
-                                   sd->lb_failed[itype],
-                                   sd->lb_imbalance[itype],
-                                   sd->lb_gained[itype],
-                                   sd->lb_hot_gained[itype],
-                                   sd->lb_nobusyq[itype],
-                                   sd->lb_nobusyg[itype]);
-                       }
-                       seq_printf(seq,
-                                  " %u %u %u %u %u %u %u %u %u %u %u %u\n",
-                           sd->alb_count, sd->alb_failed, sd->alb_pushed,
-                           sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
-                           sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
-                           sd->ttwu_wake_remote, sd->ttwu_move_affine,
-                           sd->ttwu_move_balance);
-               }
-               rcu_read_unlock();
-#endif
-       }
-       kfree(mask_str);
-       return 0;
-}
-
-static int schedstat_open(struct inode *inode, struct file *file)
-{
-       unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
-       char *buf = kmalloc(size, GFP_KERNEL);
-       struct seq_file *m;
-       int res;
-
-       if (!buf)
-               return -ENOMEM;
-       res = single_open(file, show_schedstat, NULL);
-       if (!res) {
-               m = file->private_data;
-               m->buf = buf;
-               m->size = size;
-       } else
-               kfree(buf);
-       return res;
-}
-
-static const struct file_operations proc_schedstat_operations = {
-       .open    = schedstat_open,
-       .read    = seq_read,
-       .llseek  = seq_lseek,
-       .release = single_release,
-};
-
-static int __init proc_schedstat_init(void)
-{
-       proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
-       return 0;
-}
-module_init(proc_schedstat_init);
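The per-CPU line produced by show_schedstat() above can be consumed with an
ordinary scanf loop.  A rough reader for the runqueue fields of format version
15; the field meanings follow the seq_printf() above and error handling is
minimal:

    #include <stdio.h>

    int main(void)
    {
        FILE *f = fopen("/proc/schedstat", "r");
        char line[512];

        if (!f) {
            perror("fopen");
            return 1;
        }

        while (fgets(line, sizeof(line), f)) {
            unsigned int cpu, yld, sw, cnt, goidle, ttwu, ttwu_local;
            unsigned long long cpu_time, run_delay;
            unsigned long pcount;

            if (sscanf(line, "cpu%u %u %u %u %u %u %u %llu %llu %lu",
                       &cpu, &yld, &sw, &cnt, &goidle, &ttwu, &ttwu_local,
                       &cpu_time, &run_delay, &pcount) == 10)
                printf("cpu%u: %llu ns running, %llu ns waiting, %lu timeslices\n",
                       cpu, cpu_time, run_delay, pcount);
        }
        fclose(f);
        return 0;
    }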
-
-/*
- * Expects runqueue lock to be held for atomicity of update
- */
-static inline void
-rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
-{
-       if (rq) {
-               rq->rq_sched_info.run_delay += delta;
-               rq->rq_sched_info.pcount++;
-       }
-}
-
-/*
- * Expects runqueue lock to be held for atomicity of update
- */
-static inline void
-rq_sched_info_depart(struct rq *rq, unsigned long long delta)
-{
-       if (rq)
-               rq->rq_cpu_time += delta;
-}
-
-static inline void
-rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
-{
-       if (rq)
-               rq->rq_sched_info.run_delay += delta;
-}
-# define schedstat_inc(rq, field)      do { (rq)->field++; } while (0)
-# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
-# define schedstat_set(var, val)       do { var = (val); } while (0)
-#else /* !CONFIG_SCHEDSTATS */
-static inline void
-rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
-{}
-static inline void
-rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
-{}
-static inline void
-rq_sched_info_depart(struct rq *rq, unsigned long long delta)
-{}
-# define schedstat_inc(rq, field)      do { } while (0)
-# define schedstat_add(rq, field, amt) do { } while (0)
-# define schedstat_set(var, val)       do { } while (0)
-#endif
-
-#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
-static inline void sched_info_reset_dequeued(struct task_struct *t)
-{
-       t->sched_info.last_queued = 0;
-}
-
-/*
- * We are interested in knowing how long it was from the *first* time a
 * task was queued to the time that it finally hit a cpu; we call this routine
- * from dequeue_task() to account for possible rq->clock skew across cpus. The
- * delta taken on each cpu would annul the skew.
- */
-static inline void sched_info_dequeued(struct task_struct *t)
-{
-       unsigned long long now = task_rq(t)->clock, delta = 0;
-
-       if (unlikely(sched_info_on()))
-               if (t->sched_info.last_queued)
-                       delta = now - t->sched_info.last_queued;
-       sched_info_reset_dequeued(t);
-       t->sched_info.run_delay += delta;
-
-       rq_sched_info_dequeued(task_rq(t), delta);
-}
-
-/*
- * Called when a task finally hits the cpu.  We can now calculate how
- * long it was waiting to run.  We also note when it began so that we
- * can keep stats on how long its timeslice is.
- */
-static void sched_info_arrive(struct task_struct *t)
-{
-       unsigned long long now = task_rq(t)->clock, delta = 0;
-
-       if (t->sched_info.last_queued)
-               delta = now - t->sched_info.last_queued;
-       sched_info_reset_dequeued(t);
-       t->sched_info.run_delay += delta;
-       t->sched_info.last_arrival = now;
-       t->sched_info.pcount++;
-
-       rq_sched_info_arrive(task_rq(t), delta);
-}
-
-/*
- * This function is only called from enqueue_task(), but also only updates
- * the timestamp if it is not already set.  It's assumed that
- * sched_info_dequeued() will clear that stamp when appropriate.
- */
-static inline void sched_info_queued(struct task_struct *t)
-{
-       if (unlikely(sched_info_on()))
-               if (!t->sched_info.last_queued)
-                       t->sched_info.last_queued = task_rq(t)->clock;
-}
-
-/*
- * Called when a process ceases being the active-running process, either
- * voluntarily or involuntarily.  Now we can calculate how long we ran.
- * Also, if the process is still in the TASK_RUNNING state, call
- * sched_info_queued() to mark that it has now again started waiting on
- * the runqueue.
- */
-static inline void sched_info_depart(struct task_struct *t)
-{
-       unsigned long long delta = task_rq(t)->clock -
-                                       t->sched_info.last_arrival;
-
-       rq_sched_info_depart(task_rq(t), delta);
-
-       if (t->state == TASK_RUNNING)
-               sched_info_queued(t);
-}
-
-/*
- * Called when tasks are switched involuntarily, typically because their
- * time slice expired.  (This may also be called when switching to or from
- * the idle task.)  We are only called when prev != next.
- */
-static inline void
-__sched_info_switch(struct task_struct *prev, struct task_struct *next)
-{
-       struct rq *rq = task_rq(prev);
-
-       /*
-        * prev now departs the cpu.  It's not interesting to record
-        * stats about how efficient we were at scheduling the idle
-        * process, however.
-        */
-       if (prev != rq->idle)
-               sched_info_depart(prev);
-
-       if (next != rq->idle)
-               sched_info_arrive(next);
-}
-static inline void
-sched_info_switch(struct task_struct *prev, struct task_struct *next)
-{
-       if (unlikely(sched_info_on()))
-               __sched_info_switch(prev, next);
-}
-#else
-#define sched_info_queued(t)                   do { } while (0)
-#define sched_info_reset_dequeued(t)   do { } while (0)
-#define sched_info_dequeued(t)                 do { } while (0)
-#define sched_info_switch(t, next)             do { } while (0)
-#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
-
-/*
- * The following are functions that support scheduler-internal time accounting.
- * These functions are generally called at the timer tick.  None of this depends
- * on CONFIG_SCHEDSTATS.
- */
-
-/**
- * account_group_user_time - Maintain utime for a thread group.
- *
- * @tsk:       Pointer to task structure.
- * @cputime:   Time value by which to increment the utime field of the
- *             thread_group_cputime structure.
- *
- * If thread group time is being maintained, get the structure for the
- * running CPU and update the utime field there.
- */
-static inline void account_group_user_time(struct task_struct *tsk,
-                                          cputime_t cputime)
-{
-       struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
-
-       if (!cputimer->running)
-               return;
-
-       raw_spin_lock(&cputimer->lock);
-       cputimer->cputime.utime =
-               cputime_add(cputimer->cputime.utime, cputime);
-       raw_spin_unlock(&cputimer->lock);
-}
-
-/**
- * account_group_system_time - Maintain stime for a thread group.
- *
- * @tsk:       Pointer to task structure.
- * @cputime:   Time value by which to increment the stime field of the
- *             thread_group_cputime structure.
- *
- * If thread group time is being maintained, get the structure for the
- * running CPU and update the stime field there.
- */
-static inline void account_group_system_time(struct task_struct *tsk,
-                                            cputime_t cputime)
-{
-       struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
-
-       if (!cputimer->running)
-               return;
-
-       raw_spin_lock(&cputimer->lock);
-       cputimer->cputime.stime =
-               cputime_add(cputimer->cputime.stime, cputime);
-       raw_spin_unlock(&cputimer->lock);
-}
-
-/**
- * account_group_exec_runtime - Maintain exec runtime for a thread group.
- *
- * @tsk:       Pointer to task structure.
- * @ns:                Time value by which to increment the sum_exec_runtime field
- *             of the thread_group_cputime structure.
- *
- * If thread group time is being maintained, get the structure for the
- * running CPU and update the sum_exec_runtime field there.
- */
-static inline void account_group_exec_runtime(struct task_struct *tsk,
-                                             unsigned long long ns)
-{
-       struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
-
-       if (!cputimer->running)
-               return;
-
-       raw_spin_lock(&cputimer->lock);
-       cputimer->cputime.sum_exec_runtime += ns;
-       raw_spin_unlock(&cputimer->lock);
-}
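
All three account_group_*() helpers above follow the same pattern: return early when no thread-group CPU timer is armed, otherwise take cputimer->lock and fold the new sample into the shared totals. A minimal sketch of a caller, assuming a simplified per-tick path; tick_account_user() is an illustrative name (the real entry point is account_user_time(), which also handles scaled time), and cputime_t is treated as plain arithmetic, as the rest of this series does:

static void tick_account_user(struct task_struct *p, cputime_t delta)
{
	/* per-task total, normally maintained by account_user_time() */
	p->utime += delta;

	/* thread-group total; a no-op unless a POSIX CPU timer is running */
	account_group_user_time(p, delta);
}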
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
deleted file mode 100644 (file)
index 8b44e7f..0000000
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * stop-task scheduling class.
- *
- * The stop task is the highest priority task in the system, it preempts
- * everything and will be preempted by nothing.
- *
- * See kernel/stop_machine.c
- */
-
-#ifdef CONFIG_SMP
-static int
-select_task_rq_stop(struct task_struct *p, int sd_flag, int flags)
-{
-       return task_cpu(p); /* stop tasks never migrate */
-}
-#endif /* CONFIG_SMP */
-
-static void
-check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
-{
-       /* we're never preempted */
-}
-
-static struct task_struct *pick_next_task_stop(struct rq *rq)
-{
-       struct task_struct *stop = rq->stop;
-
-       if (stop && stop->on_rq)
-               return stop;
-
-       return NULL;
-}
-
-static void
-enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
-{
-       inc_nr_running(rq);
-}
-
-static void
-dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
-{
-       dec_nr_running(rq);
-}
-
-static void yield_task_stop(struct rq *rq)
-{
-       BUG(); /* the stop task should never yield, it's pointless. */
-}
-
-static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
-{
-}
-
-static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
-{
-}
-
-static void set_curr_task_stop(struct rq *rq)
-{
-}
-
-static void switched_to_stop(struct rq *rq, struct task_struct *p)
-{
-       BUG(); /* it's impossible to change to this class */
-}
-
-static void
-prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio)
-{
-       BUG(); /* how!?, what priority? */
-}
-
-static unsigned int
-get_rr_interval_stop(struct rq *rq, struct task_struct *task)
-{
-       return 0;
-}
-
-/*
- * Simple, special scheduling class for the per-CPU stop tasks:
- */
-static const struct sched_class stop_sched_class = {
-       .next                   = &rt_sched_class,
-
-       .enqueue_task           = enqueue_task_stop,
-       .dequeue_task           = dequeue_task_stop,
-       .yield_task             = yield_task_stop,
-
-       .check_preempt_curr     = check_preempt_curr_stop,
-
-       .pick_next_task         = pick_next_task_stop,
-       .put_prev_task          = put_prev_task_stop,
-
-#ifdef CONFIG_SMP
-       .select_task_rq         = select_task_rq_stop,
-#endif
-
-       .set_curr_task          = set_curr_task_stop,
-       .task_tick              = task_tick_stop,
-
-       .get_rr_interval        = get_rr_interval_stop,
-
-       .prio_changed           = prio_changed_stop,
-       .switched_to            = switched_to_stop,
-};
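
The .next pointer in the struct above is what chains stop_sched_class ahead of rt_sched_class, making it the first class consulted on every scheduling decision. A hedged sketch of how the core scheduler walks that chain; pick_next_task_sketch() is an illustrative name, and the real pick_next_task() adds a fair-class fast path omitted here:

static struct task_struct *pick_next_task_sketch(struct rq *rq)
{
	const struct sched_class *class;
	struct task_struct *p;

	/* highest-priority class first: stop -> rt -> fair -> idle */
	for_each_class(class) {
		p = class->pick_next_task(rq);
		if (p)
			return p;
	}
	BUG();	/* the idle class always has a task to run */
}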
index 2065515..56ce3a6 100644 (file)
@@ -1629,10 +1629,8 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
        info.si_uid = __task_cred(tsk)->uid;
        rcu_read_unlock();
 
-       info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime,
-                               tsk->signal->utime));
-       info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime,
-                               tsk->signal->stime));
+       info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime);
+       info.si_stime = cputime_to_clock_t(tsk->stime + tsk->signal->stime);
 
        info.si_status = tsk->exit_code & 0x7f;
        if (tsk->exit_code & 0x80)
index 2c71d91..4eb3a0f 100644 (file)
@@ -347,12 +347,12 @@ void irq_exit(void)
        if (!in_interrupt() && local_softirq_pending())
                invoke_softirq();
 
-       rcu_irq_exit();
 #ifdef CONFIG_NO_HZ
        /* Make sure that timer wheel updates are propagated */
        if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched())
-               tick_nohz_stop_sched_tick(0);
+               tick_nohz_irq_exit();
 #endif
+       rcu_irq_exit();
        preempt_enable_no_resched();
 }
 
index 481611f..ddf8155 100644 (file)
@@ -1605,7 +1605,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
        unsigned long maxrss = 0;
 
        memset((char *) r, 0, sizeof *r);
-       utime = stime = cputime_zero;
+       utime = stime = 0;
 
        if (who == RUSAGE_THREAD) {
                task_times(current, &utime, &stime);
@@ -1635,8 +1635,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 
                case RUSAGE_SELF:
                        thread_group_times(p, &tgutime, &tgstime);
-                       utime = cputime_add(utime, tgutime);
-                       stime = cputime_add(stime, tgstime);
+                       utime += tgutime;
+                       stime += tgstime;
                        r->ru_nvcsw += p->signal->nvcsw;
                        r->ru_nivcsw += p->signal->nivcsw;
                        r->ru_minflt += p->signal->min_flt;
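
These hunks belong to the tree-wide cputime conversion in this merge: cputime_t is now an ordinary arithmetic type, so cputime_zero becomes 0 and cputime_add()/cputime_sub() become plain operators. A minimal illustration assuming the post-conversion definitions; task_total_time() is an illustrative helper, not part of the patch:

static inline cputime_t task_total_time(struct task_struct *tsk)
{
	/* pre-conversion spelling: cputime_add(tsk->utime, tsk->stime) */
	return tsk->utime + tsk->stime;
}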
index 4042064..7656642 100644 (file)
@@ -275,42 +275,17 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
 }
 EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
 
-/**
- * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
- *
- * When the next event is more than a tick into the future, stop the idle tick
- * Called either from the idle loop or from irq_exit() when an idle period was
- * just interrupted by an interrupt which did not cause a reschedule.
- */
-void tick_nohz_stop_sched_tick(int inidle)
+static void tick_nohz_stop_sched_tick(struct tick_sched *ts)
 {
-       unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
-       struct tick_sched *ts;
+       unsigned long seq, last_jiffies, next_jiffies, delta_jiffies;
        ktime_t last_update, expires, now;
        struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
        u64 time_delta;
        int cpu;
 
-       local_irq_save(flags);
-
        cpu = smp_processor_id();
        ts = &per_cpu(tick_cpu_sched, cpu);
 
-       /*
-        * Call to tick_nohz_start_idle stops the last_update_time from being
-        * updated. Thus, it must not be called in the event we are called from
-        * irq_exit() with the prior state different than idle.
-        */
-       if (!inidle && !ts->inidle)
-               goto end;
-
-       /*
-        * Set ts->inidle unconditionally. Even if the system did not
-        * switch to NOHZ mode the cpu frequency governers rely on the
-        * update of the idle time accounting in tick_nohz_start_idle().
-        */
-       ts->inidle = 1;
-
        now = tick_nohz_start_idle(cpu, ts);
 
        /*
@@ -326,10 +301,10 @@ void tick_nohz_stop_sched_tick(int inidle)
        }
 
        if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
-               goto end;
+               return;
 
        if (need_resched())
-               goto end;
+               return;
 
        if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
                static int ratelimit;
@@ -339,7 +314,7 @@ void tick_nohz_stop_sched_tick(int inidle)
                               (unsigned int) local_softirq_pending());
                        ratelimit++;
                }
-               goto end;
+               return;
        }
 
        ts->idle_calls++;
@@ -434,7 +409,6 @@ void tick_nohz_stop_sched_tick(int inidle)
                        ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
                        ts->tick_stopped = 1;
                        ts->idle_jiffies = last_jiffies;
-                       rcu_enter_nohz();
                }
 
                ts->idle_sleeps++;
@@ -472,8 +446,64 @@ out:
        ts->next_jiffies = next_jiffies;
        ts->last_jiffies = last_jiffies;
        ts->sleep_length = ktime_sub(dev->next_event, now);
-end:
-       local_irq_restore(flags);
+}
+
+/**
+ * tick_nohz_idle_enter - stop the idle tick from the idle task
+ *
+ * When the next event is more than a tick into the future, stop the idle tick
+ * Called when we start the idle loop.
+ *
+ * The arch is responsible for calling:
+ *
+ * - rcu_idle_enter() after its last use of RCU before the CPU is put
+ *  to sleep.
+ * - rcu_idle_exit() before the first use of RCU after the CPU is woken up.
+ */
+void tick_nohz_idle_enter(void)
+{
+       struct tick_sched *ts;
+
+       WARN_ON_ONCE(irqs_disabled());
+
+       /*
+        * Update the idle state in the scheduler domain hierarchy
+        * when tick_nohz_stop_sched_tick() is called from the idle loop.
+        * State will be updated to busy during the first busy tick after
+        * exiting idle.
+        */
+       set_cpu_sd_state_idle();
+
+       local_irq_disable();
+
+       ts = &__get_cpu_var(tick_cpu_sched);
+       /*
+        * set ts->inidle unconditionally. even if the system did not
+        * Set ts->inidle unconditionally. Even if the system did not
+        * switch to NOHZ mode, the cpu frequency governors rely on the
+        */
+       ts->inidle = 1;
+       tick_nohz_stop_sched_tick(ts);
+
+       local_irq_enable();
+}
+
+/**
+ * tick_nohz_irq_exit - update next tick event from interrupt exit
+ *
+ * When an interrupt fires while we are idle and it doesn't cause
+ * a reschedule, it may still add, modify or delete a timer, enqueue
+ * an RCU callback, etc...
+ * So we need to re-calculate and reprogram the next tick event.
+ */
+void tick_nohz_irq_exit(void)
+{
+       struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
+
+       if (!ts->inidle)
+               return;
+
+       tick_nohz_stop_sched_tick(ts);
 }
 
 /**
@@ -515,11 +545,13 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
 }
 
 /**
- * tick_nohz_restart_sched_tick - restart the idle tick from the idle task
+ * tick_nohz_idle_exit - restart the idle tick from the idle task
  *
  * Restart the idle tick when the CPU is woken up from idle
+ * This also exits the RCU extended quiescent state. The CPU
+ * can use RCU again after this function is called.
  */
-void tick_nohz_restart_sched_tick(void)
+void tick_nohz_idle_exit(void)
 {
        int cpu = smp_processor_id();
        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
@@ -529,6 +561,7 @@ void tick_nohz_restart_sched_tick(void)
        ktime_t now;
 
        local_irq_disable();
+
        if (ts->idle_active || (ts->inidle && ts->tick_stopped))
                now = ktime_get();
 
@@ -543,8 +576,6 @@ void tick_nohz_restart_sched_tick(void)
 
        ts->inidle = 0;
 
-       rcu_exit_nohz();
-
        /* Update jiffies first */
        select_nohz_load_balancer(0);
        tick_do_update_jiffies64(now);
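
tick_nohz_idle_enter()/tick_nohz_idle_exit() now bracket the whole idle loop, while the RCU extended-quiescent-state transitions are left to the architecture, as the comment above prescribes. A hedged sketch of an arch idle loop under the new API; cpu_idle_sketch() and arch_do_idle() are illustrative placeholders, not real arch hooks:

static void cpu_idle_sketch(void)
{
	while (1) {
		tick_nohz_idle_enter();		/* may stop the periodic tick */
		while (!need_resched()) {
			rcu_idle_enter();	/* last RCU use before sleeping */
			arch_do_idle();		/* e.g. wfi/hlt; placeholder */
			rcu_idle_exit();	/* first RCU use after wake-up */
		}
		tick_nohz_idle_exit();		/* restart tick, account idle time */
		schedule();
	}
}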
index 2378413..0c63581 100644 (file)
@@ -131,7 +131,7 @@ static inline s64 timekeeping_get_ns_raw(void)
        /* calculate the delta since the last update_wall_time: */
        cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
 
-       /* return delta convert to nanoseconds using ntp adjusted mult. */
+       /* return delta convert to nanoseconds. */
        return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
 }
 
@@ -813,11 +813,11 @@ static void timekeeping_adjust(s64 offset)
         * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs.
         *
         * Note we subtract one in the shift, so that error is really error*2.
-        * This "saves" dividing(shifting) intererval twice, but keeps the
-        * (error > interval) comparision as still measuring if error is
+        * This "saves" dividing(shifting) interval twice, but keeps the
+        * (error > interval) comparison as still measuring if error is
         * larger then half an interval.
         *
-        * Note: It does not "save" on aggrivation when reading the code.
+        * Note: It does not "save" on aggravation when reading the code.
         */
        error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1);
        if (error > interval) {
@@ -833,7 +833,7 @@ static void timekeeping_adjust(s64 offset)
                 * nanosecond, and store the amount rounded up into
                 * the error. This causes the likely below to be unlikely.
                 *
-                * The properfix is to avoid rounding up by using
+                * The proper fix is to avoid rounding up by using
                 * the high precision timekeeper.xtime_nsec instead of
                 * xtime.tv_nsec everywhere. Fixing this will take some
                 * time.
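
The comment above describes a small arithmetic trick: shifting the accumulated NTP error down by (shift - 1) instead of shift doubles it, so comparing against a full interval is the same test as comparing the properly shifted error against half an interval, without an extra divide. A stand-alone illustration of the equivalence in plain user-space C; values are chosen arbitrarily, and integer truncation can make borderline cases differ by one:

#include <stdio.h>

int main(void)
{
	long long error = 5000, interval = 2000;
	int shift = 3;

	int kernel_style = (error >> (shift - 1)) > interval;
	int spelled_out  = (error >> shift) > interval / 2;

	printf("%d %d\n", kernel_style, spelled_out);	/* prints "0 0" for these values */
	return 0;
}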
index 9c3c62b..a297ffc 100644 (file)
@@ -427,6 +427,12 @@ static int timer_fixup_init(void *addr, enum debug_obj_state state)
        }
 }
 
+/* Stub timer callback for improperly used timers. */
+static void stub_timer(unsigned long data)
+{
+       WARN_ON(1);
+}
+
 /*
  * fixup_activate is called when:
  * - an active object is activated
@@ -450,7 +456,8 @@ static int timer_fixup_activate(void *addr, enum debug_obj_state state)
                        debug_object_activate(timer, &timer_debug_descr);
                        return 0;
                } else {
-                       WARN_ON_ONCE(1);
+                       setup_timer(timer, stub_timer, 0);
+                       return 1;
                }
                return 0;
 
@@ -480,12 +487,40 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state)
        }
 }
 
+/*
+ * fixup_assert_init is called when:
+ * - an untracked/uninit-ed object is found
+ */
+static int timer_fixup_assert_init(void *addr, enum debug_obj_state state)
+{
+       struct timer_list *timer = addr;
+
+       switch (state) {
+       case ODEBUG_STATE_NOTAVAILABLE:
+               if (timer->entry.prev == TIMER_ENTRY_STATIC) {
+                       /*
+                        * This is not really a fixup. The timer was
+                        * statically initialized. We just make sure that it
+                        * is tracked in the object tracker.
+                        */
+                       debug_object_init(timer, &timer_debug_descr);
+                       return 0;
+               } else {
+                       setup_timer(timer, stub_timer, 0);
+                       return 1;
+               }
+       default:
+               return 0;
+       }
+}
+
 static struct debug_obj_descr timer_debug_descr = {
-       .name           = "timer_list",
-       .debug_hint     = timer_debug_hint,
-       .fixup_init     = timer_fixup_init,
-       .fixup_activate = timer_fixup_activate,
-       .fixup_free     = timer_fixup_free,
+       .name                   = "timer_list",
+       .debug_hint             = timer_debug_hint,
+       .fixup_init             = timer_fixup_init,
+       .fixup_activate         = timer_fixup_activate,
+       .fixup_free             = timer_fixup_free,
+       .fixup_assert_init      = timer_fixup_assert_init,
 };
 
 static inline void debug_timer_init(struct timer_list *timer)
@@ -508,6 +543,11 @@ static inline void debug_timer_free(struct timer_list *timer)
        debug_object_free(timer, &timer_debug_descr);
 }
 
+static inline void debug_timer_assert_init(struct timer_list *timer)
+{
+       debug_object_assert_init(timer, &timer_debug_descr);
+}
+
 static void __init_timer(struct timer_list *timer,
                         const char *name,
                         struct lock_class_key *key);
@@ -531,6 +571,7 @@ EXPORT_SYMBOL_GPL(destroy_timer_on_stack);
 static inline void debug_timer_init(struct timer_list *timer) { }
 static inline void debug_timer_activate(struct timer_list *timer) { }
 static inline void debug_timer_deactivate(struct timer_list *timer) { }
+static inline void debug_timer_assert_init(struct timer_list *timer) { }
 #endif
 
 static inline void debug_init(struct timer_list *timer)
@@ -552,6 +593,11 @@ static inline void debug_deactivate(struct timer_list *timer)
        trace_timer_cancel(timer);
 }
 
+static inline void debug_assert_init(struct timer_list *timer)
+{
+       debug_timer_assert_init(timer);
+}
+
 static void __init_timer(struct timer_list *timer,
                         const char *name,
                         struct lock_class_key *key)
@@ -902,6 +948,8 @@ int del_timer(struct timer_list *timer)
        unsigned long flags;
        int ret = 0;
 
+       debug_assert_init(timer);
+
        timer_stats_timer_clear_start_info(timer);
        if (timer_pending(timer)) {
                base = lock_timer_base(timer, &flags);
@@ -932,6 +980,8 @@ int try_to_del_timer_sync(struct timer_list *timer)
        unsigned long flags;
        int ret = -1;
 
+       debug_assert_init(timer);
+
        base = lock_timer_base(timer, &flags);
 
        if (base->running_timer == timer)
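
With the assert_init hook wired up, del_timer() and try_to_del_timer_sync() can now flag timers that were never initialized: debug_assert_init() finds no tracked object, timer_fixup_assert_init() installs stub_timer() and returns 1, and debugobjects prints an "assert_init" warning instead of silently operating on garbage. A hedged sketch of the misuse this catches; struct foo_dev and foo_shutdown() are illustrative names, and the check only fires under CONFIG_DEBUG_OBJECTS_TIMERS:

static struct foo_dev {
	struct timer_list poll_timer;	/* never passed to setup_timer() -- the bug */
} foo;

static void foo_shutdown(void)
{
	/* previously a silent no-op; now warns and installs stub_timer() */
	del_timer(&foo.poll_timer);
}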
index f2bd275..91dc4bc 100644 (file)
@@ -338,7 +338,8 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
 /* trace_flags holds trace_options default values */
 unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
        TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
-       TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE;
+       TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE |
+       TRACE_ITER_IRQ_INFO;
 
 static int trace_stop_count;
 static DEFINE_RAW_SPINLOCK(tracing_start_lock);
@@ -426,6 +427,7 @@ static const char *trace_options[] = {
        "record-cmd",
        "overwrite",
        "disable_on_free",
+       "irq-info",
        NULL
 };
 
@@ -1843,6 +1845,33 @@ static void s_stop(struct seq_file *m, void *p)
        trace_event_read_unlock();
 }
 
+static void
+get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *entries)
+{
+       unsigned long count;
+       int cpu;
+
+       *total = 0;
+       *entries = 0;
+
+       for_each_tracing_cpu(cpu) {
+               count = ring_buffer_entries_cpu(tr->buffer, cpu);
+               /*
+                * If this buffer has skipped entries, then we hold all
+                * entries for the trace and we need to ignore the
+                * ones before the time stamp.
+                */
+               if (tr->data[cpu]->skipped_entries) {
+                       count -= tr->data[cpu]->skipped_entries;
+                       /* total is the same as the entries */
+                       *total += count;
+               } else
+                       *total += count +
+                               ring_buffer_overrun_cpu(tr->buffer, cpu);
+               *entries += count;
+       }
+}
+
 static void print_lat_help_header(struct seq_file *m)
 {
        seq_puts(m, "#                  _------=> CPU#            \n");
@@ -1855,12 +1884,35 @@ static void print_lat_help_header(struct seq_file *m)
        seq_puts(m, "#     \\   /      |||||  \\    |   /           \n");
 }
 
-static void print_func_help_header(struct seq_file *m)
+static void print_event_info(struct trace_array *tr, struct seq_file *m)
+{
+       unsigned long total;
+       unsigned long entries;
+
+       get_total_entries(tr, &total, &entries);
+       seq_printf(m, "# entries-in-buffer/entries-written: %lu/%lu   #P:%d\n",
+                  entries, total, num_online_cpus());
+       seq_puts(m, "#\n");
+}
+
+static void print_func_help_header(struct trace_array *tr, struct seq_file *m)
 {
-       seq_puts(m, "#           TASK-PID    CPU#    TIMESTAMP  FUNCTION\n");
+       print_event_info(tr, m);
+       seq_puts(m, "#           TASK-PID   CPU#      TIMESTAMP  FUNCTION\n");
        seq_puts(m, "#              | |       |          |         |\n");
 }
 
+static void print_func_help_header_irq(struct trace_array *tr, struct seq_file *m)
+{
+       print_event_info(tr, m);
+       seq_puts(m, "#                              _-----=> irqs-off\n");
+       seq_puts(m, "#                             / _----=> need-resched\n");
+       seq_puts(m, "#                            | / _---=> hardirq/softirq\n");
+       seq_puts(m, "#                            || / _--=> preempt-depth\n");
+       seq_puts(m, "#                            ||| /     delay\n");
+       seq_puts(m, "#           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION\n");
+       seq_puts(m, "#              | |       |   ||||       |         |\n");
+}
 
 void
 print_trace_header(struct seq_file *m, struct trace_iterator *iter)
@@ -1869,32 +1921,14 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
        struct trace_array *tr = iter->tr;
        struct trace_array_cpu *data = tr->data[tr->cpu];
        struct tracer *type = current_trace;
-       unsigned long entries = 0;
-       unsigned long total = 0;
-       unsigned long count;
+       unsigned long entries;
+       unsigned long total;
        const char *name = "preemption";
-       int cpu;
 
        if (type)
                name = type->name;
 
-
-       for_each_tracing_cpu(cpu) {
-               count = ring_buffer_entries_cpu(tr->buffer, cpu);
-               /*
-                * If this buffer has skipped entries, then we hold all
-                * entries for the trace and we need to ignore the
-                * ones before the time stamp.
-                */
-               if (tr->data[cpu]->skipped_entries) {
-                       count -= tr->data[cpu]->skipped_entries;
-                       /* total is the same as the entries */
-                       total += count;
-               } else
-                       total += count +
-                               ring_buffer_overrun_cpu(tr->buffer, cpu);
-               entries += count;
-       }
+       get_total_entries(tr, &total, &entries);
 
        seq_printf(m, "# %s latency trace v1.1.5 on %s\n",
                   name, UTS_RELEASE);
@@ -2140,6 +2174,21 @@ enum print_line_t print_trace_line(struct trace_iterator *iter)
        return print_trace_fmt(iter);
 }
 
+void trace_latency_header(struct seq_file *m)
+{
+       struct trace_iterator *iter = m->private;
+
+       /* print nothing if the buffers are empty */
+       if (trace_empty(iter))
+               return;
+
+       if (iter->iter_flags & TRACE_FILE_LAT_FMT)
+               print_trace_header(m, iter);
+
+       if (!(trace_flags & TRACE_ITER_VERBOSE))
+               print_lat_help_header(m);
+}
+
 void trace_default_header(struct seq_file *m)
 {
        struct trace_iterator *iter = m->private;
@@ -2155,8 +2204,12 @@ void trace_default_header(struct seq_file *m)
                if (!(trace_flags & TRACE_ITER_VERBOSE))
                        print_lat_help_header(m);
        } else {
-               if (!(trace_flags & TRACE_ITER_VERBOSE))
-                       print_func_help_header(m);
+               if (!(trace_flags & TRACE_ITER_VERBOSE)) {
+                       if (trace_flags & TRACE_ITER_IRQ_INFO)
+                               print_func_help_header_irq(iter->tr, m);
+                       else
+                               print_func_help_header(iter->tr, m);
+               }
        }
 }
 
@@ -4775,6 +4828,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
 {
        __ftrace_dump(true, oops_dump_mode);
 }
+EXPORT_SYMBOL_GPL(ftrace_dump);
 
 __init static int tracer_alloc_buffers(void)
 {
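
The irq-info changes follow the usual recipe for a new tracer option: a TRACE_ITER_* bit (added to the enum in trace.h below), a matching name in trace_options[], and, since it defaults to on, the bit OR-ed into the trace_flags initializer. Consumers then simply test the flag; a hedged sketch, where print_header_sketch() is an illustrative name mirroring what trace_default_header() does above:

static void print_header_sketch(struct trace_array *tr, struct seq_file *m)
{
	if (trace_flags & TRACE_ITER_IRQ_INFO)
		print_func_help_header_irq(tr, m);	/* adds the latency columns */
	else
		print_func_help_header(tr, m);
}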
index 092e1f8..2c26574 100644 (file)
@@ -370,6 +370,7 @@ void trace_graph_function(struct trace_array *tr,
                    unsigned long ip,
                    unsigned long parent_ip,
                    unsigned long flags, int pc);
+void trace_latency_header(struct seq_file *m);
 void trace_default_header(struct seq_file *m);
 void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
 int trace_empty(struct trace_iterator *iter);
@@ -654,6 +655,7 @@ enum trace_iterator_flags {
        TRACE_ITER_RECORD_CMD           = 0x100000,
        TRACE_ITER_OVERWRITE            = 0x200000,
        TRACE_ITER_STOP_ON_FREE         = 0x400000,
+       TRACE_ITER_IRQ_INFO             = 0x800000,
 };
 
 /*
index 95dc31e..f04cc31 100644 (file)
 #include "trace.h"
 #include "trace_output.h"
 
+#define DEFAULT_SYS_FILTER_MESSAGE                                     \
+       "### global filter ###\n"                                       \
+       "# Use this to set filters for multiple events.\n"              \
+       "# Only events with the given fields will be affected.\n"       \
+       "# If no events are modified, an error message will be displayed here"
+
 enum filter_op_ids
 {
        OP_OR,
@@ -646,7 +652,7 @@ void print_subsystem_event_filter(struct event_subsystem *system,
        if (filter && filter->filter_string)
                trace_seq_printf(s, "%s\n", filter->filter_string);
        else
-               trace_seq_printf(s, "none\n");
+               trace_seq_printf(s, DEFAULT_SYS_FILTER_MESSAGE "\n");
        mutex_unlock(&event_mutex);
 }
 
@@ -1838,7 +1844,10 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
        if (!filter)
                goto out;
 
-       replace_filter_string(filter, filter_string);
+       /* System filters just show a default message */
+       kfree(filter->filter_string);
+       filter->filter_string = NULL;
+
        /*
         * No event actually uses the system filter
         * we can free it without synchronize_sched().
@@ -1848,14 +1857,12 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
 
        parse_init(ps, filter_ops, filter_string);
        err = filter_parse(ps);
-       if (err) {
-               append_filter_err(ps, system->filter);
-               goto out;
-       }
+       if (err)
+               goto err_filter;
 
        err = replace_system_preds(system, ps, filter_string);
        if (err)
-               append_filter_err(ps, system->filter);
+               goto err_filter;
 
 out:
        filter_opstack_clear(ps);
@@ -1865,6 +1872,11 @@ out_unlock:
        mutex_unlock(&event_mutex);
 
        return err;
+
+err_filter:
+       replace_filter_string(filter, filter_string);
+       append_filter_err(ps, system->filter);
+       goto out;
 }
 
 #ifdef CONFIG_PERF_EVENTS
index 20dad0d..99d20e9 100644 (file)
@@ -280,9 +280,20 @@ static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
 }
 
 static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { }
-static void irqsoff_print_header(struct seq_file *s) { }
 static void irqsoff_trace_open(struct trace_iterator *iter) { }
 static void irqsoff_trace_close(struct trace_iterator *iter) { }
+
+#ifdef CONFIG_FUNCTION_TRACER
+static void irqsoff_print_header(struct seq_file *s)
+{
+       trace_default_header(s);
+}
+#else
+static void irqsoff_print_header(struct seq_file *s)
+{
+       trace_latency_header(s);
+}
+#endif /* CONFIG_FUNCTION_TRACER */
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
 
 /*
index 5199930..0d6ff35 100644 (file)
@@ -627,11 +627,23 @@ int trace_print_context(struct trace_iterator *iter)
        unsigned long usec_rem = do_div(t, USEC_PER_SEC);
        unsigned long secs = (unsigned long)t;
        char comm[TASK_COMM_LEN];
+       int ret;
 
        trace_find_cmdline(entry->pid, comm);
 
-       return trace_seq_printf(s, "%16s-%-5d [%03d] %5lu.%06lu: ",
-                               comm, entry->pid, iter->cpu, secs, usec_rem);
+       ret = trace_seq_printf(s, "%16s-%-5d [%03d] ",
+                              comm, entry->pid, iter->cpu);
+       if (!ret)
+               return 0;
+
+       if (trace_flags & TRACE_ITER_IRQ_INFO) {
+               ret = trace_print_lat_fmt(s, entry);
+               if (!ret)
+                       return 0;
+       }
+
+       return trace_seq_printf(s, " %5lu.%06lu: ",
+                               secs, usec_rem);
 }
 
 int trace_print_lat_context(struct trace_iterator *iter)
index e4a70c0..ff791ea 100644 (file)
@@ -280,9 +280,20 @@ static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
 }
 
 static void wakeup_graph_return(struct ftrace_graph_ret *trace) { }
-static void wakeup_print_header(struct seq_file *s) { }
 static void wakeup_trace_open(struct trace_iterator *iter) { }
 static void wakeup_trace_close(struct trace_iterator *iter) { }
+
+#ifdef CONFIG_FUNCTION_TRACER
+static void wakeup_print_header(struct seq_file *s)
+{
+       trace_default_header(s);
+}
+#else
+static void wakeup_print_header(struct seq_file *s)
+{
+       trace_latency_header(s);
+}
+#endif /* CONFIG_FUNCTION_TRACER */
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
 
 /*
index 5bbfac8..23b4d78 100644 (file)
@@ -127,7 +127,7 @@ void acct_update_integrals(struct task_struct *tsk)
 
                local_irq_save(flags);
                time = tsk->stime + tsk->utime;
-               dtime = cputime_sub(time, tsk->acct_timexpd);
+               dtime = time - tsk->acct_timexpd;
                jiffies_to_timeval(cputime_to_jiffies(dtime), &value);
                delta = value.tv_sec;
                delta = delta * USEC_PER_SEC + value.tv_usec;
index 26fa779..7fdd9ea 100644 (file)
 #include <linux/wait.h>
 #include <linux/hash.h>
 
-void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *key)
+void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key)
 {
        spin_lock_init(&q->lock);
-       lockdep_set_class(&q->lock, key);
+       lockdep_set_class_and_name(&q->lock, key, name);
        INIT_LIST_HEAD(&q->task_list);
 }
 
index a78b7c6..77cb245 100644 (file)
@@ -268,12 +268,16 @@ static void debug_print_object(struct debug_obj *obj, char *msg)
  * Try to repair the damage, so we have a better chance to get useful
  * debug output.
  */
-static void
+static int
 debug_object_fixup(int (*fixup)(void *addr, enum debug_obj_state state),
                   void * addr, enum debug_obj_state state)
 {
+       int fixed = 0;
+
        if (fixup)
-               debug_objects_fixups += fixup(addr, state);
+               fixed = fixup(addr, state);
+       debug_objects_fixups += fixed;
+       return fixed;
 }
 
 static void debug_object_is_on_stack(void *addr, int onstack)
@@ -386,6 +390,9 @@ void debug_object_activate(void *addr, struct debug_obj_descr *descr)
        struct debug_bucket *db;
        struct debug_obj *obj;
        unsigned long flags;
+       struct debug_obj o = { .object = addr,
+                              .state = ODEBUG_STATE_NOTAVAILABLE,
+                              .descr = descr };
 
        if (!debug_objects_enabled)
                return;
@@ -425,8 +432,9 @@ void debug_object_activate(void *addr, struct debug_obj_descr *descr)
         * let the type specific code decide whether this is
         * true or not.
         */
-       debug_object_fixup(descr->fixup_activate, addr,
-                          ODEBUG_STATE_NOTAVAILABLE);
+       if (debug_object_fixup(descr->fixup_activate, addr,
+                          ODEBUG_STATE_NOTAVAILABLE))
+               debug_print_object(&o, "activate");
 }
 
 /**
@@ -562,6 +570,44 @@ out_unlock:
        raw_spin_unlock_irqrestore(&db->lock, flags);
 }
 
+/**
+ * debug_object_assert_init - debug checks when object should be init-ed
+ * @addr:      address of the object
+ * @descr:     pointer to an object specific debug description structure
+ */
+void debug_object_assert_init(void *addr, struct debug_obj_descr *descr)
+{
+       struct debug_bucket *db;
+       struct debug_obj *obj;
+       unsigned long flags;
+
+       if (!debug_objects_enabled)
+               return;
+
+       db = get_bucket((unsigned long) addr);
+
+       raw_spin_lock_irqsave(&db->lock, flags);
+
+       obj = lookup_object(addr, db);
+       if (!obj) {
+               struct debug_obj o = { .object = addr,
+                                      .state = ODEBUG_STATE_NOTAVAILABLE,
+                                      .descr = descr };
+
+               raw_spin_unlock_irqrestore(&db->lock, flags);
+               /*
+                * Maybe the object is static.  Let the type specific
+                * code decide what to do.
+                */
+               if (debug_object_fixup(descr->fixup_assert_init, addr,
+                                      ODEBUG_STATE_NOTAVAILABLE))
+                       debug_print_object(&o, "assert_init");
+               return;
+       }
+
+       raw_spin_unlock_irqrestore(&db->lock, flags);
+}
+
 /**
  * debug_object_active_state - debug checks object usage state machine
  * @addr:      address of the object
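
debug_object_assert_init() is the generic entry point behind the timer change above: callers assert that an object has been initialized before operating on it, and the descriptor's fixup_assert_init hook decides what to do about static or genuinely uninitialized objects. A hedged sketch of a hypothetical user; struct foo, foo_debug_descr and foo_del() are illustrative names, mirroring what kernel/timer.c does earlier in this diff:

struct foo {
	struct list_head node;
};

static struct debug_obj_descr foo_debug_descr;	/* .fixup_assert_init set up elsewhere */

static void foo_del(struct foo *f)
{
	/* warns, via fixup_assert_init, if f was never initialized */
	debug_object_assert_init(f, &foo_debug_descr);
	list_del(&f->node);
}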
index 011b110..e338407 100644 (file)
@@ -131,6 +131,12 @@ config SPARSEMEM_VMEMMAP
 config HAVE_MEMBLOCK
        boolean
 
+config HAVE_MEMBLOCK_NODE_MAP
+       boolean
+
+config ARCH_DISCARD_MEMBLOCK
+       boolean
+
 config NO_BOOTMEM
        boolean
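
Both new options are hidden booleans selected by architectures: HAVE_MEMBLOCK_NODE_MAP enables per-region NUMA node IDs in memblock, and ARCH_DISCARD_MEMBLOCK allows memblock code and data to be discarded once boot is done. A hedged sketch of how the latter is typically consumed in memblock.h; the section annotations shown are the conventional ones, and the exact definitions may differ in detail:

#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
#define __init_memblock		__meminit	/* freed with other meminit sections */
#define __initdata_memblock	__meminitdata
#else
#define __init_memblock
#define __initdata_memblock
#endif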
 
index 84bec49..2f55f19 100644 (file)
 #include <linux/seq_file.h>
 #include <linux/memblock.h>
 
-struct memblock memblock __initdata_memblock;
+static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
+static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
+
+struct memblock memblock __initdata_memblock = {
+       .memory.regions         = memblock_memory_init_regions,
+       .memory.cnt             = 1,    /* empty dummy entry */
+       .memory.max             = INIT_MEMBLOCK_REGIONS,
+
+       .reserved.regions       = memblock_reserved_init_regions,
+       .reserved.cnt           = 1,    /* empty dummy entry */
+       .reserved.max           = INIT_MEMBLOCK_REGIONS,
+
+       .current_limit          = MEMBLOCK_ALLOC_ANYWHERE,
+};
 
 int memblock_debug __initdata_memblock;
-int memblock_can_resize __initdata_memblock;
-static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS + 1] __initdata_memblock;
-static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS + 1] __initdata_memblock;
+static int memblock_can_resize __initdata_memblock;
 
 /* inline so we don't get a warning when pr_debug is compiled out */
 static inline const char *memblock_type_name(struct memblock_type *type)
@@ -38,20 +49,15 @@ static inline const char *memblock_type_name(struct memblock_type *type)
                return "unknown";
 }
 
-/*
- * Address comparison utilities
- */
-
-static phys_addr_t __init_memblock memblock_align_down(phys_addr_t addr, phys_addr_t size)
-{
-       return addr & ~(size - 1);
-}
-
-static phys_addr_t __init_memblock memblock_align_up(phys_addr_t addr, phys_addr_t size)
+/* adjust *@size so that (@base + *@size) doesn't overflow, return new size */
+static inline phys_addr_t memblock_cap_size(phys_addr_t base, phys_addr_t *size)
 {
-       return (addr + (size - 1)) & ~(size - 1);
+       return *size = min(*size, (phys_addr_t)ULLONG_MAX - base);
 }
 
+/*
+ * Address comparison utilities
+ */
 static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, phys_addr_t size1,
                                       phys_addr_t base2, phys_addr_t size2)
 {
@@ -73,83 +79,66 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
        return (i < type->cnt) ? i : -1;
 }
 
-/*
- * Find, allocate, deallocate or reserve unreserved regions. All allocations
- * are top-down.
+/**
+ * memblock_find_in_range_node - find free area in given range and node
+ * @start: start of candidate range
+ * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
+ * @size: size of free area to find
+ * @align: alignment of free area to find
+ * @nid: nid of the free area to find, %MAX_NUMNODES for any node
+ *
+ * Find @size free area aligned to @align in the specified range and node.
+ *
+ * RETURNS:
+ * Found address on success, %0 on failure.
  */
-
-static phys_addr_t __init_memblock memblock_find_region(phys_addr_t start, phys_addr_t end,
-                                         phys_addr_t size, phys_addr_t align)
+phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
+                                       phys_addr_t end, phys_addr_t size,
+                                       phys_addr_t align, int nid)
 {
-       phys_addr_t base, res_base;
-       long j;
-
-       /* In case, huge size is requested */
-       if (end < size)
-               return MEMBLOCK_ERROR;
-
-       base = memblock_align_down((end - size), align);
+       phys_addr_t this_start, this_end, cand;
+       u64 i;
 
-       /* Prevent allocations returning 0 as it's also used to
-        * indicate an allocation failure
-        */
-       if (start == 0)
-               start = PAGE_SIZE;
-
-       while (start <= base) {
-               j = memblock_overlaps_region(&memblock.reserved, base, size);
-               if (j < 0)
-                       return base;
-               res_base = memblock.reserved.regions[j].base;
-               if (res_base < size)
-                       break;
-               base = memblock_align_down(res_base - size, align);
-       }
+       /* align @size to avoid excessive fragmentation on reserved array */
+       size = round_up(size, align);
 
-       return MEMBLOCK_ERROR;
-}
-
-static phys_addr_t __init_memblock memblock_find_base(phys_addr_t size,
-                       phys_addr_t align, phys_addr_t start, phys_addr_t end)
-{
-       long i;
-
-       BUG_ON(0 == size);
-
-       /* Pump up max_addr */
+       /* pump up @end */
        if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
                end = memblock.current_limit;
 
-       /* We do a top-down search, this tends to limit memory
-        * fragmentation by keeping early boot allocs near the
-        * top of memory
-        */
-       for (i = memblock.memory.cnt - 1; i >= 0; i--) {
-               phys_addr_t memblockbase = memblock.memory.regions[i].base;
-               phys_addr_t memblocksize = memblock.memory.regions[i].size;
-               phys_addr_t bottom, top, found;
+       /* adjust @start to avoid underflow and allocating the first page */
+       start = max3(start, size, (phys_addr_t)PAGE_SIZE);
+       end = max(start, end);
 
-               if (memblocksize < size)
-                       continue;
-               if ((memblockbase + memblocksize) <= start)
-                       break;
-               bottom = max(memblockbase, start);
-               top = min(memblockbase + memblocksize, end);
-               if (bottom >= top)
-                       continue;
-               found = memblock_find_region(bottom, top, size, align);
-               if (found != MEMBLOCK_ERROR)
-                       return found;
+       for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) {
+               this_start = clamp(this_start, start, end);
+               this_end = clamp(this_end, start, end);
+
+               cand = round_down(this_end - size, align);
+               if (cand >= this_start)
+                       return cand;
        }
-       return MEMBLOCK_ERROR;
+       return 0;
 }
 
-/*
- * Find a free area with specified alignment in a specific range.
+/**
+ * memblock_find_in_range - find free area in given range
+ * @start: start of candidate range
+ * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
+ * @size: size of free area to find
+ * @align: alignment of free area to find
+ *
+ * Find @size free area aligned to @align in the specified range.
+ *
+ * RETURNS:
+ * Found address on success, %0 on failure.
  */
-u64 __init_memblock memblock_find_in_range(u64 start, u64 end, u64 size, u64 align)
+phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
+                                       phys_addr_t end, phys_addr_t size,
+                                       phys_addr_t align)
 {
-       return memblock_find_base(size, align, start, end);
+       return memblock_find_in_range_node(start, end, size, align,
+                                          MAX_NUMNODES);
 }
 
 /*
@@ -178,25 +167,21 @@ int __init_memblock memblock_reserve_reserved_regions(void)
 
 static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
 {
-       unsigned long i;
-
-       for (i = r; i < type->cnt - 1; i++) {
-               type->regions[i].base = type->regions[i + 1].base;
-               type->regions[i].size = type->regions[i + 1].size;
-       }
+       type->total_size -= type->regions[r].size;
+       memmove(&type->regions[r], &type->regions[r + 1],
+               (type->cnt - (r + 1)) * sizeof(type->regions[r]));
        type->cnt--;
 
        /* Special case for empty arrays */
        if (type->cnt == 0) {
+               WARN_ON(type->total_size != 0);
                type->cnt = 1;
                type->regions[0].base = 0;
                type->regions[0].size = 0;
+               memblock_set_region_node(&type->regions[0], MAX_NUMNODES);
        }
 }
 
-/* Defined below but needed now */
-static long memblock_add_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size);
-
 static int __init_memblock memblock_double_array(struct memblock_type *type)
 {
        struct memblock_region *new_array, *old_array;
@@ -226,10 +211,10 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
         */
        if (use_slab) {
                new_array = kmalloc(new_size, GFP_KERNEL);
-               addr = new_array == NULL ? MEMBLOCK_ERROR : __pa(new_array);
+               addr = new_array ? __pa(new_array) : 0;
        } else
-               addr = memblock_find_base(new_size, sizeof(phys_addr_t), 0, MEMBLOCK_ALLOC_ACCESSIBLE);
-       if (addr == MEMBLOCK_ERROR) {
+               addr = memblock_find_in_range(0, MEMBLOCK_ALLOC_ACCESSIBLE, new_size, sizeof(phys_addr_t));
+       if (!addr) {
                pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n",
                       memblock_type_name(type), type->max, type->max * 2);
                return -1;
@@ -254,7 +239,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
                return 0;
 
        /* Add the new reserved region now. Should not fail ! */
-       BUG_ON(memblock_add_region(&memblock.reserved, addr, new_size));
+       BUG_ON(memblock_reserve(addr, new_size));
 
        /* If the array wasn't our static init one, then free it. We only do
         * that before SLAB is available as later on, we don't know whether
@@ -268,343 +253,514 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
        return 0;
 }
 
-int __init_memblock __weak memblock_memory_can_coalesce(phys_addr_t addr1, phys_addr_t size1,
-                                         phys_addr_t addr2, phys_addr_t size2)
-{
-       return 1;
-}
-
-static long __init_memblock memblock_add_region(struct memblock_type *type,
-                                               phys_addr_t base, phys_addr_t size)
+/**
+ * memblock_merge_regions - merge neighboring compatible regions
+ * @type: memblock type to scan
+ *
+ * Scan @type and merge neighboring compatible regions.
+ */
+static void __init_memblock memblock_merge_regions(struct memblock_type *type)
 {
-       phys_addr_t end = base + size;
-       int i, slot = -1;
-
-       /* First try and coalesce this MEMBLOCK with others */
-       for (i = 0; i < type->cnt; i++) {
-               struct memblock_region *rgn = &type->regions[i];
-               phys_addr_t rend = rgn->base + rgn->size;
+       int i = 0;
 
-               /* Exit if there's no possible hits */
-               if (rgn->base > end || rgn->size == 0)
-                       break;
+       /* cnt never goes below 1 */
+       while (i < type->cnt - 1) {
+               struct memblock_region *this = &type->regions[i];
+               struct memblock_region *next = &type->regions[i + 1];
 
-               /* Check if we are fully enclosed within an existing
-                * block
-                */
-               if (rgn->base <= base && rend >= end)
-                       return 0;
+               if (this->base + this->size != next->base ||
+                   memblock_get_region_node(this) !=
+                   memblock_get_region_node(next)) {
+                       BUG_ON(this->base + this->size > next->base);
+                       i++;
+                       continue;
+               }
 
-               /* Check if we overlap or are adjacent with the bottom
-                * of a block.
-                */
-               if (base < rgn->base && end >= rgn->base) {
-                       /* If we can't coalesce, create a new block */
-                       if (!memblock_memory_can_coalesce(base, size,
-                                                         rgn->base,
-                                                         rgn->size)) {
-                               /* Overlap & can't coalesce are mutually
-                                * exclusive, if you do that, be prepared
-                                * for trouble
-                                */
-                               WARN_ON(end != rgn->base);
-                               goto new_block;
-                       }
-                       /* We extend the bottom of the block down to our
-                        * base
-                        */
-                       rgn->base = base;
-                       rgn->size = rend - base;
+               this->size += next->size;
+               memmove(next, next + 1, (type->cnt - (i + 1)) * sizeof(*next));
+               type->cnt--;
+       }
+}
 
-                       /* Return if we have nothing else to allocate
-                        * (fully coalesced)
-                        */
-                       if (rend >= end)
-                               return 0;
+/**
+ * memblock_insert_region - insert new memblock region
+ * @type: memblock type to insert into
+ * @idx: index for the insertion point
+ * @base: base address of the new region
+ * @size: size of the new region
+ *
+ * Insert new memblock region [@base,@base+@size) into @type at @idx.
+ * @type must already have extra room to accommodate the new region.
+ */
+static void __init_memblock memblock_insert_region(struct memblock_type *type,
+                                                  int idx, phys_addr_t base,
+                                                  phys_addr_t size, int nid)
+{
+       struct memblock_region *rgn = &type->regions[idx];
 
-                       /* We continue processing from the end of the
-                        * coalesced block.
-                        */
-                       base = rend;
-                       size = end - base;
-               }
+       BUG_ON(type->cnt >= type->max);
+       memmove(rgn + 1, rgn, (type->cnt - idx) * sizeof(*rgn));
+       rgn->base = base;
+       rgn->size = size;
+       memblock_set_region_node(rgn, nid);
+       type->cnt++;
+       type->total_size += size;
+}
 
-               /* Now check if we overlap or are adjacent with the
-                * top of a block
-                */
-               if (base <= rend && end >= rend) {
-                       /* If we can't coalesce, create a new block */
-                       if (!memblock_memory_can_coalesce(rgn->base,
-                                                         rgn->size,
-                                                         base, size)) {
-                               /* Overlap & can't coalesce are mutually
-                                * exclusive, if you do that, be prepared
-                                * for trouble
-                                */
-                               WARN_ON(rend != base);
-                               goto new_block;
-                       }
-                       /* We adjust our base down to enclose the
-                        * original block and destroy it. It will be
-                        * part of our new allocation. Since we've
-                        * freed an entry, we know we won't fail
-                        * to allocate one later, so we won't risk
-                        * losing the original block allocation.
-                        */
-                       size += (base - rgn->base);
-                       base = rgn->base;
-                       memblock_remove_region(type, i--);
-               }
-       }
+/**
+ * memblock_add_region - add new memblock region
+ * @type: memblock type to add new region into
+ * @base: base address of the new region
+ * @size: size of the new region
+ * @nid: nid of the new region
+ *
+ * Add new memblock region [@base,@base+@size) into @type.  The new region
+ * is allowed to overlap with existing ones - overlaps don't affect already
+ * existing regions.  @type is guaranteed to be minimal (all neighbouring
+ * compatible regions are merged) after the addition.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+static int __init_memblock memblock_add_region(struct memblock_type *type,
+                               phys_addr_t base, phys_addr_t size, int nid)
+{
+       bool insert = false;
+       phys_addr_t obase = base;
+       phys_addr_t end = base + memblock_cap_size(base, &size);
+       int i, nr_new;
 
-       /* If the array is empty, special case, replace the fake
-        * filler region and return
-        */
-       if ((type->cnt == 1) && (type->regions[0].size == 0)) {
+       /* special case for empty array */
+       if (type->regions[0].size == 0) {
+               WARN_ON(type->cnt != 1 || type->total_size);
                type->regions[0].base = base;
                type->regions[0].size = size;
+               memblock_set_region_node(&type->regions[0], nid);
+               type->total_size = size;
                return 0;
        }
-
- new_block:
-       /* If we are out of space, we fail. It's too late to resize the array
-        * but then this shouldn't have happened in the first place.
+repeat:
+       /*
+        * The following is executed twice.  Once with %false @insert and
+        * then with %true.  The first counts the number of regions needed
+        * to accommodate the new area.  The second actually inserts them.
         */
-       if (WARN_ON(type->cnt >= type->max))
-               return -1;
+       base = obase;
+       nr_new = 0;
 
-       /* Couldn't coalesce the MEMBLOCK, so add it to the sorted table. */
-       for (i = type->cnt - 1; i >= 0; i--) {
-               if (base < type->regions[i].base) {
-                       type->regions[i+1].base = type->regions[i].base;
-                       type->regions[i+1].size = type->regions[i].size;
-               } else {
-                       type->regions[i+1].base = base;
-                       type->regions[i+1].size = size;
-                       slot = i + 1;
+       for (i = 0; i < type->cnt; i++) {
+               struct memblock_region *rgn = &type->regions[i];
+               phys_addr_t rbase = rgn->base;
+               phys_addr_t rend = rbase + rgn->size;
+
+               if (rbase >= end)
                        break;
+               if (rend <= base)
+                       continue;
+               /*
+                * @rgn overlaps.  If it separates the lower part of new
+                * area, insert that portion.
+                */
+               if (rbase > base) {
+                       nr_new++;
+                       if (insert)
+                               memblock_insert_region(type, i++, base,
+                                                      rbase - base, nid);
                }
+               /* area below @rend is dealt with, forget about it */
+               base = min(rend, end);
        }
-       if (base < type->regions[0].base) {
-               type->regions[0].base = base;
-               type->regions[0].size = size;
-               slot = 0;
+
+       /* insert the remaining portion */
+       if (base < end) {
+               nr_new++;
+               if (insert)
+                       memblock_insert_region(type, i, base, end - base, nid);
        }
-       type->cnt++;
 
-       /* The array is full ? Try to resize it. If that fails, we undo
-        * our allocation and return an error
+       /*
+        * If this was the first round, resize array and repeat for actual
+        * insertions; otherwise, merge and return.
         */
-       if (type->cnt == type->max && memblock_double_array(type)) {
-               BUG_ON(slot < 0);
-               memblock_remove_region(type, slot);
-               return -1;
+       if (!insert) {
+               while (type->cnt + nr_new > type->max)
+                       if (memblock_double_array(type) < 0)
+                               return -ENOMEM;
+               insert = true;
+               goto repeat;
+       } else {
+               memblock_merge_regions(type);
+               return 0;
        }
-
-       return 0;
 }
 
-long __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
+int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size,
+                                      int nid)
 {
-       return memblock_add_region(&memblock.memory, base, size);
+       return memblock_add_region(&memblock.memory, base, size, nid);
+}
 
+int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
+{
+       return memblock_add_region(&memblock.memory, base, size, MAX_NUMNODES);
 }
 
-static long __init_memblock __memblock_remove(struct memblock_type *type,
-                                             phys_addr_t base, phys_addr_t size)
+/**
+ * memblock_isolate_range - isolate given range into disjoint memblocks
+ * @type: memblock type to isolate range for
+ * @base: base of range to isolate
+ * @size: size of range to isolate
+ * @start_rgn: out parameter for the start of isolated region
+ * @end_rgn: out parameter for the end of isolated region
+ *
+ * Walk @type and ensure that regions don't cross the boundaries defined by
+ * [@base,@base+@size).  Crossing regions are split at the boundaries,
+ * which may create at most two more regions.  The index of the first
+ * region inside the range is returned in *@start_rgn and end in *@end_rgn.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+static int __init_memblock memblock_isolate_range(struct memblock_type *type,
+                                       phys_addr_t base, phys_addr_t size,
+                                       int *start_rgn, int *end_rgn)
 {
-       phys_addr_t end = base + size;
+       phys_addr_t end = base + memblock_cap_size(base, &size);
        int i;
 
-       /* Walk through the array for collisions */
+       *start_rgn = *end_rgn = 0;
+
+       /* we'll create at most two more regions */
+       while (type->cnt + 2 > type->max)
+               if (memblock_double_array(type) < 0)
+                       return -ENOMEM;
+
        for (i = 0; i < type->cnt; i++) {
                struct memblock_region *rgn = &type->regions[i];
-               phys_addr_t rend = rgn->base + rgn->size;
+               phys_addr_t rbase = rgn->base;
+               phys_addr_t rend = rbase + rgn->size;
 
-               /* Nothing more to do, exit */
-               if (rgn->base > end || rgn->size == 0)
+               if (rbase >= end)
                        break;
-
-               /* If we fully enclose the block, drop it */
-               if (base <= rgn->base && end >= rend) {
-                       memblock_remove_region(type, i--);
+               if (rend <= base)
                        continue;
-               }
 
-               /* If we are fully enclosed within a block
-                * then we need to split it and we are done
-                */
-               if (base > rgn->base && end < rend) {
-                       rgn->size = base - rgn->base;
-                       if (!memblock_add_region(type, end, rend - end))
-                               return 0;
-                       /* Failure to split is bad, we at least
-                        * restore the block before erroring
+               if (rbase < base) {
+                       /*
+                        * @rgn intersects from below.  Split and continue
+                        * to process the next region - the new top half.
+                        */
+                       rgn->base = base;
+                       rgn->size -= base - rbase;
+                       type->total_size -= base - rbase;
+                       memblock_insert_region(type, i, rbase, base - rbase,
+                                              memblock_get_region_node(rgn));
+               } else if (rend > end) {
+                       /*
+                        * @rgn intersects from above.  Split and redo the
+                        * current region - the new bottom half.
                         */
-                       rgn->size = rend - rgn->base;
-                       WARN_ON(1);
-                       return -1;
-               }
-
-               /* Check if we need to trim the bottom of a block */
-               if (rgn->base < end && rend > end) {
-                       rgn->size -= end - rgn->base;
                        rgn->base = end;
-                       break;
+                       rgn->size -= end - rbase;
+                       type->total_size -= end - rbase;
+                       memblock_insert_region(type, i--, rbase, end - rbase,
+                                              memblock_get_region_node(rgn));
+               } else {
+                       /* @rgn is fully contained, record it */
+                       if (!*end_rgn)
+                               *start_rgn = i;
+                       *end_rgn = i + 1;
                }
+       }
 
-               /* And check if we need to trim the top of a block */
-               if (base < rend)
-                       rgn->size -= rend - base;
+       return 0;
+}
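
As a worked illustration of the splitting behaviour described above, the helper below is purely hypothetical (it is not part of this patch) and its addresses are made up; it only shows what memblock_isolate_range() leaves behind when a sub-range is carved out of one large region:

/*
 * Hypothetical caller inside mm/memblock.c, shown only to illustrate
 * memblock_isolate_range(); not part of this commit.
 */
static int __init isolate_range_demo(void)
{
        int start_rgn, end_rgn, ret;

        /* assume memblock.memory holds a single region [0, 256M) */
        ret = memblock_isolate_range(&memblock.memory,
                                     0x04000000, 0x08000000, /* [64M, 192M) */
                                     &start_rgn, &end_rgn);
        if (ret)
                return ret;
        /*
         * memblock.memory now holds [0, 64M), [64M, 192M) and [192M, 256M);
         * start_rgn == 1 and end_rgn == 2, so regions[1] is the isolated part.
         */
        return 0;
}

Both __memblock_remove() and memblock_set_node() below follow exactly this isolate-then-iterate pattern.
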
 
-       }
+static int __init_memblock __memblock_remove(struct memblock_type *type,
+                                            phys_addr_t base, phys_addr_t size)
+{
+       int start_rgn, end_rgn;
+       int i, ret;
+
+       ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn);
+       if (ret)
+               return ret;
+
+       for (i = end_rgn - 1; i >= start_rgn; i--)
+               memblock_remove_region(type, i);
        return 0;
 }
 
-long __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size)
+int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size)
 {
        return __memblock_remove(&memblock.memory, base, size);
 }
 
-long __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
+int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
 {
+       memblock_dbg("   memblock_free: [%#016llx-%#016llx] %pF\n",
+                    (unsigned long long)base,
+                    (unsigned long long)base + size,
+                    (void *)_RET_IP_);
+
        return __memblock_remove(&memblock.reserved, base, size);
 }
 
-long __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
+int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
 {
        struct memblock_type *_rgn = &memblock.reserved;
 
+       memblock_dbg("memblock_reserve: [%#016llx-%#016llx] %pF\n",
+                    (unsigned long long)base,
+                    (unsigned long long)base + size,
+                    (void *)_RET_IP_);
        BUG_ON(0 == size);
 
-       return memblock_add_region(_rgn, base, size);
+       return memblock_add_region(_rgn, base, size, MAX_NUMNODES);
 }
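
A minimal sketch of how an architecture's early setup code typically feeds these primitives; the function name, the DTB arguments and all addresses are assumptions made for illustration, not taken from this patch (the usual <linux/memblock.h> and <asm/sections.h> are assumed):

/* Illustrative early-boot sequence; addresses and dtb_* are made up. */
void __init hypothetical_arch_memblock_setup(phys_addr_t dtb_phys, size_t dtb_size)
{
        /* make all RAM known to memblock, optionally tagged with a node */
        memblock_add(0x00000000, 0x20000000);           /* 512M, node unknown */
        memblock_add_node(0x80000000, 0x20000000, 1);   /* 512M on node 1 */

        /* keep the allocator away from the kernel image and the DTB */
        memblock_reserve(__pa(_stext), _end - _stext);
        memblock_reserve(dtb_phys, dtb_size);

        /* a reservation that turns out to be unneeded can be dropped again */
        memblock_free(dtb_phys, dtb_size);
}

Booting with "memblock=debug" makes the memblock_reserve()/memblock_free() calls above log their ranges through memblock_dbg(), as added in this hunk.
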
 
-phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
+/**
+ * __next_free_mem_range - next function for for_each_free_mem_range()
+ * @idx: pointer to u64 loop variable
+ * @nid: node selector, %MAX_NUMNODES for all nodes
+ * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
+ * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
+ * @out_nid: ptr to int for nid of the range, can be %NULL
+ *
+ * Find the first free area from *@idx which matches @nid, fill the out
+ * parameters, and update *@idx for the next iteration.  The lower 32 bits
+ * of *@idx contain the index into the memory regions and the upper 32 bits
+ * index the areas before each reserved region.  For example, if reserved
+ * regions look like the following,
+ *
+ *     0:[0-16), 1:[32-48), 2:[128-130)
+ *
+ * The upper 32 bits index the following areas.
+ *
+ *     0:[0-0), 1:[16-32), 2:[48-128), 3:[130-MAX)
+ *
+ * As both region arrays are sorted, the function advances the two indices
+ * in lockstep and returns each intersection.
+ */
+void __init_memblock __next_free_mem_range(u64 *idx, int nid,
+                                          phys_addr_t *out_start,
+                                          phys_addr_t *out_end, int *out_nid)
 {
-       phys_addr_t found;
+       struct memblock_type *mem = &memblock.memory;
+       struct memblock_type *rsv = &memblock.reserved;
+       int mi = *idx & 0xffffffff;
+       int ri = *idx >> 32;
 
-       /* We align the size to limit fragmentation. Without this, a lot of
-        * small allocs quickly eat up the whole reserve array on sparc
-        */
-       size = memblock_align_up(size, align);
+       for ( ; mi < mem->cnt; mi++) {
+               struct memblock_region *m = &mem->regions[mi];
+               phys_addr_t m_start = m->base;
+               phys_addr_t m_end = m->base + m->size;
 
-       found = memblock_find_base(size, align, 0, max_addr);
-       if (found != MEMBLOCK_ERROR &&
-           !memblock_add_region(&memblock.reserved, found, size))
-               return found;
+               /* only memory regions are associated with nodes, check it */
+               if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m))
+                       continue;
 
-       return 0;
+               /* scan areas before each reservation for intersection */
+               for ( ; ri < rsv->cnt + 1; ri++) {
+                       struct memblock_region *r = &rsv->regions[ri];
+                       phys_addr_t r_start = ri ? r[-1].base + r[-1].size : 0;
+                       phys_addr_t r_end = ri < rsv->cnt ? r->base : ULLONG_MAX;
+
+                       /* if ri advanced past mi, break out to advance mi */
+                       if (r_start >= m_end)
+                               break;
+                       /* if the two regions intersect, we're done */
+                       if (m_start < r_end) {
+                               if (out_start)
+                                       *out_start = max(m_start, r_start);
+                               if (out_end)
+                                       *out_end = min(m_end, r_end);
+                               if (out_nid)
+                                       *out_nid = memblock_get_region_node(m);
+                               /*
+                                * The region which ends first is advanced
+                                * for the next iteration.
+                                */
+                               if (m_end <= r_end)
+                                       mi++;
+                               else
+                                       ri++;
+                               *idx = (u32)mi | (u64)ri << 32;
+                               return;
+                       }
+               }
+       }
+
+       /* signal end of iteration */
+       *idx = ULLONG_MAX;
 }
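
The iterator is normally consumed through the for_each_free_mem_range() wrapper (the same wrapper the nobootmem.c hunk further down uses); as a hedged sketch, with the helper name being an assumption, a caller tallying free memory could look like this:

/* Sketch: sum up all memory that is neither reserved nor filtered out by @nid. */
static phys_addr_t __init count_free_bytes(int nid)
{
        phys_addr_t start, end, free = 0;
        u64 i;

        for_each_free_mem_range(i, nid, &start, &end, NULL)
                free += end - start;
        return free;
}

Passing MAX_NUMNODES as @nid walks every node, exactly as free_low_memory_core_early() does below.
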
 
-phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
+/**
+ * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse()
+ * @idx: pointer to u64 loop variable
+ * @nid: node selector, %MAX_NUMNODES for all nodes
+ * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
+ * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
+ * @out_nid: ptr to int for nid of the range, can be %NULL
+ *
+ * Reverse of __next_free_mem_range().
+ */
+void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid,
+                                          phys_addr_t *out_start,
+                                          phys_addr_t *out_end, int *out_nid)
 {
-       phys_addr_t alloc;
+       struct memblock_type *mem = &memblock.memory;
+       struct memblock_type *rsv = &memblock.reserved;
+       int mi = *idx & 0xffffffff;
+       int ri = *idx >> 32;
 
-       alloc = __memblock_alloc_base(size, align, max_addr);
+       if (*idx == (u64)ULLONG_MAX) {
+               mi = mem->cnt - 1;
+               ri = rsv->cnt;
+       }
 
-       if (alloc == 0)
-               panic("ERROR: Failed to allocate 0x%llx bytes below 0x%llx.\n",
-                     (unsigned long long) size, (unsigned long long) max_addr);
+       for ( ; mi >= 0; mi--) {
+               struct memblock_region *m = &mem->regions[mi];
+               phys_addr_t m_start = m->base;
+               phys_addr_t m_end = m->base + m->size;
 
-       return alloc;
-}
+               /* only memory regions are associated with nodes, check it */
+               if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m))
+                       continue;
 
-phys_addr_t __init memblock_alloc(phys_addr_t size, phys_addr_t align)
-{
-       return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
-}
+               /* scan areas before each reservation for intersection */
+               for ( ; ri >= 0; ri--) {
+                       struct memblock_region *r = &rsv->regions[ri];
+                       phys_addr_t r_start = ri ? r[-1].base + r[-1].size : 0;
+                       phys_addr_t r_end = ri < rsv->cnt ? r->base : ULLONG_MAX;
+
+                       /* if ri advanced past mi, break out to advance mi */
+                       if (r_end <= m_start)
+                               break;
+                       /* if the two regions intersect, we're done */
+                       if (m_end > r_start) {
+                               if (out_start)
+                                       *out_start = max(m_start, r_start);
+                               if (out_end)
+                                       *out_end = min(m_end, r_end);
+                               if (out_nid)
+                                       *out_nid = memblock_get_region_node(m);
+
+                               if (m_start >= r_start)
+                                       mi--;
+                               else
+                                       ri--;
+                               *idx = (u32)mi | (u64)ri << 32;
+                               return;
+                       }
+               }
+       }
 
+       *idx = ULLONG_MAX;
+}
 
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
 /*
- * Additional node-local allocators. Search for node memory is bottom up
- * and walks memblock regions within that node bottom-up as well, but allocation
- * within an memblock region is top-down. XXX I plan to fix that at some stage
- *
- * WARNING: Only available after early_node_map[] has been populated,
- * on some architectures, that is after all the calls to add_active_range()
- * have been done to populate it.
+ * Common iterator interface used to define for_each_mem_pfn_range().
  */
-
-phys_addr_t __weak __init memblock_nid_range(phys_addr_t start, phys_addr_t end, int *nid)
+void __init_memblock __next_mem_pfn_range(int *idx, int nid,
+                               unsigned long *out_start_pfn,
+                               unsigned long *out_end_pfn, int *out_nid)
 {
-#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
-       /*
-        * This code originates from sparc which really wants use to walk by addresses
-        * and returns the nid. This is not very convenient for early_pfn_map[] users
-        * as the map isn't sorted yet, and it really wants to be walked by nid.
-        *
-        * For now, I implement the inefficient method below which walks the early
-        * map multiple times. Eventually we may want to use an ARCH config option
-        * to implement a completely different method for both case.
-        */
-       unsigned long start_pfn, end_pfn;
-       int i;
+       struct memblock_type *type = &memblock.memory;
+       struct memblock_region *r;
 
-       for (i = 0; i < MAX_NUMNODES; i++) {
-               get_pfn_range_for_nid(i, &start_pfn, &end_pfn);
-               if (start < PFN_PHYS(start_pfn) || start >= PFN_PHYS(end_pfn))
+       while (++*idx < type->cnt) {
+               r = &type->regions[*idx];
+
+               if (PFN_UP(r->base) >= PFN_DOWN(r->base + r->size))
                        continue;
-               *nid = i;
-               return min(end, PFN_PHYS(end_pfn));
+               if (nid == MAX_NUMNODES || nid == r->nid)
+                       break;
+       }
+       if (*idx >= type->cnt) {
+               *idx = -1;
+               return;
        }
-#endif
-       *nid = 0;
 
-       return end;
+       if (out_start_pfn)
+               *out_start_pfn = PFN_UP(r->base);
+       if (out_end_pfn)
+               *out_end_pfn = PFN_DOWN(r->base + r->size);
+       if (out_nid)
+               *out_nid = r->nid;
 }
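
The corresponding for_each_mem_pfn_range() wrapper is what the page_alloc.c conversions below rely on; a small hedged sketch of the same pattern, with the helper name assumed for illustration:

/* Sketch: count the page frames memblock knows about on one node. */
static unsigned long __init hypothetical_node_present_pages(int nid)
{
        unsigned long start_pfn, end_pfn, pages = 0;
        int i;

        for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL)
                pages += end_pfn - start_pfn;
        return pages;
}
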
 
-static phys_addr_t __init memblock_alloc_nid_region(struct memblock_region *mp,
-                                              phys_addr_t size,
-                                              phys_addr_t align, int nid)
+/**
+ * memblock_set_node - set node ID on memblock regions
+ * @base: base of area to set node ID for
+ * @size: size of area to set node ID for
+ * @nid: node ID to set
+ *
+ * Set the nid of memblock memory regions in [@base,@base+@size) to @nid.
+ * Regions which cross the area boundaries are split as necessary.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
+                                     int nid)
 {
-       phys_addr_t start, end;
+       struct memblock_type *type = &memblock.memory;
+       int start_rgn, end_rgn;
+       int i, ret;
 
-       start = mp->base;
-       end = start + mp->size;
+       ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn);
+       if (ret)
+               return ret;
 
-       start = memblock_align_up(start, align);
-       while (start < end) {
-               phys_addr_t this_end;
-               int this_nid;
+       for (i = start_rgn; i < end_rgn; i++)
+               type->regions[i].nid = nid;
 
-               this_end = memblock_nid_range(start, end, &this_nid);
-               if (this_nid == nid) {
-                       phys_addr_t ret = memblock_find_region(start, this_end, size, align);
-                       if (ret != MEMBLOCK_ERROR &&
-                           !memblock_add_region(&memblock.reserved, ret, size))
-                               return ret;
-               }
-               start = this_end;
-       }
+       memblock_merge_regions(type);
+       return 0;
+}
+#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
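
A sketch of how an architecture's NUMA initialisation might drive this; the range structure and its field names are assumptions used only for illustration, not part of the patch:

/* Hypothetical NUMA setup: tag each detected affinity range with its node. */
struct fake_numa_range { phys_addr_t start, end; int nid; };

static void __init hypothetical_numa_register(struct fake_numa_range *r, int nr)
{
        int i;

        for (i = 0; i < nr; i++)
                memblock_set_node(r[i].start, r[i].end - r[i].start, r[i].nid);
        /*
         * Regions straddling a range boundary are split by
         * memblock_isolate_range() before the nid is written, so the node
         * assignment never bleeds outside [start, end).
         */
}
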
+
+static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size,
+                                       phys_addr_t align, phys_addr_t max_addr,
+                                       int nid)
+{
+       phys_addr_t found;
 
-       return MEMBLOCK_ERROR;
+       found = memblock_find_in_range_node(0, max_addr, size, align, nid);
+       if (found && !memblock_reserve(found, size))
+               return found;
+
+       return 0;
 }
 
 phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid)
 {
-       struct memblock_type *mem = &memblock.memory;
-       int i;
+       return memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+}
 
-       BUG_ON(0 == size);
+phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
+{
+       return memblock_alloc_base_nid(size, align, max_addr, MAX_NUMNODES);
+}
 
-       /* We align the size to limit fragmentation. Without this, a lot of
-        * small allocs quickly eat up the whole reserve array on sparc
-        */
-       size = memblock_align_up(size, align);
+phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
+{
+       phys_addr_t alloc;
 
-       /* We do a bottom-up search for a region with the right
-        * nid since that's easier considering how memblock_nid_range()
-        * works
-        */
-       for (i = 0; i < mem->cnt; i++) {
-               phys_addr_t ret = memblock_alloc_nid_region(&mem->regions[i],
-                                              size, align, nid);
-               if (ret != MEMBLOCK_ERROR)
-                       return ret;
-       }
+       alloc = __memblock_alloc_base(size, align, max_addr);
 
-       return 0;
+       if (alloc == 0)
+               panic("ERROR: Failed to allocate 0x%llx bytes below 0x%llx.\n",
+                     (unsigned long long) size, (unsigned long long) max_addr);
+
+       return alloc;
+}
+
+phys_addr_t __init memblock_alloc(phys_addr_t size, phys_addr_t align)
+{
+       return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
 }
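
The allocation entry points differ mainly in their failure behaviour; a short hedged sketch, with the helper name, sizes and the 1GiB cap being arbitrary:

/* Illustrative early allocations; all values are made up. */
static void __init hypothetical_early_alloc(void)
{
        phys_addr_t pa;

        /* returns 0 on failure, the caller decides how to recover */
        pa = __memblock_alloc_base(1024 * 1024, PAGE_SIZE, 0x40000000);
        if (!pa)
                pr_warn("no room below 1GiB, falling back\n");

        /* panics on failure, so no error check is needed */
        pa = memblock_alloc(1024 * 1024, PAGE_SIZE);

        /* prefers node 0, then any node; panics only if both fail */
        pa = memblock_alloc_try_nid(1024 * 1024, PAGE_SIZE, 0);
}
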
 
 phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid)
@@ -613,7 +769,7 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i
 
        if (res)
                return res;
-       return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ANYWHERE);
+       return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
 }
 
 
@@ -621,10 +777,9 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i
  * Remaining API functions
  */
 
-/* You must call memblock_analyze() before this. */
 phys_addr_t __init memblock_phys_mem_size(void)
 {
-       return memblock.memory_size;
+       return memblock.memory.total_size;
 }
 
 /* lowest address */
@@ -640,45 +795,28 @@ phys_addr_t __init_memblock memblock_end_of_DRAM(void)
        return (memblock.memory.regions[idx].base + memblock.memory.regions[idx].size);
 }
 
-/* You must call memblock_analyze() after this. */
-void __init memblock_enforce_memory_limit(phys_addr_t memory_limit)
+void __init memblock_enforce_memory_limit(phys_addr_t limit)
 {
        unsigned long i;
-       phys_addr_t limit;
-       struct memblock_region *p;
+       phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX;
 
-       if (!memory_limit)
+       if (!limit)
                return;
 
-       /* Truncate the memblock regions to satisfy the memory limit. */
-       limit = memory_limit;
+       /* find out max address */
        for (i = 0; i < memblock.memory.cnt; i++) {
-               if (limit > memblock.memory.regions[i].size) {
-                       limit -= memblock.memory.regions[i].size;
-                       continue;
-               }
-
-               memblock.memory.regions[i].size = limit;
-               memblock.memory.cnt = i + 1;
-               break;
-       }
-
-       memory_limit = memblock_end_of_DRAM();
+               struct memblock_region *r = &memblock.memory.regions[i];
 
-       /* And truncate any reserves above the limit also. */
-       for (i = 0; i < memblock.reserved.cnt; i++) {
-               p = &memblock.reserved.regions[i];
-
-               if (p->base > memory_limit)
-                       p->size = 0;
-               else if ((p->base + p->size) > memory_limit)
-                       p->size = memory_limit - p->base;
-
-               if (p->size == 0) {
-                       memblock_remove_region(&memblock.reserved, i);
-                       i--;
+               if (limit <= r->size) {
+                       max_addr = r->base + limit;
+                       break;
                }
+               limit -= r->size;
        }
+
+       /* truncate both memory and reserved regions */
+       __memblock_remove(&memblock.memory, max_addr, (phys_addr_t)ULLONG_MAX);
+       __memblock_remove(&memblock.reserved, max_addr, (phys_addr_t)ULLONG_MAX);
 }
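
In practice this is driven by a "mem=" style command-line cap parsed in arch setup code; the handler below is an assumption sketched only for illustration, not code from this patch:

/* Hypothetical "mem=" handler: cap usable memory at the requested size. */
static int __init hypothetical_parse_mem(char *p)
{
        if (!p)
                return -EINVAL;

        /* e.g. "mem=512M"; trims both memory and reserved regions */
        memblock_enforce_memory_limit(memparse(p, &p));
        return 0;
}
early_param("mem", hypothetical_parse_mem);
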
 
 static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr)
@@ -712,16 +850,18 @@ int __init_memblock memblock_is_memory(phys_addr_t addr)
 int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size)
 {
        int idx = memblock_search(&memblock.memory, base);
+       phys_addr_t end = base + memblock_cap_size(base, &size);
 
        if (idx == -1)
                return 0;
        return memblock.memory.regions[idx].base <= base &&
                (memblock.memory.regions[idx].base +
-                memblock.memory.regions[idx].size) >= (base + size);
+                memblock.memory.regions[idx].size) >= end;
 }
 
 int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
 {
+       memblock_cap_size(base, &size);
        return memblock_overlaps_region(&memblock.reserved, base, size) >= 0;
 }
 
@@ -731,86 +871,45 @@ void __init_memblock memblock_set_current_limit(phys_addr_t limit)
        memblock.current_limit = limit;
 }
 
-static void __init_memblock memblock_dump(struct memblock_type *region, char *name)
+static void __init_memblock memblock_dump(struct memblock_type *type, char *name)
 {
        unsigned long long base, size;
        int i;
 
-       pr_info(" %s.cnt  = 0x%lx\n", name, region->cnt);
+       pr_info(" %s.cnt  = 0x%lx\n", name, type->cnt);
 
-       for (i = 0; i < region->cnt; i++) {
-               base = region->regions[i].base;
-               size = region->regions[i].size;
-
-               pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes\n",
-                   name, i, base, base + size - 1, size);
+       for (i = 0; i < type->cnt; i++) {
+               struct memblock_region *rgn = &type->regions[i];
+               char nid_buf[32] = "";
+
+               base = rgn->base;
+               size = rgn->size;
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+               if (memblock_get_region_node(rgn) != MAX_NUMNODES)
+                       snprintf(nid_buf, sizeof(nid_buf), " on node %d",
+                                memblock_get_region_node(rgn));
+#endif
+               pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s\n",
+                       name, i, base, base + size - 1, size, nid_buf);
        }
 }
 
-void __init_memblock memblock_dump_all(void)
+void __init_memblock __memblock_dump_all(void)
 {
-       if (!memblock_debug)
-               return;
-
        pr_info("MEMBLOCK configuration:\n");
-       pr_info(" memory size = 0x%llx\n", (unsigned long long)memblock.memory_size);
+       pr_info(" memory size = %#llx reserved size = %#llx\n",
+               (unsigned long long)memblock.memory.total_size,
+               (unsigned long long)memblock.reserved.total_size);
 
        memblock_dump(&memblock.memory, "memory");
        memblock_dump(&memblock.reserved, "reserved");
 }
 
-void __init memblock_analyze(void)
+void __init memblock_allow_resize(void)
 {
-       int i;
-
-       /* Check marker in the unused last array entry */
-       WARN_ON(memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS].base
-               != MEMBLOCK_INACTIVE);
-       WARN_ON(memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS].base
-               != MEMBLOCK_INACTIVE);
-
-       memblock.memory_size = 0;
-
-       for (i = 0; i < memblock.memory.cnt; i++)
-               memblock.memory_size += memblock.memory.regions[i].size;
-
-       /* We allow resizing from there */
        memblock_can_resize = 1;
 }
 
-void __init memblock_init(void)
-{
-       static int init_done __initdata = 0;
-
-       if (init_done)
-               return;
-       init_done = 1;
-
-       /* Hookup the initial arrays */
-       memblock.memory.regions = memblock_memory_init_regions;
-       memblock.memory.max             = INIT_MEMBLOCK_REGIONS;
-       memblock.reserved.regions       = memblock_reserved_init_regions;
-       memblock.reserved.max   = INIT_MEMBLOCK_REGIONS;
-
-       /* Write a marker in the unused last array entry */
-       memblock.memory.regions[INIT_MEMBLOCK_REGIONS].base = MEMBLOCK_INACTIVE;
-       memblock.reserved.regions[INIT_MEMBLOCK_REGIONS].base = MEMBLOCK_INACTIVE;
-
-       /* Create a dummy zero size MEMBLOCK which will get coalesced away later.
-        * This simplifies the memblock_add() code below...
-        */
-       memblock.memory.regions[0].base = 0;
-       memblock.memory.regions[0].size = 0;
-       memblock.memory.cnt = 1;
-
-       /* Ditto. */
-       memblock.reserved.regions[0].base = 0;
-       memblock.reserved.regions[0].size = 0;
-       memblock.reserved.cnt = 1;
-
-       memblock.current_limit = MEMBLOCK_ALLOC_ANYWHERE;
-}
-
 static int __init early_memblock(char *p)
 {
        if (p && strstr(p, "debug"))
@@ -819,7 +918,7 @@ static int __init early_memblock(char *p)
 }
 early_param("memblock", early_memblock);
 
-#if defined(CONFIG_DEBUG_FS) && !defined(ARCH_DISCARD_MEMBLOCK)
+#if defined(CONFIG_DEBUG_FS) && !defined(CONFIG_ARCH_DISCARD_MEMBLOCK)
 
 static int memblock_debug_show(struct seq_file *m, void *private)
 {
index 7fa41b4..24f0fc1 100644 (file)
@@ -41,14 +41,13 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
        if (limit > memblock.current_limit)
                limit = memblock.current_limit;
 
-       addr = find_memory_core_early(nid, size, align, goal, limit);
-
-       if (addr == MEMBLOCK_ERROR)
+       addr = memblock_find_in_range_node(goal, limit, size, align, nid);
+       if (!addr)
                return NULL;
 
        ptr = phys_to_virt(addr);
        memset(ptr, 0, size);
-       memblock_x86_reserve_range(addr, addr + size, "BOOTMEM");
+       memblock_reserve(addr, size);
        /*
         * The min_count is set to 0 so that bootmem allocated blocks
         * are never reported as leaks.
@@ -107,23 +106,27 @@ static void __init __free_pages_memory(unsigned long start, unsigned long end)
                __free_pages_bootmem(pfn_to_page(i), 0);
 }
 
-unsigned long __init free_all_memory_core_early(int nodeid)
+unsigned long __init free_low_memory_core_early(int nodeid)
 {
-       int i;
-       u64 start, end;
        unsigned long count = 0;
-       struct range *range = NULL;
-       int nr_range;
-
-       nr_range = get_free_all_memory_range(&range, nodeid);
-
-       for (i = 0; i < nr_range; i++) {
-               start = range[i].start;
-               end = range[i].end;
-               count += end - start;
-               __free_pages_memory(start, end);
+       phys_addr_t start, end;
+       u64 i;
+
+       /* free reserved array temporarily so that it's treated as free area */
+       memblock_free_reserved_regions();
+
+       for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) {
+               unsigned long start_pfn = PFN_UP(start);
+               unsigned long end_pfn = min_t(unsigned long,
+                                             PFN_DOWN(end), max_low_pfn);
+               if (start_pfn < end_pfn) {
+                       __free_pages_memory(start_pfn, end_pfn);
+                       count += end_pfn - start_pfn;
+               }
        }
 
+       /* put the reserved regions array back in place */
+       memblock_reserve_reserved_regions();
        return count;
 }
 
@@ -137,7 +140,7 @@ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
 {
        register_page_bootmem_info_node(pgdat);
 
-       /* free_all_memory_core_early(MAX_NUMNODES) will be called later */
+       /* free_low_memory_core_early(MAX_NUMNODES) will be called later */
        return 0;
 }
 
@@ -155,7 +158,7 @@ unsigned long __init free_all_bootmem(void)
         * Using MAX_NUMNODES makes sure all ranges in early_node_map[]
         *  will be used instead of only Node0-related ones
         */
-       return free_all_memory_core_early(MAX_NUMNODES);
+       return free_low_memory_core_early(MAX_NUMNODES);
 }
 
 /**
@@ -172,7 +175,7 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
                              unsigned long size)
 {
        kmemleak_free_part(__va(physaddr), size);
-       memblock_x86_free_range(physaddr, physaddr + size);
+       memblock_free(physaddr, size);
 }
 
 /**
@@ -187,7 +190,7 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
 void __init free_bootmem(unsigned long addr, unsigned long size)
 {
        kmemleak_free_part(__va(addr), size);
-       memblock_x86_free_range(addr, addr + size);
+       memblock_free(addr, size);
 }
 
 static void * __init ___alloc_bootmem_nopanic(unsigned long size,
index 2b8ba3a..bdc804c 100644 (file)
@@ -181,39 +181,17 @@ static unsigned long __meminitdata nr_kernel_pages;
 static unsigned long __meminitdata nr_all_pages;
 static unsigned long __meminitdata dma_reserve;
 
-#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
-  /*
-   * MAX_ACTIVE_REGIONS determines the maximum number of distinct
-   * ranges of memory (RAM) that may be registered with add_active_range().
-   * Ranges passed to add_active_range() will be merged if possible
-   * so the number of times add_active_range() can be called is
-   * related to the number of nodes and the number of holes
-   */
-  #ifdef CONFIG_MAX_ACTIVE_REGIONS
-    /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */
-    #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS
-  #else
-    #if MAX_NUMNODES >= 32
-      /* If there can be many nodes, allow up to 50 holes per node */
-      #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50)
-    #else
-      /* By default, allow up to 256 distinct regions */
-      #define MAX_ACTIVE_REGIONS 256
-    #endif
-  #endif
-
-  static struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS];
-  static int __meminitdata nr_nodemap_entries;
-  static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
-  static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
-  static unsigned long __initdata required_kernelcore;
-  static unsigned long __initdata required_movablecore;
-  static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
-
-  /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
-  int movable_zone;
-  EXPORT_SYMBOL(movable_zone);
-#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
+static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
+static unsigned long __initdata required_kernelcore;
+static unsigned long __initdata required_movablecore;
+static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
+
+/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
+int movable_zone;
+EXPORT_SYMBOL(movable_zone);
+#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
 
 #if MAX_NUMNODES > 1
 int nr_node_ids __read_mostly = MAX_NUMNODES;
@@ -706,10 +684,10 @@ void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
                int loop;
 
                prefetchw(page);
-               for (loop = 0; loop < BITS_PER_LONG; loop++) {
+               for (loop = 0; loop < (1 << order); loop++) {
                        struct page *p = &page[loop];
 
-                       if (loop + 1 < BITS_PER_LONG)
+                       if (loop + 1 < (1 << order))
                                prefetchw(p + 1);
                        __ClearPageReserved(p);
                        set_page_count(p, 0);
@@ -3737,35 +3715,7 @@ __meminit int init_currently_empty_zone(struct zone *zone,
        return 0;
 }
 
-#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
-/*
- * Basic iterator support. Return the first range of PFNs for a node
- * Note: nid == MAX_NUMNODES returns first region regardless of node
- */
-static int __meminit first_active_region_index_in_nid(int nid)
-{
-       int i;
-
-       for (i = 0; i < nr_nodemap_entries; i++)
-               if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
-                       return i;
-
-       return -1;
-}
-
-/*
- * Basic iterator support. Return the next active range of PFNs for a node
- * Note: nid == MAX_NUMNODES returns next region regardless of node
- */
-static int __meminit next_active_region_index_in_nid(int index, int nid)
-{
-       for (index = index + 1; index < nr_nodemap_entries; index++)
-               if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
-                       return index;
-
-       return -1;
-}
-
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
 /*
  * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
@@ -3775,15 +3725,12 @@ static int __meminit next_active_region_index_in_nid(int index, int nid)
  */
 int __meminit __early_pfn_to_nid(unsigned long pfn)
 {
-       int i;
-
-       for (i = 0; i < nr_nodemap_entries; i++) {
-               unsigned long start_pfn = early_node_map[i].start_pfn;
-               unsigned long end_pfn = early_node_map[i].end_pfn;
+       unsigned long start_pfn, end_pfn;
+       int i, nid;
 
+       for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
                if (start_pfn <= pfn && pfn < end_pfn)
-                       return early_node_map[i].nid;
-       }
+                       return nid;
        /* This is a memory hole */
        return -1;
 }
@@ -3812,11 +3759,6 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
 }
 #endif
 
-/* Basic iterator support to walk early_node_map[] */
-#define for_each_active_range_index_in_nid(i, nid) \
-       for (i = first_active_region_index_in_nid(nid); i != -1; \
-                               i = next_active_region_index_in_nid(i, nid))
-
 /**
  * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
  * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
@@ -3826,122 +3768,34 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
 * add_active_range() contain no holes and may be freed, this function
 * may be used instead of calling free_bootmem() manually.
  */
-void __init free_bootmem_with_active_regions(int nid,
-                                               unsigned long max_low_pfn)
-{
-       int i;
-
-       for_each_active_range_index_in_nid(i, nid) {
-               unsigned long size_pages = 0;
-               unsigned long end_pfn = early_node_map[i].end_pfn;
-
-               if (early_node_map[i].start_pfn >= max_low_pfn)
-                       continue;
-
-               if (end_pfn > max_low_pfn)
-                       end_pfn = max_low_pfn;
-
-               size_pages = end_pfn - early_node_map[i].start_pfn;
-               free_bootmem_node(NODE_DATA(early_node_map[i].nid),
-                               PFN_PHYS(early_node_map[i].start_pfn),
-                               size_pages << PAGE_SHIFT);
-       }
-}
-
-#ifdef CONFIG_HAVE_MEMBLOCK
-/*
- * Basic iterator support. Return the last range of PFNs for a node
- * Note: nid == MAX_NUMNODES returns last region regardless of node
- */
-static int __meminit last_active_region_index_in_nid(int nid)
+void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
 {
-       int i;
-
-       for (i = nr_nodemap_entries - 1; i >= 0; i--)
-               if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
-                       return i;
-
-       return -1;
-}
-
-/*
- * Basic iterator support. Return the previous active range of PFNs for a node
- * Note: nid == MAX_NUMNODES returns next region regardless of node
- */
-static int __meminit previous_active_region_index_in_nid(int index, int nid)
-{
-       for (index = index - 1; index >= 0; index--)
-               if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
-                       return index;
-
-       return -1;
-}
-
-#define for_each_active_range_index_in_nid_reverse(i, nid) \
-       for (i = last_active_region_index_in_nid(nid); i != -1; \
-                               i = previous_active_region_index_in_nid(i, nid))
-
-u64 __init find_memory_core_early(int nid, u64 size, u64 align,
-                                       u64 goal, u64 limit)
-{
-       int i;
-
-       /* Need to go over early_node_map to find out good range for node */
-       for_each_active_range_index_in_nid_reverse(i, nid) {
-               u64 addr;
-               u64 ei_start, ei_last;
-               u64 final_start, final_end;
-
-               ei_last = early_node_map[i].end_pfn;
-               ei_last <<= PAGE_SHIFT;
-               ei_start = early_node_map[i].start_pfn;
-               ei_start <<= PAGE_SHIFT;
-
-               final_start = max(ei_start, goal);
-               final_end = min(ei_last, limit);
-
-               if (final_start >= final_end)
-                       continue;
-
-               addr = memblock_find_in_range(final_start, final_end, size, align);
+       unsigned long start_pfn, end_pfn;
+       int i, this_nid;
 
-               if (addr == MEMBLOCK_ERROR)
-                       continue;
+       for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) {
+               start_pfn = min(start_pfn, max_low_pfn);
+               end_pfn = min(end_pfn, max_low_pfn);
 
-               return addr;
+               if (start_pfn < end_pfn)
+                       free_bootmem_node(NODE_DATA(this_nid),
+                                         PFN_PHYS(start_pfn),
+                                         (end_pfn - start_pfn) << PAGE_SHIFT);
        }
-
-       return MEMBLOCK_ERROR;
 }
-#endif
 
 int __init add_from_early_node_map(struct range *range, int az,
                                   int nr_range, int nid)
 {
+       unsigned long start_pfn, end_pfn;
        int i;
-       u64 start, end;
 
        /* need to go over early_node_map to find out good range for node */
-       for_each_active_range_index_in_nid(i, nid) {
-               start = early_node_map[i].start_pfn;
-               end = early_node_map[i].end_pfn;
-               nr_range = add_range(range, az, nr_range, start, end);
-       }
+       for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL)
+               nr_range = add_range(range, az, nr_range, start_pfn, end_pfn);
        return nr_range;
 }
 
-void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data)
-{
-       int i;
-       int ret;
-
-       for_each_active_range_index_in_nid(i, nid) {
-               ret = work_fn(early_node_map[i].start_pfn,
-                             early_node_map[i].end_pfn, data);
-               if (ret)
-                       break;
-       }
-}
 /**
  * sparse_memory_present_with_active_regions - Call memory_present for each active range
  * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
@@ -3952,12 +3806,11 @@ void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data)
  */
 void __init sparse_memory_present_with_active_regions(int nid)
 {
-       int i;
+       unsigned long start_pfn, end_pfn;
+       int i, this_nid;
 
-       for_each_active_range_index_in_nid(i, nid)
-               memory_present(early_node_map[i].nid,
-                               early_node_map[i].start_pfn,
-                               early_node_map[i].end_pfn);
+       for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid)
+               memory_present(this_nid, start_pfn, end_pfn);
 }
 
 /**
@@ -3974,13 +3827,15 @@ void __init sparse_memory_present_with_active_regions(int nid)
 void __meminit get_pfn_range_for_nid(unsigned int nid,
                        unsigned long *start_pfn, unsigned long *end_pfn)
 {
+       unsigned long this_start_pfn, this_end_pfn;
        int i;
+
        *start_pfn = -1UL;
        *end_pfn = 0;
 
-       for_each_active_range_index_in_nid(i, nid) {
-               *start_pfn = min(*start_pfn, early_node_map[i].start_pfn);
-               *end_pfn = max(*end_pfn, early_node_map[i].end_pfn);
+       for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
+               *start_pfn = min(*start_pfn, this_start_pfn);
+               *end_pfn = max(*end_pfn, this_end_pfn);
        }
 
        if (*start_pfn == -1UL)
@@ -4083,46 +3938,16 @@ unsigned long __meminit __absent_pages_in_range(int nid,
                                unsigned long range_start_pfn,
                                unsigned long range_end_pfn)
 {
-       int i = 0;
-       unsigned long prev_end_pfn = 0, hole_pages = 0;
-       unsigned long start_pfn;
-
-       /* Find the end_pfn of the first active range of pfns in the node */
-       i = first_active_region_index_in_nid(nid);
-       if (i == -1)
-               return 0;
-
-       prev_end_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
-
-       /* Account for ranges before physical memory on this node */
-       if (early_node_map[i].start_pfn > range_start_pfn)
-               hole_pages = prev_end_pfn - range_start_pfn;
-
-       /* Find all holes for the zone within the node */
-       for (; i != -1; i = next_active_region_index_in_nid(i, nid)) {
-
-               /* No need to continue if prev_end_pfn is outside the zone */
-               if (prev_end_pfn >= range_end_pfn)
-                       break;
-
-               /* Make sure the end of the zone is not within the hole */
-               start_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
-               prev_end_pfn = max(prev_end_pfn, range_start_pfn);
+       unsigned long nr_absent = range_end_pfn - range_start_pfn;
+       unsigned long start_pfn, end_pfn;
+       int i;
 
-               /* Update the hole size cound and move on */
-               if (start_pfn > range_start_pfn) {
-                       BUG_ON(prev_end_pfn > start_pfn);
-                       hole_pages += start_pfn - prev_end_pfn;
-               }
-               prev_end_pfn = early_node_map[i].end_pfn;
+       for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
+               start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
+               end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
+               nr_absent -= end_pfn - start_pfn;
        }
-
-       /* Account for ranges past physical memory on this node */
-       if (range_end_pfn > prev_end_pfn)
-               hole_pages += range_end_pfn -
-                               max(range_start_pfn, prev_end_pfn);
-
-       return hole_pages;
+       return nr_absent;
 }
 
 /**
@@ -4143,14 +3968,14 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
                                        unsigned long zone_type,
                                        unsigned long *ignored)
 {
+       unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
+       unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
        unsigned long node_start_pfn, node_end_pfn;
        unsigned long zone_start_pfn, zone_end_pfn;
 
        get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
-       zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type],
-                                                       node_start_pfn);
-       zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type],
-                                                       node_end_pfn);
+       zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
+       zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
 
        adjust_zone_range_for_zone_movable(nid, zone_type,
                        node_start_pfn, node_end_pfn,
@@ -4158,7 +3983,7 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
        return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
 }
 
-#else
+#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
 static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
                                        unsigned long zone_type,
                                        unsigned long *zones_size)
@@ -4176,7 +4001,7 @@ static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
        return zholes_size[zone_type];
 }
 
-#endif
+#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
 
 static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
                unsigned long *zones_size, unsigned long *zholes_size)
@@ -4399,10 +4224,10 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
         */
        if (pgdat == NODE_DATA(0)) {
                mem_map = NODE_DATA(0)->node_mem_map;
-#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
                if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
                        mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
-#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
+#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
        }
 #endif
 #endif /* CONFIG_FLAT_NODE_MEM_MAP */
@@ -4427,7 +4252,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
        free_area_init_core(pgdat, zones_size, zholes_size);
 }
 
-#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
 
 #if MAX_NUMNODES > 1
 /*
@@ -4448,170 +4273,6 @@ static inline void setup_nr_node_ids(void)
 }
 #endif
 
-/**
- * add_active_range - Register a range of PFNs backed by physical memory
- * @nid: The node ID the range resides on
- * @start_pfn: The start PFN of the available physical memory
- * @end_pfn: The end PFN of the available physical memory
- *
- * These ranges are stored in an early_node_map[] and later used by
- * free_area_init_nodes() to calculate zone sizes and holes. If the
- * range spans a memory hole, it is up to the architecture to ensure
- * the memory is not freed by the bootmem allocator. If possible
- * the range being registered will be merged with existing ranges.
- */
-void __init add_active_range(unsigned int nid, unsigned long start_pfn,
-                                               unsigned long end_pfn)
-{
-       int i;
-
-       mminit_dprintk(MMINIT_TRACE, "memory_register",
-                       "Entering add_active_range(%d, %#lx, %#lx) "
-                       "%d entries of %d used\n",
-                       nid, start_pfn, end_pfn,
-                       nr_nodemap_entries, MAX_ACTIVE_REGIONS);
-
-       mminit_validate_memmodel_limits(&start_pfn, &end_pfn);
-
-       /* Merge with existing active regions if possible */
-       for (i = 0; i < nr_nodemap_entries; i++) {
-               if (early_node_map[i].nid != nid)
-                       continue;
-
-               /* Skip if an existing region covers this new one */
-               if (start_pfn >= early_node_map[i].start_pfn &&
-                               end_pfn <= early_node_map[i].end_pfn)
-                       return;
-
-               /* Merge forward if suitable */
-               if (start_pfn <= early_node_map[i].end_pfn &&
-                               end_pfn > early_node_map[i].end_pfn) {
-                       early_node_map[i].end_pfn = end_pfn;
-                       return;
-               }
-
-               /* Merge backward if suitable */
-               if (start_pfn < early_node_map[i].start_pfn &&
-                               end_pfn >= early_node_map[i].start_pfn) {
-                       early_node_map[i].start_pfn = start_pfn;
-                       return;
-               }
-       }
-
-       /* Check that early_node_map is large enough */
-       if (i >= MAX_ACTIVE_REGIONS) {
-               printk(KERN_CRIT "More than %d memory regions, truncating\n",
-                                                       MAX_ACTIVE_REGIONS);
-               return;
-       }
-
-       early_node_map[i].nid = nid;
-       early_node_map[i].start_pfn = start_pfn;
-       early_node_map[i].end_pfn = end_pfn;
-       nr_nodemap_entries = i + 1;
-}
-
-/**
- * remove_active_range - Shrink an existing registered range of PFNs
- * @nid: The node id the range is on that should be shrunk
- * @start_pfn: The new PFN of the range
- * @end_pfn: The new PFN of the range
- *
- * i386 with NUMA use alloc_remap() to store a node_mem_map on a local node.
- * The map is kept near the end physical page range that has already been
- * registered. This function allows an arch to shrink an existing registered
- * range.
- */
-void __init remove_active_range(unsigned int nid, unsigned long start_pfn,
-                               unsigned long end_pfn)
-{
-       int i, j;
-       int removed = 0;
-
-       printk(KERN_DEBUG "remove_active_range (%d, %lu, %lu)\n",
-                         nid, start_pfn, end_pfn);
-
-       /* Find the old active region end and shrink */
-       for_each_active_range_index_in_nid(i, nid) {
-               if (early_node_map[i].start_pfn >= start_pfn &&
-                   early_node_map[i].end_pfn <= end_pfn) {
-                       /* clear it */
-                       early_node_map[i].start_pfn = 0;
-                       early_node_map[i].end_pfn = 0;
-                       removed = 1;
-                       continue;
-               }
-               if (early_node_map[i].start_pfn < start_pfn &&
-                   early_node_map[i].end_pfn > start_pfn) {
-                       unsigned long temp_end_pfn = early_node_map[i].end_pfn;
-                       early_node_map[i].end_pfn = start_pfn;
-                       if (temp_end_pfn > end_pfn)
-                               add_active_range(nid, end_pfn, temp_end_pfn);
-                       continue;
-               }
-               if (early_node_map[i].start_pfn >= start_pfn &&
-                   early_node_map[i].end_pfn > end_pfn &&
-                   early_node_map[i].start_pfn < end_pfn) {
-                       early_node_map[i].start_pfn = end_pfn;
-                       continue;
-               }
-       }
-
-       if (!removed)
-               return;
-
-       /* remove the blank ones */
-       for (i = nr_nodemap_entries - 1; i > 0; i--) {
-               if (early_node_map[i].nid != nid)
-                       continue;
-               if (early_node_map[i].end_pfn)
-                       continue;
-               /* we found it, get rid of it */
-               for (j = i; j < nr_nodemap_entries - 1; j++)
-                       memcpy(&early_node_map[j], &early_node_map[j+1],
-                               sizeof(early_node_map[j]));
-               j = nr_nodemap_entries - 1;
-               memset(&early_node_map[j], 0, sizeof(early_node_map[j]));
-               nr_nodemap_entries--;
-       }
-}
-
-/**
- * remove_all_active_ranges - Remove all currently registered regions
- *
- * During discovery, it may be found that a table like SRAT is invalid
- * and an alternative discovery method must be used. This function removes
- * all currently registered regions.
- */
-void __init remove_all_active_ranges(void)
-{
-       memset(early_node_map, 0, sizeof(early_node_map));
-       nr_nodemap_entries = 0;
-}
-
-/* Compare two active node_active_regions */
-static int __init cmp_node_active_region(const void *a, const void *b)
-{
-       struct node_active_region *arange = (struct node_active_region *)a;
-       struct node_active_region *brange = (struct node_active_region *)b;
-
-       /* Done this way to avoid overflows */
-       if (arange->start_pfn > brange->start_pfn)
-               return 1;
-       if (arange->start_pfn < brange->start_pfn)
-               return -1;
-
-       return 0;
-}
-
-/* sort the node_map by start_pfn */
-void __init sort_node_map(void)
-{
-       sort(early_node_map, (size_t)nr_nodemap_entries,
-                       sizeof(struct node_active_region),
-                       cmp_node_active_region, NULL);
-}
-
 /**
  * node_map_pfn_alignment - determine the maximum internode alignment
  *
@@ -4634,15 +4295,11 @@ void __init sort_node_map(void)
 unsigned long __init node_map_pfn_alignment(void)
 {
        unsigned long accl_mask = 0, last_end = 0;
+       unsigned long start, end, mask;
        int last_nid = -1;
-       int i;
-
-       for_each_active_range_index_in_nid(i, MAX_NUMNODES) {
-               int nid = early_node_map[i].nid;
-               unsigned long start = early_node_map[i].start_pfn;
-               unsigned long end = early_node_map[i].end_pfn;
-               unsigned long mask;
+       int i, nid;
 
+       for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
                if (!start || last_nid < 0 || last_nid == nid) {
                        last_nid = nid;
                        last_end = end;
@@ -4669,12 +4326,12 @@ unsigned long __init node_map_pfn_alignment(void)
 /* Find the lowest pfn for a node */
 static unsigned long __init find_min_pfn_for_node(int nid)
 {
-       int i;
        unsigned long min_pfn = ULONG_MAX;
+       unsigned long start_pfn;
+       int i;
 
-       /* Assuming a sorted map, the first range found has the starting pfn */
-       for_each_active_range_index_in_nid(i, nid)
-               min_pfn = min(min_pfn, early_node_map[i].start_pfn);
+       for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL)
+               min_pfn = min(min_pfn, start_pfn);
 
        if (min_pfn == ULONG_MAX) {
                printk(KERN_WARNING
@@ -4703,15 +4360,16 @@ unsigned long __init find_min_pfn_with_active_regions(void)
  */
 static unsigned long __init early_calculate_totalpages(void)
 {
-       int i;
        unsigned long totalpages = 0;
+       unsigned long start_pfn, end_pfn;
+       int i, nid;
+
+       for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
+               unsigned long pages = end_pfn - start_pfn;
 
-       for (i = 0; i < nr_nodemap_entries; i++) {
-               unsigned long pages = early_node_map[i].end_pfn -
-                                               early_node_map[i].start_pfn;
                totalpages += pages;
                if (pages)
-                       node_set_state(early_node_map[i].nid, N_HIGH_MEMORY);
+                       node_set_state(nid, N_HIGH_MEMORY);
        }
        return totalpages;
 }
@@ -4766,6 +4424,8 @@ restart:
        /* Spread kernelcore memory as evenly as possible throughout nodes */
        kernelcore_node = required_kernelcore / usable_nodes;
        for_each_node_state(nid, N_HIGH_MEMORY) {
+               unsigned long start_pfn, end_pfn;
+
                /*
                 * Recalculate kernelcore_node if the division per node
                 * now exceeds what is necessary to satisfy the requested
@@ -4782,13 +4442,10 @@ restart:
                kernelcore_remaining = kernelcore_node;
 
                /* Go through each range of PFNs within this node */
-               for_each_active_range_index_in_nid(i, nid) {
-                       unsigned long start_pfn, end_pfn;
+               for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
                        unsigned long size_pages;
 
-                       start_pfn = max(early_node_map[i].start_pfn,
-                                               zone_movable_pfn[nid]);
-                       end_pfn = early_node_map[i].end_pfn;
+                       start_pfn = max(start_pfn, zone_movable_pfn[nid]);
                        if (start_pfn >= end_pfn)
                                continue;
 
@@ -4890,11 +4547,8 @@ static void check_for_regular_memory(pg_data_t *pgdat)
  */
 void __init free_area_init_nodes(unsigned long *max_zone_pfn)
 {
-       unsigned long nid;
-       int i;
-
-       /* Sort early_node_map as initialisation assumes it is sorted */
-       sort_node_map();
+       unsigned long start_pfn, end_pfn;
+       int i, nid;
 
        /* Record where the zone boundaries are */
        memset(arch_zone_lowest_possible_pfn, 0,
@@ -4941,11 +4595,9 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
        }
 
        /* Print out the early_node_map[] */
-       printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
-       for (i = 0; i < nr_nodemap_entries; i++)
-               printk("  %3d: %0#10lx -> %0#10lx\n", early_node_map[i].nid,
-                                               early_node_map[i].start_pfn,
-                                               early_node_map[i].end_pfn);
+       printk("Early memory PFN ranges\n");
+       for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
+               printk("  %3d: %0#10lx -> %0#10lx\n", nid, start_pfn, end_pfn);
 
        /* Initialise every node */
        mminit_verify_pageflags_layout();
@@ -4998,7 +4650,7 @@ static int __init cmdline_parse_movablecore(char *p)
 early_param("kernelcore", cmdline_parse_kernelcore);
 early_param("movablecore", cmdline_parse_movablecore);
 
-#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
+#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
 
 /**
  * set_dma_reserve - set the specified number of pages reserved in the first zone
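The mm/page_alloc.c hunks above drop the early_node_map[] array walks in favour of the memblock-backed for_each_mem_pfn_range() iterator. A minimal sketch of the resulting pattern, assuming CONFIG_HAVE_MEMBLOCK_NODE_MAP and <linux/memblock.h> (illustrative kernel-context code, not part of the patch):

	/* Walk every registered memory range; pass MAX_NUMNODES to visit
	 * all nodes, or a node id to restrict the walk to that node. */
	unsigned long start_pfn, end_pfn, total = 0;
	int i, nid;

	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
		total += end_pfn - start_pfn;	/* cf. early_calculate_totalpages() */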
index ed3334d..09ccee8 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -368,7 +368,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page
        VM_BUG_ON(!irqs_disabled());
 #ifdef CONFIG_CMPXCHG_DOUBLE
        if (s->flags & __CMPXCHG_DOUBLE) {
-               if (cmpxchg_double(&page->freelist,
+               if (cmpxchg_double(&page->freelist, &page->counters,
                        freelist_old, counters_old,
                        freelist_new, counters_new))
                return 1;
@@ -402,7 +402,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
 {
 #ifdef CONFIG_CMPXCHG_DOUBLE
        if (s->flags & __CMPXCHG_DOUBLE) {
-               if (cmpxchg_double(&page->freelist,
+               if (cmpxchg_double(&page->freelist, &page->counters,
                        freelist_old, counters_old,
                        freelist_new, counters_new))
                return 1;
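Both mm/slub.c hunks switch to the explicit two-pointer form of cmpxchg_double(), which atomically compares and swaps two adjacent machine words (here page->freelist and page->counters) and succeeds only if both still hold their expected values; the two fields must sit next to each other and be naturally aligned for the underlying double-word instruction. A hedged kernel-context sketch of the call shape on a hypothetical pair of fields:

	struct slot {
		void		*freelist;	/* word 1 */
		unsigned long	counters;	/* word 2, adjacent to word 1 */
	};

	/* Returns true only if both words were atomically replaced. */
	static inline bool slot_try_swap(struct slot *s,
					 void *old_f, unsigned long old_c,
					 void *new_f, unsigned long new_c)
	{
		return cmpxchg_double(&s->freelist, &s->counters,
				      old_f, old_c, new_f, new_c);
	}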
index 2cad581..e56162c 100644
@@ -2903,7 +2903,7 @@ static int bond_ioctl(struct net *net, unsigned int cmd,
 
                return dev_ioctl(net, cmd, uifr);
        default:
-               return -EINVAL;
+               return -ENOIOCTLCMD;
        }
 }
 
@@ -3230,20 +3230,6 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock,
                return sock_do_ioctl(net, sock, cmd, arg);
        }
 
-       /* Prevent warning from compat_sys_ioctl, these always
-        * result in -EINVAL in the native case anyway. */
-       switch (cmd) {
-       case SIOCRTMSG:
-       case SIOCGIFCOUNT:
-       case SIOCSRARP:
-       case SIOCGRARP:
-       case SIOCDRARP:
-       case SIOCSIFLINK:
-       case SIOCGIFSLAVE:
-       case SIOCSIFSLAVE:
-               return -EINVAL;
-       }
-
        return -ENOIOCTLCMD;
 }
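The compat socket ioctl path now reports unhandled commands as -ENOIOCTLCMD rather than -EINVAL. By kernel convention -ENOIOCTLCMD is an in-kernel "not handled here" marker: callers may try another handler and are expected to turn it into -ENOTTY before it reaches userspace, instead of a misleading "invalid argument". A hedged sketch of that dispatch convention (hypothetical helpers, not the actual net/socket.c caller):

	static long dispatch_ioctl(unsigned int cmd, unsigned long arg)
	{
		long ret = compat_handler(cmd, arg);	/* hypothetical */

		if (ret == -ENOIOCTLCMD)
			ret = native_handler(cmd, arg);	/* hypothetical fallback */
		if (ret == -ENOIOCTLCMD)
			ret = -ENOTTY;			/* what userspace sees */
		return ret;
	}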
 
index fe6762e..c89f9e1 100644
@@ -22,7 +22,7 @@ OPTIONS
 -------
 -i::
 --input=::
-        Input file name. (default: perf.data)
+        Input file name. (default: perf.data unless stdin is a fifo)
 
 -d::
 --dsos=<dso[,dso...]>::
@@ -66,7 +66,7 @@ OPTIONS
        used. This interface starts by centering on the line with the most
        samples; TAB/UNTAB cycles through the lines with more samples.
 
--c::
+-C::
 --cpu:: Only report samples for the list of CPUs provided. Multiple CPUs can
        be provided as a comma-separated list with no space: 0,1. Ranges of
        CPUs are specified with -: 0-2. Default is to report samples on all
index cc22325..25c52ef 100644
@@ -26,7 +26,7 @@ OPTIONS
         Show only DSOs with hits.
 -i::
 --input=::
-        Input file name. (default: perf.data)
+        Input file name. (default: perf.data unless stdin is a fifo)
 -f::
 --force::
        Don't do ownership validation.
index 0cada9e..0507ec7 100644
@@ -18,7 +18,7 @@ OPTIONS
 -------
 -i::
 --input=::
-        Input file name. (default: perf.data)
+        Input file name. (default: perf.data unless stdin is a fifo)
 
 SEE ALSO
 --------
index a52fcde..7c8fbbf 100644
@@ -23,7 +23,7 @@ OPTIONS
 -------
 -i <file>::
 --input=<file>::
-       Select the input file (default: perf.data)
+       Select the input file (default: perf.data unless stdin is a fifo)
 
 --caller::
        Show per-callsite statistics
index 4a26a2f..d6b2a4f 100644
@@ -29,7 +29,7 @@ COMMON OPTIONS
 
 -i::
 --input=<file>::
-        Input file name.
+        Input file name. (default: perf.data unless stdin is a fifo)
 
 -v::
 --verbose::
index 5a520f8..2937f7e 100644
@@ -89,7 +89,7 @@ OPTIONS
 
 -m::
 --mmap-pages=::
-       Number of mmap data pages.
+       Number of mmap data pages. Must be a power of two.
 
 -g::
 --call-graph::
index 212f24d..9b430e9 100644
@@ -19,7 +19,7 @@ OPTIONS
 -------
 -i::
 --input=::
-        Input file name. (default: perf.data)
+        Input file name. (default: perf.data unless stdin is a fifo)
 
 -v::
 --verbose::
@@ -39,7 +39,7 @@ OPTIONS
 -T::
 --threads::
        Show per-thread event counters
--C::
+-c::
 --comms=::
        Only consider symbols in these comms. CSV that understands
        file://filename entries.
@@ -80,9 +80,10 @@ OPTIONS
 --dump-raw-trace::
         Dump raw trace in ASCII.
 
--g [type,min,order]::
+-g [type,min[,limit],order]::
 --call-graph::
-        Display call chains using type, min percent threshold and order.
+        Display call chains using type, min percent threshold, optional print
+       limit and order.
        type can be either:
        - flat: single column, linear exposure of call chains.
        - graph: use a graph tree, displaying absolute overhead rates.
@@ -128,7 +129,7 @@ OPTIONS
 --symfs=<directory>::
         Look for files with symbols relative to this directory.
 
--c::
+-C::
 --cpu:: Only report samples for the list of CPUs provided. Multiple CPUs can
        be provided as a comma-separated list with no space: 0,1. Ranges of
        CPUs are specified with -: 0-2. Default is to report samples on all
index 5b212b5..8ff4df9 100644
@@ -40,7 +40,7 @@ OPTIONS
 -------
 -i::
 --input=<file>::
-        Input file name. (default: perf.data)
+        Input file name. (default: perf.data unless stdin is a fifo)
 
 -v::
 --verbose::
index dec87ec..2f6cef4 100644
@@ -106,7 +106,7 @@ OPTIONS
 
 -i::
 --input=::
-        Input file name.
+        Input file name. (default: perf.data unless stdin is a fifo)
 
 -d::
 --debug-mode::
@@ -182,12 +182,17 @@ OPTIONS
 --hide-call-graph::
         When printing symbols do not display call chain.
 
--c::
+-C::
 --cpu:: Only report samples for the list of CPUs provided. Multiple CPUs can
        be provided as a comma-separated list with no space: 0,1. Ranges of
        CPUs are specified with -: 0-2. Default is to report samples on all
        CPUs.
 
+-c::
+--comms=::
+       Only display events for these comms. CSV that understands
+       file://filename entries.
+
 -I::
 --show-info::
        Display extended information about the perf.data file. This adds
index 2c3b462..b24ac40 100644
@@ -8,13 +8,19 @@ perf-test - Runs sanity tests.
 SYNOPSIS
 --------
 [verse]
-'perf test <options>'
+'perf test [<options>] [{list <test-name-fragment>|[<test-name-fragments>|<test-numbers>]}]'
 
 DESCRIPTION
 -----------
 This command does assorted sanity tests, initially through linked routines but
 also will look for a directory with more tests in the form of scripts.
 
+To get a list of available tests use 'perf test list'; specifying a test name
+fragment will show all tests that contain it.
+
+To run only specific tests, pass test name fragments or the numbers obtained
+from 'perf test list'.
+
 OPTIONS
 -------
 -v::
index d7b79e2..1632b0e 100644
@@ -27,7 +27,7 @@ OPTIONS
         Select the output file (default: output.svg)
 -i::
 --input=::
-        Select the input file (default: perf.data)
+        Select the input file (default: perf.data unless stdin is a fifo)
 -w::
 --width=::
         Select the width of the SVG file (default: 1000)
index b98e307..ac86d67 100644
@@ -278,6 +278,7 @@ LIB_H += util/strbuf.h
 LIB_H += util/strlist.h
 LIB_H += util/strfilter.h
 LIB_H += util/svghelper.h
+LIB_H += util/tool.h
 LIB_H += util/run-command.h
 LIB_H += util/sigchain.h
 LIB_H += util/symbol.h
index 46b4c24..214ba7f 100644
 #include "util/sort.h"
 #include "util/hist.h"
 #include "util/session.h"
+#include "util/tool.h"
 
 #include <linux/bitmap.h>
 
-static char            const *input_name = "perf.data";
-
-static bool            force, use_tui, use_stdio;
-
-static bool            full_paths;
-
-static bool            print_line;
-
-static const char *sym_hist_filter;
-
-static const char      *cpu_list;
-static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
+struct perf_annotate {
+       struct perf_tool tool;
+       char const *input_name;
+       bool       force, use_tui, use_stdio;
+       bool       full_paths;
+       bool       print_line;
+       const char *sym_hist_filter;
+       const char *cpu_list;
+       DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
+};
 
-static int perf_evlist__add_sample(struct perf_evlist *evlist,
-                                  struct perf_sample *sample,
-                                  struct perf_evsel *evsel,
-                                  struct addr_location *al)
+static int perf_evsel__add_sample(struct perf_evsel *evsel,
+                                 struct perf_sample *sample,
+                                 struct addr_location *al,
+                                 struct perf_annotate *ann)
 {
        struct hist_entry *he;
        int ret;
 
-       if (sym_hist_filter != NULL &&
-           (al->sym == NULL || strcmp(sym_hist_filter, al->sym->name) != 0)) {
+       if (ann->sym_hist_filter != NULL &&
+           (al->sym == NULL ||
+            strcmp(ann->sym_hist_filter, al->sym->name) != 0)) {
                /* We're only interested in a symbol named sym_hist_filter */
                if (al->sym != NULL) {
                        rb_erase(&al->sym->rb_node,
@@ -69,8 +69,7 @@ static int perf_evlist__add_sample(struct perf_evlist *evlist,
        ret = 0;
        if (he->ms.sym != NULL) {
                struct annotation *notes = symbol__annotation(he->ms.sym);
-               if (notes->src == NULL &&
-                   symbol__alloc_hist(he->ms.sym, evlist->nr_entries) < 0)
+               if (notes->src == NULL && symbol__alloc_hist(he->ms.sym) < 0)
                        return -ENOMEM;
 
                ret = hist_entry__inc_addr_samples(he, evsel->idx, al->addr);
@@ -81,25 +80,26 @@ static int perf_evlist__add_sample(struct perf_evlist *evlist,
        return ret;
 }
 
-static int process_sample_event(union perf_event *event,
+static int process_sample_event(struct perf_tool *tool,
+                               union perf_event *event,
                                struct perf_sample *sample,
                                struct perf_evsel *evsel,
-                               struct perf_session *session)
+                               struct machine *machine)
 {
+       struct perf_annotate *ann = container_of(tool, struct perf_annotate, tool);
        struct addr_location al;
 
-       if (perf_event__preprocess_sample(event, session, &al, sample,
+       if (perf_event__preprocess_sample(event, machine, &al, sample,
                                          symbol__annotate_init) < 0) {
                pr_warning("problem processing %d event, skipping it.\n",
                           event->header.type);
                return -1;
        }
 
-       if (cpu_list && !test_bit(sample->cpu, cpu_bitmap))
+       if (ann->cpu_list && !test_bit(sample->cpu, ann->cpu_bitmap))
                return 0;
 
-       if (!al.filtered &&
-           perf_evlist__add_sample(session->evlist, sample, evsel, &al)) {
+       if (!al.filtered && perf_evsel__add_sample(evsel, sample, &al, ann)) {
                pr_warning("problem incrementing symbol count, "
                           "skipping event\n");
                return -1;
@@ -108,14 +108,15 @@ static int process_sample_event(union perf_event *event,
        return 0;
 }
 
-static int hist_entry__tty_annotate(struct hist_entry *he, int evidx)
+static int hist_entry__tty_annotate(struct hist_entry *he, int evidx,
+                                   struct perf_annotate *ann)
 {
        return symbol__tty_annotate(he->ms.sym, he->ms.map, evidx,
-                                   print_line, full_paths, 0, 0);
+                                   ann->print_line, ann->full_paths, 0, 0);
 }
 
 static void hists__find_annotations(struct hists *self, int evidx,
-                                   int nr_events)
+                                   struct perf_annotate *ann)
 {
        struct rb_node *nd = rb_first(&self->entries), *next;
        int key = K_RIGHT;
@@ -138,8 +139,7 @@ find_next:
                }
 
                if (use_browser > 0) {
-                       key = hist_entry__tui_annotate(he, evidx, nr_events,
-                                                      NULL, NULL, 0);
+                       key = hist_entry__tui_annotate(he, evidx, NULL, NULL, 0);
                        switch (key) {
                        case K_RIGHT:
                                next = rb_next(nd);
@@ -154,7 +154,7 @@ find_next:
                        if (next != NULL)
                                nd = next;
                } else {
-                       hist_entry__tty_annotate(he, evidx);
+                       hist_entry__tty_annotate(he, evidx, ann);
                        nd = rb_next(nd);
                        /*
                         * Since we have a hist_entry per IP for the same
@@ -167,33 +167,26 @@ find_next:
        }
 }
 
-static struct perf_event_ops event_ops = {
-       .sample = process_sample_event,
-       .mmap   = perf_event__process_mmap,
-       .comm   = perf_event__process_comm,
-       .fork   = perf_event__process_task,
-       .ordered_samples = true,
-       .ordering_requires_timestamps = true,
-};
-
-static int __cmd_annotate(void)
+static int __cmd_annotate(struct perf_annotate *ann)
 {
        int ret;
        struct perf_session *session;
        struct perf_evsel *pos;
        u64 total_nr_samples;
 
-       session = perf_session__new(input_name, O_RDONLY, force, false, &event_ops);
+       session = perf_session__new(ann->input_name, O_RDONLY,
+                                   ann->force, false, &ann->tool);
        if (session == NULL)
                return -ENOMEM;
 
-       if (cpu_list) {
-               ret = perf_session__cpu_bitmap(session, cpu_list, cpu_bitmap);
+       if (ann->cpu_list) {
+               ret = perf_session__cpu_bitmap(session, ann->cpu_list,
+                                              ann->cpu_bitmap);
                if (ret)
                        goto out_delete;
        }
 
-       ret = perf_session__process_events(session, &event_ops);
+       ret = perf_session__process_events(session, &ann->tool);
        if (ret)
                goto out_delete;
 
@@ -217,13 +210,12 @@ static int __cmd_annotate(void)
                        total_nr_samples += nr_samples;
                        hists__collapse_resort(hists);
                        hists__output_resort(hists);
-                       hists__find_annotations(hists, pos->idx,
-                                               session->evlist->nr_entries);
+                       hists__find_annotations(hists, pos->idx, ann);
                }
        }
 
        if (total_nr_samples == 0) {
-               ui__warning("The %s file has no samples!\n", input_name);
+               ui__warning("The %s file has no samples!\n", session->filename);
                goto out_delete;
        }
 out_delete:
@@ -247,29 +239,41 @@ static const char * const annotate_usage[] = {
        NULL
 };
 
-static const struct option options[] = {
-       OPT_STRING('i', "input", &input_name, "file",
+int cmd_annotate(int argc, const char **argv, const char *prefix __used)
+{
+       struct perf_annotate annotate = {
+               .tool = {
+                       .sample = process_sample_event,
+                       .mmap   = perf_event__process_mmap,
+                       .comm   = perf_event__process_comm,
+                       .fork   = perf_event__process_task,
+                       .ordered_samples = true,
+                       .ordering_requires_timestamps = true,
+               },
+       };
+       const struct option options[] = {
+       OPT_STRING('i', "input", &annotate.input_name, "file",
                    "input file name"),
        OPT_STRING('d', "dsos", &symbol_conf.dso_list_str, "dso[,dso...]",
                   "only consider symbols in these dsos"),
-       OPT_STRING('s', "symbol", &sym_hist_filter, "symbol",
+       OPT_STRING('s', "symbol", &annotate.sym_hist_filter, "symbol",
                    "symbol to annotate"),
-       OPT_BOOLEAN('f', "force", &force, "don't complain, do it"),
+       OPT_BOOLEAN('f', "force", &annotate.force, "don't complain, do it"),
        OPT_INCR('v', "verbose", &verbose,
                    "be more verbose (show symbol address, etc)"),
        OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
                    "dump raw trace in ASCII"),
-       OPT_BOOLEAN(0, "tui", &use_tui, "Use the TUI interface"),
-       OPT_BOOLEAN(0, "stdio", &use_stdio, "Use the stdio interface"),
+       OPT_BOOLEAN(0, "tui", &annotate.use_tui, "Use the TUI interface"),
+       OPT_BOOLEAN(0, "stdio", &annotate.use_stdio, "Use the stdio interface"),
        OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name,
                   "file", "vmlinux pathname"),
        OPT_BOOLEAN('m', "modules", &symbol_conf.use_modules,
                    "load module symbols - WARNING: use only with -k and LIVE kernel"),
-       OPT_BOOLEAN('l', "print-line", &print_line,
+       OPT_BOOLEAN('l', "print-line", &annotate.print_line,
                    "print matching source lines (may be slow)"),
-       OPT_BOOLEAN('P', "full-paths", &full_paths,
+       OPT_BOOLEAN('P', "full-paths", &annotate.full_paths,
                    "Don't shorten the displayed pathnames"),
-       OPT_STRING('c', "cpu", &cpu_list, "cpu", "list of cpus to profile"),
+       OPT_STRING('C', "cpu", &annotate.cpu_list, "cpu", "list of cpus to profile"),
        OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory",
                   "Look for files with symbols relative to this directory"),
        OPT_BOOLEAN(0, "source", &symbol_conf.annotate_src,
@@ -279,15 +283,13 @@ static const struct option options[] = {
        OPT_STRING('M', "disassembler-style", &disassembler_style, "disassembler style",
                   "Specify disassembler style (e.g. -M intel for intel syntax)"),
        OPT_END()
-};
+       };
 
-int cmd_annotate(int argc, const char **argv, const char *prefix __used)
-{
        argc = parse_options(argc, argv, options, annotate_usage, 0);
 
-       if (use_stdio)
+       if (annotate.use_stdio)
                use_browser = 0;
-       else if (use_tui)
+       else if (annotate.use_tui)
                use_browser = 1;
 
        setup_browser(true);
@@ -308,7 +310,7 @@ int cmd_annotate(int argc, const char **argv, const char *prefix __used)
                if (argc > 1)
                        usage_with_options(annotate_usage, options);
 
-               sym_hist_filter = argv[0];
+               annotate.sym_hist_filter = argv[0];
        }
 
        if (field_sep && *field_sep == '.') {
@@ -316,5 +318,5 @@ int cmd_annotate(int argc, const char **argv, const char *prefix __used)
                return -1;
        }
 
-       return __cmd_annotate();
+       return __cmd_annotate(&annotate);
 }
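builtin-annotate.c is converted from file-scope globals and a static perf_event_ops to a struct perf_annotate that embeds a struct perf_tool; each callback now receives the tool pointer and recovers its per-command state with container_of(). A self-contained userspace sketch of that embed-and-recover pattern (simplified types and names, not the real perf ones):

	#include <stddef.h>
	#include <stdio.h>

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	struct tool {				/* generic callback table */
		int (*sample)(struct tool *tool, int value);
	};

	struct annotate {			/* per-command private state */
		struct tool tool;		/* must stay embedded */
		long nr_samples;
	};

	static int on_sample(struct tool *tool, int value)
	{
		struct annotate *ann = container_of(tool, struct annotate, tool);

		ann->nr_samples += value;	/* no globals required */
		return 0;
	}

	int main(void)
	{
		struct annotate ann = { .tool = { .sample = on_sample } };

		ann.tool.sample(&ann.tool, 3);
		printf("%ld\n", ann.nr_samples);	/* prints 3 */
		return 0;
	}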
index cb690a6..5248046 100644
@@ -18,7 +18,7 @@
 
 #include <libelf.h>
 
-static char const *input_name = "perf.data";
+static const char *input_name;
 static bool force;
 static bool show_kernel;
 static bool with_hits;
@@ -39,24 +39,6 @@ static const struct option options[] = {
        OPT_END()
 };
 
-static int perf_session__list_build_ids(void)
-{
-       struct perf_session *session;
-
-       session = perf_session__new(input_name, O_RDONLY, force, false,
-                                   &build_id__mark_dso_hit_ops);
-       if (session == NULL)
-               return -1;
-
-       if (with_hits)
-               perf_session__process_events(session, &build_id__mark_dso_hit_ops);
-
-       perf_session__fprintf_dsos_buildid(session, stdout, with_hits);
-
-       perf_session__delete(session);
-       return 0;
-}
-
 static int sysfs__fprintf_build_id(FILE *fp)
 {
        u8 kallsyms_build_id[BUILD_ID_SIZE];
@@ -85,17 +67,36 @@ static int filename__fprintf_build_id(const char *name, FILE *fp)
        return fprintf(fp, "%s\n", sbuild_id);
 }
 
-static int __cmd_buildid_list(void)
+static int perf_session__list_build_ids(void)
 {
-       if (show_kernel)
-               return sysfs__fprintf_build_id(stdout);
+       struct perf_session *session;
 
        elf_version(EV_CURRENT);
+
+       session = perf_session__new(input_name, O_RDONLY, force, false,
+                                   &build_id__mark_dso_hit_ops);
+       if (session == NULL)
+               return -1;
+
        /*
-        * See if this is an ELF file first:
-        */
-       if (filename__fprintf_build_id(input_name, stdout))
-               return 0;
+        * See if this is an ELF file first:
+        */
+       if (filename__fprintf_build_id(session->filename, stdout))
+               goto out;
+
+       if (with_hits)
+               perf_session__process_events(session, &build_id__mark_dso_hit_ops);
+
+       perf_session__fprintf_dsos_buildid(session, stdout, with_hits);
+out:
+       perf_session__delete(session);
+       return 0;
+}
+
+static int __cmd_buildid_list(void)
+{
+       if (show_kernel)
+               return sysfs__fprintf_build_id(stdout);
 
        return perf_session__list_build_ids();
 }
index b39f3a1..4f19513 100644
@@ -9,7 +9,9 @@
 #include "util/debug.h"
 #include "util/event.h"
 #include "util/hist.h"
+#include "util/evsel.h"
 #include "util/session.h"
+#include "util/tool.h"
 #include "util/sort.h"
 #include "util/symbol.h"
 #include "util/util.h"
@@ -30,14 +32,15 @@ static int hists__add_entry(struct hists *self,
        return -ENOMEM;
 }
 
-static int diff__process_sample_event(union perf_event *event,
+static int diff__process_sample_event(struct perf_tool *tool __used,
+                                     union perf_event *event,
                                      struct perf_sample *sample,
                                      struct perf_evsel *evsel __used,
-                                     struct perf_session *session)
+                                     struct machine *machine)
 {
        struct addr_location al;
 
-       if (perf_event__preprocess_sample(event, session, &al, sample, NULL) < 0) {
+       if (perf_event__preprocess_sample(event, machine, &al, sample, NULL) < 0) {
                pr_warning("problem processing %d event, skipping it.\n",
                           event->header.type);
                return -1;
@@ -46,16 +49,16 @@ static int diff__process_sample_event(union perf_event *event,
        if (al.filtered || al.sym == NULL)
                return 0;
 
-       if (hists__add_entry(&session->hists, &al, sample->period)) {
+       if (hists__add_entry(&evsel->hists, &al, sample->period)) {
                pr_warning("problem incrementing symbol period, skipping event\n");
                return -1;
        }
 
-       session->hists.stats.total_period += sample->period;
+       evsel->hists.stats.total_period += sample->period;
        return 0;
 }
 
-static struct perf_event_ops event_ops = {
+static struct perf_tool perf_diff = {
        .sample = diff__process_sample_event,
        .mmap   = perf_event__process_mmap,
        .comm   = perf_event__process_comm,
@@ -145,13 +148,13 @@ static int __cmd_diff(void)
        int ret, i;
        struct perf_session *session[2];
 
-       session[0] = perf_session__new(input_old, O_RDONLY, force, false, &event_ops);
-       session[1] = perf_session__new(input_new, O_RDONLY, force, false, &event_ops);
+       session[0] = perf_session__new(input_old, O_RDONLY, force, false, &perf_diff);
+       session[1] = perf_session__new(input_new, O_RDONLY, force, false, &perf_diff);
        if (session[0] == NULL || session[1] == NULL)
                return -ENOMEM;
 
        for (i = 0; i < 2; ++i) {
-               ret = perf_session__process_events(session[i], &event_ops);
+               ret = perf_session__process_events(session[i], &perf_diff);
                if (ret)
                        goto out_delete;
        }
index 4c5e9e0..2676032 100644
@@ -15,7 +15,7 @@
 #include "util/parse-options.h"
 #include "util/session.h"
 
-static char const *input_name = "perf.data";
+static const char *input_name;
 
 static int __cmd_evlist(void)
 {
index 8dfc12b..09c1061 100644
@@ -9,6 +9,7 @@
 
 #include "perf.h"
 #include "util/session.h"
+#include "util/tool.h"
 #include "util/debug.h"
 
 #include "util/parse-options.h"
@@ -16,8 +17,9 @@
 static char            const *input_name = "-";
 static bool            inject_build_ids;
 
-static int perf_event__repipe_synth(union perf_event *event,
-                                   struct perf_session *session __used)
+static int perf_event__repipe_synth(struct perf_tool *tool __used,
+                                   union perf_event *event,
+                                   struct machine *machine __used)
 {
        uint32_t size;
        void *buf = event;
@@ -36,41 +38,70 @@ static int perf_event__repipe_synth(union perf_event *event,
        return 0;
 }
 
-static int perf_event__repipe(union perf_event *event,
+static int perf_event__repipe_op2_synth(struct perf_tool *tool,
+                                       union perf_event *event,
+                                       struct perf_session *session __used)
+{
+       return perf_event__repipe_synth(tool, event, NULL);
+}
+
+static int perf_event__repipe_event_type_synth(struct perf_tool *tool,
+                                              union perf_event *event)
+{
+       return perf_event__repipe_synth(tool, event, NULL);
+}
+
+static int perf_event__repipe_tracing_data_synth(union perf_event *event,
+                                                struct perf_session *session __used)
+{
+       return perf_event__repipe_synth(NULL, event, NULL);
+}
+
+static int perf_event__repipe_attr(union perf_event *event,
+                                  struct perf_evlist **pevlist __used)
+{
+       return perf_event__repipe_synth(NULL, event, NULL);
+}
+
+static int perf_event__repipe(struct perf_tool *tool,
+                             union perf_event *event,
                              struct perf_sample *sample __used,
-                             struct perf_session *session)
+                             struct machine *machine)
 {
-       return perf_event__repipe_synth(event, session);
+       return perf_event__repipe_synth(tool, event, machine);
 }
 
-static int perf_event__repipe_sample(union perf_event *event,
+static int perf_event__repipe_sample(struct perf_tool *tool,
+                                    union perf_event *event,
                              struct perf_sample *sample __used,
                              struct perf_evsel *evsel __used,
-                             struct perf_session *session)
+                             struct machine *machine)
 {
-       return perf_event__repipe_synth(event, session);
+       return perf_event__repipe_synth(tool, event, machine);
 }
 
-static int perf_event__repipe_mmap(union perf_event *event,
+static int perf_event__repipe_mmap(struct perf_tool *tool,
+                                  union perf_event *event,
                                   struct perf_sample *sample,
-                                  struct perf_session *session)
+                                  struct machine *machine)
 {
        int err;
 
-       err = perf_event__process_mmap(event, sample, session);
-       perf_event__repipe(event, sample, session);
+       err = perf_event__process_mmap(tool, event, sample, machine);
+       perf_event__repipe(tool, event, sample, machine);
 
        return err;
 }
 
-static int perf_event__repipe_task(union perf_event *event,
+static int perf_event__repipe_task(struct perf_tool *tool,
+                                  union perf_event *event,
                                   struct perf_sample *sample,
-                                  struct perf_session *session)
+                                  struct machine *machine)
 {
        int err;
 
-       err = perf_event__process_task(event, sample, session);
-       perf_event__repipe(event, sample, session);
+       err = perf_event__process_task(tool, event, sample, machine);
+       perf_event__repipe(tool, event, sample, machine);
 
        return err;
 }
@@ -80,7 +111,7 @@ static int perf_event__repipe_tracing_data(union perf_event *event,
 {
        int err;
 
-       perf_event__repipe_synth(event, session);
+       perf_event__repipe_synth(NULL, event, NULL);
        err = perf_event__process_tracing_data(event, session);
 
        return err;
@@ -100,10 +131,10 @@ static int dso__read_build_id(struct dso *self)
        return -1;
 }
 
-static int dso__inject_build_id(struct dso *self, struct perf_session *session)
+static int dso__inject_build_id(struct dso *self, struct perf_tool *tool,
+                               struct machine *machine)
 {
        u16 misc = PERF_RECORD_MISC_USER;
-       struct machine *machine;
        int err;
 
        if (dso__read_build_id(self) < 0) {
@@ -111,17 +142,11 @@ static int dso__inject_build_id(struct dso *self, struct perf_session *session)
                return -1;
        }
 
-       machine = perf_session__find_host_machine(session);
-       if (machine == NULL) {
-               pr_err("Can't find machine for session\n");
-               return -1;
-       }
-
        if (self->kernel)
                misc = PERF_RECORD_MISC_KERNEL;
 
-       err = perf_event__synthesize_build_id(self, misc, perf_event__repipe,
-                                             machine, session);
+       err = perf_event__synthesize_build_id(tool, self, misc, perf_event__repipe,
+                                             machine);
        if (err) {
                pr_err("Can't synthesize build_id event for %s\n", self->long_name);
                return -1;
@@ -130,10 +155,11 @@ static int dso__inject_build_id(struct dso *self, struct perf_session *session)
        return 0;
 }
 
-static int perf_event__inject_buildid(union perf_event *event,
+static int perf_event__inject_buildid(struct perf_tool *tool,
+                                     union perf_event *event,
                                      struct perf_sample *sample,
                                      struct perf_evsel *evsel __used,
-                                     struct perf_session *session)
+                                     struct machine *machine)
 {
        struct addr_location al;
        struct thread *thread;
@@ -141,21 +167,21 @@ static int perf_event__inject_buildid(union perf_event *event,
 
        cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
 
-       thread = perf_session__findnew(session, event->ip.pid);
+       thread = machine__findnew_thread(machine, event->ip.pid);
        if (thread == NULL) {
                pr_err("problem processing %d event, skipping it.\n",
                       event->header.type);
                goto repipe;
        }
 
-       thread__find_addr_map(thread, session, cpumode, MAP__FUNCTION,
-                             event->ip.pid, event->ip.ip, &al);
+       thread__find_addr_map(thread, machine, cpumode, MAP__FUNCTION,
+                             event->ip.ip, &al);
 
        if (al.map != NULL) {
                if (!al.map->dso->hit) {
                        al.map->dso->hit = 1;
                        if (map__load(al.map, NULL) >= 0) {
-                               dso__inject_build_id(al.map->dso, session);
+                               dso__inject_build_id(al.map->dso, tool, machine);
                                /*
                                 * If this fails, too bad, let the other side
                                 * account this as unresolved.
@@ -168,24 +194,24 @@ static int perf_event__inject_buildid(union perf_event *event,
        }
 
 repipe:
-       perf_event__repipe(event, sample, session);
+       perf_event__repipe(tool, event, sample, machine);
        return 0;
 }
 
-struct perf_event_ops inject_ops = {
+struct perf_tool perf_inject = {
        .sample         = perf_event__repipe_sample,
        .mmap           = perf_event__repipe,
        .comm           = perf_event__repipe,
        .fork           = perf_event__repipe,
        .exit           = perf_event__repipe,
        .lost           = perf_event__repipe,
-       .read           = perf_event__repipe,
+       .read           = perf_event__repipe_sample,
        .throttle       = perf_event__repipe,
        .unthrottle     = perf_event__repipe,
-       .attr           = perf_event__repipe_synth,
-       .event_type     = perf_event__repipe_synth,
-       .tracing_data   = perf_event__repipe_synth,
-       .build_id       = perf_event__repipe_synth,
+       .attr           = perf_event__repipe_attr,
+       .event_type     = perf_event__repipe_event_type_synth,
+       .tracing_data   = perf_event__repipe_tracing_data_synth,
+       .build_id       = perf_event__repipe_op2_synth,
 };
 
 extern volatile int session_done;
@@ -203,17 +229,17 @@ static int __cmd_inject(void)
        signal(SIGINT, sig_handler);
 
        if (inject_build_ids) {
-               inject_ops.sample       = perf_event__inject_buildid;
-               inject_ops.mmap         = perf_event__repipe_mmap;
-               inject_ops.fork         = perf_event__repipe_task;
-               inject_ops.tracing_data = perf_event__repipe_tracing_data;
+               perf_inject.sample       = perf_event__inject_buildid;
+               perf_inject.mmap         = perf_event__repipe_mmap;
+               perf_inject.fork         = perf_event__repipe_task;
+               perf_inject.tracing_data = perf_event__repipe_tracing_data;
        }
 
-       session = perf_session__new(input_name, O_RDONLY, false, true, &inject_ops);
+       session = perf_session__new(input_name, O_RDONLY, false, true, &perf_inject);
        if (session == NULL)
                return -ENOMEM;
 
-       ret = perf_session__process_events(session, &inject_ops);
+       ret = perf_session__process_events(session, &perf_inject);
 
        perf_session__delete(session);
 
index 225e963..fe1ad8f 100644
@@ -7,6 +7,7 @@
 #include "util/thread.h"
 #include "util/header.h"
 #include "util/session.h"
+#include "util/tool.h"
 
 #include "util/parse-options.h"
 #include "util/trace-event.h"
@@ -18,7 +19,7 @@
 struct alloc_stat;
 typedef int (*sort_fn_t)(struct alloc_stat *, struct alloc_stat *);
 
-static char const              *input_name = "perf.data";
+static const char              *input_name;
 
 static int                     alloc_flag;
 static int                     caller_flag;
@@ -303,12 +304,13 @@ static void process_raw_event(union perf_event *raw_event __used, void *data,
        }
 }
 
-static int process_sample_event(union perf_event *event,
+static int process_sample_event(struct perf_tool *tool __used,
+                               union perf_event *event,
                                struct perf_sample *sample,
                                struct perf_evsel *evsel __used,
-                               struct perf_session *session)
+                               struct machine *machine)
 {
-       struct thread *thread = perf_session__findnew(session, event->ip.pid);
+       struct thread *thread = machine__findnew_thread(machine, event->ip.pid);
 
        if (thread == NULL) {
                pr_debug("problem processing %d event, skipping it.\n",
@@ -324,7 +326,7 @@ static int process_sample_event(union perf_event *event,
        return 0;
 }
 
-static struct perf_event_ops event_ops = {
+static struct perf_tool perf_kmem = {
        .sample                 = process_sample_event,
        .comm                   = perf_event__process_comm,
        .ordered_samples        = true,
@@ -483,7 +485,7 @@ static int __cmd_kmem(void)
 {
        int err = -EINVAL;
        struct perf_session *session = perf_session__new(input_name, O_RDONLY,
-                                                        0, false, &event_ops);
+                                                        0, false, &perf_kmem);
        if (session == NULL)
                return -ENOMEM;
 
@@ -494,7 +496,7 @@ static int __cmd_kmem(void)
                goto out_delete;
 
        setup_pager();
-       err = perf_session__process_events(session, &event_ops);
+       err = perf_session__process_events(session, &perf_kmem);
        if (err != 0)
                goto out_delete;
        sort_result();
index 34d1e85..032324a 100644
@@ -38,7 +38,7 @@ static const struct option kvm_options[] = {
        OPT_BOOLEAN(0, "guest", &perf_guest,
                    "Collect guest os data"),
        OPT_BOOLEAN(0, "host", &perf_host,
-                   "Collect guest os data"),
+                   "Collect host os data"),
        OPT_STRING(0, "guestmount", &symbol_conf.guestmount, "directory",
                   "guest mount directory under which every guest os"
                   " instance has a subdir"),
index 899080a..2296c39 100644
@@ -12,6 +12,7 @@
 
 #include "util/debug.h"
 #include "util/session.h"
+#include "util/tool.h"
 
 #include <sys/types.h>
 #include <sys/prctl.h>
@@ -325,7 +326,7 @@ alloc_failed:
        die("memory allocation failed\n");
 }
 
-static char                    const *input_name = "perf.data";
+static const char *input_name;
 
 struct raw_event_sample {
        u32                     size;
@@ -845,12 +846,13 @@ static void dump_info(void)
                die("Unknown type of information\n");
 }
 
-static int process_sample_event(union perf_event *event,
+static int process_sample_event(struct perf_tool *tool __used,
+                               union perf_event *event,
                                struct perf_sample *sample,
                                struct perf_evsel *evsel __used,
-                               struct perf_session *s)
+                               struct machine *machine)
 {
-       struct thread *thread = perf_session__findnew(s, sample->tid);
+       struct thread *thread = machine__findnew_thread(machine, sample->tid);
 
        if (thread == NULL) {
                pr_debug("problem processing %d event, skipping it.\n",
@@ -863,7 +865,7 @@ static int process_sample_event(union perf_event *event,
        return 0;
 }
 
-static struct perf_event_ops eops = {
+static struct perf_tool eops = {
        .sample                 = process_sample_event,
        .comm                   = perf_event__process_comm,
        .ordered_samples        = true,
index 710ae3d..59d43ab 100644
@@ -46,7 +46,6 @@
 
 #define DEFAULT_VAR_FILTER "!__k???tab_* & !__crc_*"
 #define DEFAULT_FUNC_FILTER "!_*"
-#define MAX_PATH_LEN 256
 
 /* Session management structure */
 static struct {
index 6ab58cc..0abfb18 100644
@@ -22,6 +22,7 @@
 #include "util/evsel.h"
 #include "util/debug.h"
 #include "util/session.h"
+#include "util/tool.h"
 #include "util/symbol.h"
 #include "util/cpumap.h"
 #include "util/thread_map.h"
@@ -35,55 +36,36 @@ enum write_mode_t {
        WRITE_APPEND
 };
 
-static u64                     user_interval                   = ULLONG_MAX;
-static u64                     default_interval                =      0;
-
-static unsigned int            page_size;
-static unsigned int            mmap_pages                      = UINT_MAX;
-static unsigned int            user_freq                       = UINT_MAX;
-static int                     freq                            =   1000;
-static int                     output;
-static int                     pipe_output                     =      0;
-static const char              *output_name                    = NULL;
-static bool                    group                           =  false;
-static int                     realtime_prio                   =      0;
-static bool                    nodelay                         =  false;
-static bool                    raw_samples                     =  false;
-static bool                    sample_id_all_avail             =   true;
-static bool                    system_wide                     =  false;
-static pid_t                   target_pid                      =     -1;
-static pid_t                   target_tid                      =     -1;
-static pid_t                   child_pid                       =     -1;
-static bool                    no_inherit                      =  false;
-static enum write_mode_t       write_mode                      = WRITE_FORCE;
-static bool                    call_graph                      =  false;
-static bool                    inherit_stat                    =  false;
-static bool                    no_samples                      =  false;
-static bool                    sample_address                  =  false;
-static bool                    sample_time                     =  false;
-static bool                    no_buildid                      =  false;
-static bool                    no_buildid_cache                =  false;
-static struct perf_evlist      *evsel_list;
-
-static long                    samples                         =      0;
-static u64                     bytes_written                   =      0;
-
-static int                     file_new                        =      1;
-static off_t                   post_processing_offset;
-
-static struct perf_session     *session;
-static const char              *cpu_list;
-static const char               *progname;
-
-static void advance_output(size_t size)
+struct perf_record {
+       struct perf_tool        tool;
+       struct perf_record_opts opts;
+       u64                     bytes_written;
+       const char              *output_name;
+       struct perf_evlist      *evlist;
+       struct perf_session     *session;
+       const char              *progname;
+       int                     output;
+       unsigned int            page_size;
+       int                     realtime_prio;
+       enum write_mode_t       write_mode;
+       bool                    no_buildid;
+       bool                    no_buildid_cache;
+       bool                    force;
+       bool                    file_new;
+       bool                    append_file;
+       long                    samples;
+       off_t                   post_processing_offset;
+};
+
+static void advance_output(struct perf_record *rec, size_t size)
 {
-       bytes_written += size;
+       rec->bytes_written += size;
 }
 
-static void write_output(void *buf, size_t size)
+static void write_output(struct perf_record *rec, void *buf, size_t size)
 {
        while (size) {
-               int ret = write(output, buf, size);
+               int ret = write(rec->output, buf, size);
 
                if (ret < 0)
                        die("failed to write");
@@ -91,30 +73,33 @@ static void write_output(void *buf, size_t size)
                size -= ret;
                buf += ret;
 
-               bytes_written += ret;
+               rec->bytes_written += ret;
        }
 }
 
-static int process_synthesized_event(union perf_event *event,
+static int process_synthesized_event(struct perf_tool *tool,
+                                    union perf_event *event,
                                     struct perf_sample *sample __used,
-                                    struct perf_session *self __used)
+                                    struct machine *machine __used)
 {
-       write_output(event, event->header.size);
+       struct perf_record *rec = container_of(tool, struct perf_record, tool);
+       write_output(rec, event, event->header.size);
        return 0;
 }
 
-static void mmap_read(struct perf_mmap *md)
+static void perf_record__mmap_read(struct perf_record *rec,
+                                  struct perf_mmap *md)
 {
        unsigned int head = perf_mmap__read_head(md);
        unsigned int old = md->prev;
-       unsigned char *data = md->base + page_size;
+       unsigned char *data = md->base + rec->page_size;
        unsigned long size;
        void *buf;
 
        if (old == head)
                return;
 
-       samples++;
+       rec->samples++;
 
        size = head - old;
 
@@ -123,14 +108,14 @@ static void mmap_read(struct perf_mmap *md)
                size = md->mask + 1 - (old & md->mask);
                old += size;
 
-               write_output(buf, size);
+               write_output(rec, buf, size);
        }
 
        buf = &data[old & md->mask];
        size = head - old;
        old += size;
 
-       write_output(buf, size);
+       write_output(rec, buf, size);
 
        md->prev = old;
        perf_mmap__write_tail(md, old);
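perf_record__mmap_read() above drains a power-of-two sized ring buffer: when the span between the old tail and the new head wraps past the end of the buffer it is emitted in two chunks, using the index mask to locate the data. A self-contained sketch of just that wraparound logic (hypothetical names; the real function streams the bytes into perf.data rather than memcpy()ing them):

	#include <string.h>

	/* Copy the bytes in [old, head) out of a ring buffer of size
	 * mask + 1 (a power of two), in at most two chunks. */
	static void ring_read(const unsigned char *data, unsigned long mask,
			      unsigned long old, unsigned long head,
			      unsigned char *out)
	{
		if ((old & mask) + (head - old) > mask + 1) {
			/* Wraps: copy up to the end of the buffer first. */
			unsigned long chunk = mask + 1 - (old & mask);

			memcpy(out, &data[old & mask], chunk);
			out += chunk;
			old += chunk;
		}
		memcpy(out, &data[old & mask], head - old);
	}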
@@ -149,17 +134,18 @@ static void sig_handler(int sig)
        signr = sig;
 }
 
-static void sig_atexit(void)
+static void perf_record__sig_exit(int exit_status __used, void *arg)
 {
+       struct perf_record *rec = arg;
        int status;
 
-       if (child_pid > 0) {
+       if (rec->evlist->workload.pid > 0) {
                if (!child_finished)
-                       kill(child_pid, SIGTERM);
+                       kill(rec->evlist->workload.pid, SIGTERM);
 
                wait(&status);
                if (WIFSIGNALED(status))
-                       psignal(WTERMSIG(status), progname);
+                       psignal(WTERMSIG(status), rec->progname);
        }
 
        if (signr == -1 || signr == SIGUSR1)
@@ -169,78 +155,6 @@ static void sig_atexit(void)
        kill(getpid(), signr);
 }
 
-static void config_attr(struct perf_evsel *evsel, struct perf_evlist *evlist)
-{
-       struct perf_event_attr *attr = &evsel->attr;
-       int track = !evsel->idx; /* only the first counter needs these */
-
-       attr->disabled          = 1;
-       attr->inherit           = !no_inherit;
-       attr->read_format       = PERF_FORMAT_TOTAL_TIME_ENABLED |
-                                 PERF_FORMAT_TOTAL_TIME_RUNNING |
-                                 PERF_FORMAT_ID;
-
-       attr->sample_type       |= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
-
-       if (evlist->nr_entries > 1)
-               attr->sample_type |= PERF_SAMPLE_ID;
-
-       /*
-        * We default some events to a 1 default interval. But keep
-        * it a weak assumption overridable by the user.
-        */
-       if (!attr->sample_period || (user_freq != UINT_MAX &&
-                                    user_interval != ULLONG_MAX)) {
-               if (freq) {
-                       attr->sample_type       |= PERF_SAMPLE_PERIOD;
-                       attr->freq              = 1;
-                       attr->sample_freq       = freq;
-               } else {
-                       attr->sample_period = default_interval;
-               }
-       }
-
-       if (no_samples)
-               attr->sample_freq = 0;
-
-       if (inherit_stat)
-               attr->inherit_stat = 1;
-
-       if (sample_address) {
-               attr->sample_type       |= PERF_SAMPLE_ADDR;
-               attr->mmap_data = track;
-       }
-
-       if (call_graph)
-               attr->sample_type       |= PERF_SAMPLE_CALLCHAIN;
-
-       if (system_wide)
-               attr->sample_type       |= PERF_SAMPLE_CPU;
-
-       if (sample_id_all_avail &&
-           (sample_time || system_wide || !no_inherit || cpu_list))
-               attr->sample_type       |= PERF_SAMPLE_TIME;
-
-       if (raw_samples) {
-               attr->sample_type       |= PERF_SAMPLE_TIME;
-               attr->sample_type       |= PERF_SAMPLE_RAW;
-               attr->sample_type       |= PERF_SAMPLE_CPU;
-       }
-
-       if (nodelay) {
-               attr->watermark = 0;
-               attr->wakeup_events = 1;
-       }
-
-       attr->mmap              = track;
-       attr->comm              = track;
-
-       if (target_pid == -1 && target_tid == -1 && !system_wide) {
-               attr->disabled = 1;
-               attr->enable_on_exec = 1;
-       }
-}
-
 static bool perf_evlist__equal(struct perf_evlist *evlist,
                               struct perf_evlist *other)
 {
@@ -260,15 +174,17 @@ static bool perf_evlist__equal(struct perf_evlist *evlist,
        return true;
 }
 
-static void open_counters(struct perf_evlist *evlist)
+static void perf_record__open(struct perf_record *rec)
 {
        struct perf_evsel *pos, *first;
-
-       if (evlist->cpus->map[0] < 0)
-               no_inherit = true;
+       struct perf_evlist *evlist = rec->evlist;
+       struct perf_session *session = rec->session;
+       struct perf_record_opts *opts = &rec->opts;
 
        first = list_entry(evlist->entries.next, struct perf_evsel, node);
 
+       perf_evlist__config_attrs(evlist, opts);
+
        list_for_each_entry(pos, &evlist->entries, node) {
                struct perf_event_attr *attr = &pos->attr;
                struct xyarray *group_fd = NULL;
@@ -286,29 +202,27 @@ static void open_counters(struct perf_evlist *evlist)
                 */
                bool time_needed = attr->sample_type & PERF_SAMPLE_TIME;
 
-               if (group && pos != first)
+               if (opts->group && pos != first)
                        group_fd = first->fd;
-
-               config_attr(pos, evlist);
 retry_sample_id:
-               attr->sample_id_all = sample_id_all_avail ? 1 : 0;
+               attr->sample_id_all = opts->sample_id_all_avail ? 1 : 0;
 try_again:
-               if (perf_evsel__open(pos, evlist->cpus, evlist->threads, group,
-                                    group_fd) < 0) {
+               if (perf_evsel__open(pos, evlist->cpus, evlist->threads,
+                                    opts->group, group_fd) < 0) {
                        int err = errno;
 
                        if (err == EPERM || err == EACCES) {
                                ui__error_paranoid();
                                exit(EXIT_FAILURE);
-                       } else if (err ==  ENODEV && cpu_list) {
+                       } else if (err ==  ENODEV && opts->cpu_list) {
                                die("No such device - did you specify"
                                        " an out-of-range profile CPU?\n");
-                       } else if (err == EINVAL && sample_id_all_avail) {
+                       } else if (err == EINVAL && opts->sample_id_all_avail) {
                                /*
                                 * Old kernel, no attr->sample_id_type_all field
                                 */
-                               sample_id_all_avail = false;
-                               if (!sample_time && !raw_samples && !time_needed)
+                               opts->sample_id_all_avail = false;
+                               if (!opts->sample_time && !opts->raw_samples && !time_needed)
                                        attr->sample_type &= ~PERF_SAMPLE_TIME;
 
                                goto retry_sample_id;
@@ -358,10 +272,20 @@ try_again:
                exit(-1);
        }
 
-       if (perf_evlist__mmap(evlist, mmap_pages, false) < 0)
+       if (perf_evlist__mmap(evlist, opts->mmap_pages, false) < 0) {
+               if (errno == EPERM)
+                       die("Permission error mapping pages.\n"
+                           "Consider increasing "
+                           "/proc/sys/kernel/perf_event_mlock_kb,\n"
+                           "or try again with a smaller value of -m/--mmap_pages.\n"
+                           "(current value: %d)\n", opts->mmap_pages);
+               else if (!is_power_of_2(opts->mmap_pages))
+                       die("--mmap_pages/-m value must be a power of two.");
+
                die("failed to mmap with %d (%s)\n", errno, strerror(errno));
+       }
 
-       if (file_new)
+       if (rec->file_new)
                session->evlist = evlist;
        else {
                if (!perf_evlist__equal(session->evlist, evlist)) {
@@ -373,29 +297,32 @@ try_again:
        perf_session__update_sample_type(session);
 }
 
-static int process_buildids(void)
+static int process_buildids(struct perf_record *rec)
 {
-       u64 size = lseek(output, 0, SEEK_CUR);
+       u64 size = lseek(rec->output, 0, SEEK_CUR);
 
        if (size == 0)
                return 0;
 
-       session->fd = output;
-       return __perf_session__process_events(session, post_processing_offset,
-                                             size - post_processing_offset,
+       rec->session->fd = rec->output;
+       return __perf_session__process_events(rec->session, rec->post_processing_offset,
+                                             size - rec->post_processing_offset,
                                              size, &build_id__mark_dso_hit_ops);
 }
 
-static void atexit_header(void)
+static void perf_record__exit(int status __used, void *arg)
 {
-       if (!pipe_output) {
-               session->header.data_size += bytes_written;
-
-               if (!no_buildid)
-                       process_buildids();
-               perf_session__write_header(session, evsel_list, output, true);
-               perf_session__delete(session);
-               perf_evlist__delete(evsel_list);
+       struct perf_record *rec = arg;
+
+       if (!rec->opts.pipe_output) {
+               rec->session->header.data_size += rec->bytes_written;
+
+               if (!rec->no_buildid)
+                       process_buildids(rec);
+               perf_session__write_header(rec->session, rec->evlist,
+                                          rec->output, true);
+               perf_session__delete(rec->session);
+               perf_evlist__delete(rec->evlist);
                symbol__exit();
        }
 }
@@ -403,7 +330,7 @@ static void atexit_header(void)
 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
 {
        int err;
-       struct perf_session *psession = data;
+       struct perf_tool *tool = data;
 
        if (machine__is_host(machine))
                return;
@@ -416,8 +343,8 @@ static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
         *method is used to avoid symbol missing when the first addr is
         *in module instead of in guest kernel.
         */
-       err = perf_event__synthesize_modules(process_synthesized_event,
-                                            psession, machine);
+       err = perf_event__synthesize_modules(tool, process_synthesized_event,
+                                            machine);
        if (err < 0)
                pr_err("Couldn't record guest kernel [%d]'s reference"
                       " relocation symbol.\n", machine->pid);
@@ -426,12 +353,11 @@ static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
         * We use _stext for guest kernel because guest kernel's /proc/kallsyms
         * have no _text sometimes.
         */
-       err = perf_event__synthesize_kernel_mmap(process_synthesized_event,
-                                                psession, machine, "_text");
+       err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
+                                                machine, "_text");
        if (err < 0)
-               err = perf_event__synthesize_kernel_mmap(process_synthesized_event,
-                                                        psession, machine,
-                                                        "_stext");
+               err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
+                                                        machine, "_stext");
        if (err < 0)
                pr_err("Couldn't record guest kernel [%d]'s reference"
                       " relocation symbol.\n", machine->pid);
@@ -442,73 +368,71 @@ static struct perf_event_header finished_round_event = {
        .type = PERF_RECORD_FINISHED_ROUND,
 };
 
-static void mmap_read_all(void)
+static void perf_record__mmap_read_all(struct perf_record *rec)
 {
        int i;
 
-       for (i = 0; i < evsel_list->nr_mmaps; i++) {
-               if (evsel_list->mmap[i].base)
-                       mmap_read(&evsel_list->mmap[i]);
+       for (i = 0; i < rec->evlist->nr_mmaps; i++) {
+               if (rec->evlist->mmap[i].base)
+                       perf_record__mmap_read(rec, &rec->evlist->mmap[i]);
        }
 
-       if (perf_header__has_feat(&session->header, HEADER_TRACE_INFO))
-               write_output(&finished_round_event, sizeof(finished_round_event));
+       if (perf_header__has_feat(&rec->session->header, HEADER_TRACE_INFO))
+               write_output(rec, &finished_round_event, sizeof(finished_round_event));
 }
 
-static int __cmd_record(int argc, const char **argv)
+static int __cmd_record(struct perf_record *rec, int argc, const char **argv)
 {
        struct stat st;
        int flags;
-       int err;
+       int err, output;
        unsigned long waking = 0;
-       int child_ready_pipe[2], go_pipe[2];
        const bool forks = argc > 0;
-       char buf;
        struct machine *machine;
+       struct perf_tool *tool = &rec->tool;
+       struct perf_record_opts *opts = &rec->opts;
+       struct perf_evlist *evsel_list = rec->evlist;
+       const char *output_name = rec->output_name;
+       struct perf_session *session;
 
-       progname = argv[0];
+       rec->progname = argv[0];
 
-       page_size = sysconf(_SC_PAGE_SIZE);
+       rec->page_size = sysconf(_SC_PAGE_SIZE);
 
-       atexit(sig_atexit);
+       on_exit(perf_record__sig_exit, rec);
        signal(SIGCHLD, sig_handler);
        signal(SIGINT, sig_handler);
        signal(SIGUSR1, sig_handler);
 
-       if (forks && (pipe(child_ready_pipe) < 0 || pipe(go_pipe) < 0)) {
-               perror("failed to create pipes");
-               exit(-1);
-       }
-
        if (!output_name) {
                if (!fstat(STDOUT_FILENO, &st) && S_ISFIFO(st.st_mode))
-                       pipe_output = 1;
+                       opts->pipe_output = true;
                else
-                       output_name = "perf.data";
+                       rec->output_name = output_name = "perf.data";
        }
        if (output_name) {
                if (!strcmp(output_name, "-"))
-                       pipe_output = 1;
+                       opts->pipe_output = true;
                else if (!stat(output_name, &st) && st.st_size) {
-                       if (write_mode == WRITE_FORCE) {
+                       if (rec->write_mode == WRITE_FORCE) {
                                char oldname[PATH_MAX];
                                snprintf(oldname, sizeof(oldname), "%s.old",
                                         output_name);
                                unlink(oldname);
                                rename(output_name, oldname);
                        }
-               } else if (write_mode == WRITE_APPEND) {
-                       write_mode = WRITE_FORCE;
+               } else if (rec->write_mode == WRITE_APPEND) {
+                       rec->write_mode = WRITE_FORCE;
                }
        }
 
        flags = O_CREAT|O_RDWR;
-       if (write_mode == WRITE_APPEND)
-               file_new = 0;
+       if (rec->write_mode == WRITE_APPEND)
+               rec->file_new = 0;
        else
                flags |= O_TRUNC;
 
-       if (pipe_output)
+       if (opts->pipe_output)
                output = STDOUT_FILENO;
        else
                output = open(output_name, flags, S_IRUSR | S_IWUSR);
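
Editorial note: the switch above from atexit(sig_atexit) to on_exit(perf_record__sig_exit, rec) is what lets the exit path receive the struct perf_record context instead of reaching for globals: on_exit(), a GNU/SunOS extension, registers a handler that gets both the exit status and a caller-supplied pointer. A minimal, self-contained sketch of that contract follows; the struct and field names here are made up for illustration and are not perf's.

#define _GNU_SOURCE		/* on_exit() is a GNU extension, not POSIX */
#include <stdio.h>
#include <stdlib.h>

struct ctx {
	long bytes_written;	/* stands in for whatever state the tool needs at exit */
};

/* Unlike atexit() handlers, on_exit() handlers receive the exit status
 * and the pointer they were registered with, so no global is needed. */
static void cleanup(int status, void *arg)
{
	struct ctx *c = arg;

	fprintf(stderr, "exit %d, wrote %ld bytes\n", status, c->bytes_written);
}

int main(void)
{
	static struct ctx c;	/* static: must outlive main()'s stack frame */

	on_exit(cleanup, &c);
	c.bytes_written = 42;
	return 0;		/* returning from main() runs the handler */
}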
@@ -517,17 +441,21 @@ static int __cmd_record(int argc, const char **argv)
                exit(-1);
        }
 
+       rec->output = output;
+
        session = perf_session__new(output_name, O_WRONLY,
-                                   write_mode == WRITE_FORCE, false, NULL);
+                                   rec->write_mode == WRITE_FORCE, false, NULL);
        if (session == NULL) {
                pr_err("Not enough memory for reading perf file header\n");
                return -1;
        }
 
-       if (!no_buildid)
+       rec->session = session;
+
+       if (!rec->no_buildid)
                perf_header__set_feat(&session->header, HEADER_BUILD_ID);
 
-       if (!file_new) {
+       if (!rec->file_new) {
                err = perf_session__read_header(session, output);
                if (err < 0)
                        goto out_delete_session;
@@ -549,94 +477,57 @@ static int __cmd_record(int argc, const char **argv)
        perf_header__set_feat(&session->header, HEADER_NUMA_TOPOLOGY);
        perf_header__set_feat(&session->header, HEADER_CPUID);
 
-       /* 512 kiB: default amount of unprivileged mlocked memory */
-       if (mmap_pages == UINT_MAX)
-               mmap_pages = (512 * 1024) / page_size;
-
        if (forks) {
-               child_pid = fork();
-               if (child_pid < 0) {
-                       perror("failed to fork");
-                       exit(-1);
-               }
-
-               if (!child_pid) {
-                       if (pipe_output)
-                               dup2(2, 1);
-                       close(child_ready_pipe[0]);
-                       close(go_pipe[1]);
-                       fcntl(go_pipe[0], F_SETFD, FD_CLOEXEC);
-
-                       /*
-                        * Do a dummy execvp to get the PLT entry resolved,
-                        * so we avoid the resolver overhead on the real
-                        * execvp call.
-                        */
-                       execvp("", (char **)argv);
-
-                       /*
-                        * Tell the parent we're ready to go
-                        */
-                       close(child_ready_pipe[1]);
-
-                       /*
-                        * Wait until the parent tells us to go.
-                        */
-                       if (read(go_pipe[0], &buf, 1) == -1)
-                               perror("unable to read pipe");
-
-                       execvp(argv[0], (char **)argv);
-
-                       perror(argv[0]);
-                       kill(getppid(), SIGUSR1);
-                       exit(-1);
-               }
-
-               if (!system_wide && target_tid == -1 && target_pid == -1)
-                       evsel_list->threads->map[0] = child_pid;
-
-               close(child_ready_pipe[1]);
-               close(go_pipe[0]);
-               /*
-                * wait for child to settle
-                */
-               if (read(child_ready_pipe[0], &buf, 1) == -1) {
-                       perror("unable to read pipe");
-                       exit(-1);
+               err = perf_evlist__prepare_workload(evsel_list, opts, argv);
+               if (err < 0) {
+                       pr_err("Couldn't run the workload!\n");
+                       goto out_delete_session;
                }
-               close(child_ready_pipe[0]);
        }
 
-       open_counters(evsel_list);
+       perf_record__open(rec);
 
        /*
-        * perf_session__delete(session) will be called at atexit_header()
+        * perf_session__delete(session) will be called at perf_record__exit()
         */
-       atexit(atexit_header);
+       on_exit(perf_record__exit, rec);
 
-       if (pipe_output) {
+       if (opts->pipe_output) {
                err = perf_header__write_pipe(output);
                if (err < 0)
                        return err;
-       } else if (file_new) {
+       } else if (rec->file_new) {
                err = perf_session__write_header(session, evsel_list,
                                                 output, false);
                if (err < 0)
                        return err;
        }
 
-       post_processing_offset = lseek(output, 0, SEEK_CUR);
+       if (!rec->no_buildid
+           && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
+               pr_err("Couldn't generating buildids. "
+                      "Use --no-buildid to profile anyway.\n");
+               return -1;
+       }
 
-       if (pipe_output) {
-               err = perf_session__synthesize_attrs(session,
-                                                    process_synthesized_event);
+       rec->post_processing_offset = lseek(output, 0, SEEK_CUR);
+
+       machine = perf_session__find_host_machine(session);
+       if (!machine) {
+               pr_err("Couldn't find native kernel information.\n");
+               return -1;
+       }
+
+       if (opts->pipe_output) {
+               err = perf_event__synthesize_attrs(tool, session,
+                                                  process_synthesized_event);
                if (err < 0) {
                        pr_err("Couldn't synthesize attrs.\n");
                        return err;
                }
 
-               err = perf_event__synthesize_event_types(process_synthesized_event,
-                                                        session);
+               err = perf_event__synthesize_event_types(tool, process_synthesized_event,
+                                                        machine);
                if (err < 0) {
                        pr_err("Couldn't synthesize event_types.\n");
                        return err;
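
Editorial note: the open-coded block deleted in this hunk (fork(), a pair of pipes, a blocking read() in the child, then the final execvp()) is the handshake that perf_evlist__prepare_workload() and perf_evlist__start_workload() appear to take over, judging by the replacement calls above: the child is forked early but parks on a "go" pipe until the parent has opened the counters, then execs the real workload. A compressed sketch of just that handshake, under simplified assumptions (no child_ready_pipe back-channel, no dummy-execvp PLT warm-up, minimal error handling); it is not the evlist library code itself.

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/wait.h>

int main(int argc, char **argv)
{
	int go_pipe[2];
	char bf;

	if (argc < 2 || pipe(go_pipe) < 0)
		exit(1);

	switch (fork()) {
	case -1:
		perror("fork");
		exit(1);
	case 0:					/* child: the workload */
		close(go_pipe[1]);
		fcntl(go_pipe[0], F_SETFD, FD_CLOEXEC);
		/* Park here until the parent closes its write end,
		 * i.e. until the counters are opened and enabled. */
		if (read(go_pipe[0], &bf, 1) == -1)
			perror("unable to read pipe");
		execvp(argv[1], &argv[1]);
		perror(argv[1]);
		exit(1);
	default:				/* parent: the profiler */
		close(go_pipe[0]);
		/* ... open counters on the child's pid here ... */
		close(go_pipe[1]);		/* let the child rip */
		wait(NULL);
		break;
	}
	return 0;
}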
@@ -651,56 +542,49 @@ static int __cmd_record(int argc, const char **argv)
                         * return this more properly and also
                         * propagate errors that now are calling die()
                         */
-                       err = perf_event__synthesize_tracing_data(output, evsel_list,
-                                                                 process_synthesized_event,
-                                                                 session);
+                       err = perf_event__synthesize_tracing_data(tool, output, evsel_list,
+                                                                 process_synthesized_event);
                        if (err <= 0) {
                                pr_err("Couldn't record tracing data.\n");
                                return err;
                        }
-                       advance_output(err);
+                       advance_output(rec, err);
                }
        }
 
-       machine = perf_session__find_host_machine(session);
-       if (!machine) {
-               pr_err("Couldn't find native kernel information.\n");
-               return -1;
-       }
-
-       err = perf_event__synthesize_kernel_mmap(process_synthesized_event,
-                                                session, machine, "_text");
+       err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
+                                                machine, "_text");
        if (err < 0)
-               err = perf_event__synthesize_kernel_mmap(process_synthesized_event,
-                                                        session, machine, "_stext");
+               err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
+                                                        machine, "_stext");
        if (err < 0)
                pr_err("Couldn't record kernel reference relocation symbol\n"
                       "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
                       "Check /proc/kallsyms permission or run as root.\n");
 
-       err = perf_event__synthesize_modules(process_synthesized_event,
-                                            session, machine);
+       err = perf_event__synthesize_modules(tool, process_synthesized_event,
+                                            machine);
        if (err < 0)
                pr_err("Couldn't record kernel module information.\n"
                       "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
                       "Check /proc/modules permission or run as root.\n");
 
        if (perf_guest)
-               perf_session__process_machines(session,
+               perf_session__process_machines(session, tool,
                                               perf_event__synthesize_guest_os);
 
-       if (!system_wide)
-               perf_event__synthesize_thread_map(evsel_list->threads,
+       if (!opts->system_wide)
+               perf_event__synthesize_thread_map(tool, evsel_list->threads,
                                                  process_synthesized_event,
-                                                 session);
+                                                 machine);
        else
-               perf_event__synthesize_threads(process_synthesized_event,
-                                              session);
+               perf_event__synthesize_threads(tool, process_synthesized_event,
+                                              machine);
 
-       if (realtime_prio) {
+       if (rec->realtime_prio) {
                struct sched_param param;
 
-               param.sched_priority = realtime_prio;
+               param.sched_priority = rec->realtime_prio;
                if (sched_setscheduler(0, SCHED_FIFO, &param)) {
                        pr_err("Could not set realtime priority.\n");
                        exit(-1);
@@ -713,14 +597,14 @@ static int __cmd_record(int argc, const char **argv)
         * Let the child rip
         */
        if (forks)
-               close(go_pipe[1]);
+               perf_evlist__start_workload(evsel_list);
 
        for (;;) {
-               int hits = samples;
+               int hits = rec->samples;
 
-               mmap_read_all();
+               perf_record__mmap_read_all(rec);
 
-               if (hits == samples) {
+               if (hits == rec->samples) {
                        if (done)
                                break;
                        err = poll(evsel_list->pollfd, evsel_list->nr_fds, -1);
@@ -741,9 +625,9 @@ static int __cmd_record(int argc, const char **argv)
         */
        fprintf(stderr,
                "[ perf record: Captured and wrote %.3f MB %s (~%" PRIu64 " samples) ]\n",
-               (double)bytes_written / 1024.0 / 1024.0,
+               (double)rec->bytes_written / 1024.0 / 1024.0,
                output_name,
-               bytes_written / 24);
+               rec->bytes_written / 24);
 
        return 0;
 
@@ -758,58 +642,89 @@ static const char * const record_usage[] = {
        NULL
 };
 
-static bool force, append_file;
+/*
+ * XXX Ideally this would be local to cmd_record() and passed to a
+ * perf_record__new(), because we need access to it in perf_record__exit(),
+ * which is called after cmd_record() exits; but since record_options needs
+ * to be accessible to builtin-script, leave it here.
+ *
+ * At least we don't touch it directly in all the other functions here.
+ *
+ * Just say no to tons of global variables, sigh.
+ */
+static struct perf_record record = {
+       .opts = {
+               .target_pid          = -1,
+               .target_tid          = -1,
+               .mmap_pages          = UINT_MAX,
+               .user_freq           = UINT_MAX,
+               .user_interval       = ULLONG_MAX,
+               .freq                = 1000,
+               .sample_id_all_avail = true,
+       },
+       .write_mode = WRITE_FORCE,
+       .file_new   = true,
+};
 
+/*
+ * XXX This will stay a global variable until we fix builtin-script.c to stop
+ * messing with it and switch to using the library functions in perf_evlist
+ * that came from builtin-record.c, i.e. use perf_record_opts,
+ * perf_evlist__prepare_workload, etc. instead of fork+exec'ing 'perf record'
+ * through pipes, etc.
+ */
 const struct option record_options[] = {
-       OPT_CALLBACK('e', "event", &evsel_list, "event",
+       OPT_CALLBACK('e', "event", &record.evlist, "event",
                     "event selector. use 'perf list' to list available events",
                     parse_events_option),
-       OPT_CALLBACK(0, "filter", &evsel_list, "filter",
+       OPT_CALLBACK(0, "filter", &record.evlist, "filter",
                     "event filter", parse_filter),
-       OPT_INTEGER('p', "pid", &target_pid,
+       OPT_INTEGER('p', "pid", &record.opts.target_pid,
                    "record events on existing process id"),
-       OPT_INTEGER('t', "tid", &target_tid,
+       OPT_INTEGER('t', "tid", &record.opts.target_tid,
                    "record events on existing thread id"),
-       OPT_INTEGER('r', "realtime", &realtime_prio,
+       OPT_INTEGER('r', "realtime", &record.realtime_prio,
                    "collect data with this RT SCHED_FIFO priority"),
-       OPT_BOOLEAN('D', "no-delay", &nodelay,
+       OPT_BOOLEAN('D', "no-delay", &record.opts.no_delay,
                    "collect data without buffering"),
-       OPT_BOOLEAN('R', "raw-samples", &raw_samples,
+       OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
                    "collect raw sample records from all opened counters"),
-       OPT_BOOLEAN('a', "all-cpus", &system_wide,
+       OPT_BOOLEAN('a', "all-cpus", &record.opts.system_wide,
                            "system-wide collection from all CPUs"),
-       OPT_BOOLEAN('A', "append", &append_file,
+       OPT_BOOLEAN('A', "append", &record.append_file,
                            "append to the output file to do incremental profiling"),
-       OPT_STRING('C', "cpu", &cpu_list, "cpu",
+       OPT_STRING('C', "cpu", &record.opts.cpu_list, "cpu",
                    "list of cpus to monitor"),
-       OPT_BOOLEAN('f', "force", &force,
+       OPT_BOOLEAN('f', "force", &record.force,
                        "overwrite existing data file (deprecated)"),
-       OPT_U64('c', "count", &user_interval, "event period to sample"),
-       OPT_STRING('o', "output", &output_name, "file",
+       OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
+       OPT_STRING('o', "output", &record.output_name, "file",
                    "output file name"),
-       OPT_BOOLEAN('i', "no-inherit", &no_inherit,
+       OPT_BOOLEAN('i', "no-inherit", &record.opts.no_inherit,
                    "child tasks do not inherit counters"),
-       OPT_UINTEGER('F', "freq", &user_freq, "profile at this frequency"),
-       OPT_UINTEGER('m', "mmap-pages", &mmap_pages, "number of mmap data pages"),
-       OPT_BOOLEAN(0, "group", &group,
+       OPT_UINTEGER('F', "freq", &record.opts.user_freq, "profile at this frequency"),
+       OPT_UINTEGER('m', "mmap-pages", &record.opts.mmap_pages,
+                    "number of mmap data pages"),
+       OPT_BOOLEAN(0, "group", &record.opts.group,
                    "put the counters into a counter group"),
-       OPT_BOOLEAN('g', "call-graph", &call_graph,
+       OPT_BOOLEAN('g', "call-graph", &record.opts.call_graph,
                    "do call-graph (stack chain/backtrace) recording"),
        OPT_INCR('v', "verbose", &verbose,
                    "be more verbose (show counter open errors, etc)"),
        OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
-       OPT_BOOLEAN('s', "stat", &inherit_stat,
+       OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
                    "per thread counts"),
-       OPT_BOOLEAN('d', "data", &sample_address,
+       OPT_BOOLEAN('d', "data", &record.opts.sample_address,
                    "Sample addresses"),
-       OPT_BOOLEAN('T', "timestamp", &sample_time, "Sample timestamps"),
-       OPT_BOOLEAN('n', "no-samples", &no_samples,
+       OPT_BOOLEAN('T', "timestamp", &record.opts.sample_time, "Sample timestamps"),
+       OPT_BOOLEAN('P', "period", &record.opts.period, "Sample period"),
+       OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
                    "don't sample"),
-       OPT_BOOLEAN('N', "no-buildid-cache", &no_buildid_cache,
+       OPT_BOOLEAN('N', "no-buildid-cache", &record.no_buildid_cache,
                    "do not update the buildid cache"),
-       OPT_BOOLEAN('B', "no-buildid", &no_buildid,
+       OPT_BOOLEAN('B', "no-buildid", &record.no_buildid,
                    "do not collect buildids in perf.data"),
-       OPT_CALLBACK('G', "cgroup", &evsel_list, "name",
+       OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
                     "monitor event in cgroup name only",
                     parse_cgroups),
        OPT_END()
@@ -819,6 +734,8 @@ int cmd_record(int argc, const char **argv, const char *prefix __used)
 {
        int err = -ENOMEM;
        struct perf_evsel *pos;
+       struct perf_evlist *evsel_list;
+       struct perf_record *rec = &record;
 
        perf_header__set_cmdline(argc, argv);
 
@@ -826,23 +743,25 @@ int cmd_record(int argc, const char **argv, const char *prefix __used)
        if (evsel_list == NULL)
                return -ENOMEM;
 
+       rec->evlist = evsel_list;
+
        argc = parse_options(argc, argv, record_options, record_usage,
                            PARSE_OPT_STOP_AT_NON_OPTION);
-       if (!argc && target_pid == -1 && target_tid == -1 &&
-               !system_wide && !cpu_list)
+       if (!argc && rec->opts.target_pid == -1 && rec->opts.target_tid == -1 &&
+               !rec->opts.system_wide && !rec->opts.cpu_list)
                usage_with_options(record_usage, record_options);
 
-       if (force && append_file) {
+       if (rec->force && rec->append_file) {
                fprintf(stderr, "Can't overwrite and append at the same time."
                                " You need to choose between -f and -A");
                usage_with_options(record_usage, record_options);
-       } else if (append_file) {
-               write_mode = WRITE_APPEND;
+       } else if (rec->append_file) {
+               rec->write_mode = WRITE_APPEND;
        } else {
-               write_mode = WRITE_FORCE;
+               rec->write_mode = WRITE_FORCE;
        }
 
-       if (nr_cgroups && !system_wide) {
+       if (nr_cgroups && !rec->opts.system_wide) {
                fprintf(stderr, "cgroup monitoring only available in"
                        " system-wide mode\n");
                usage_with_options(record_usage, record_options);
@@ -860,7 +779,7 @@ int cmd_record(int argc, const char **argv, const char *prefix __used)
 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
 "even with a suitable vmlinux or kallsyms file.\n\n");
 
-       if (no_buildid_cache || no_buildid)
+       if (rec->no_buildid_cache || rec->no_buildid)
                disable_buildid_cache();
 
        if (evsel_list->nr_entries == 0 &&
@@ -869,43 +788,37 @@ int cmd_record(int argc, const char **argv, const char *prefix __used)
                goto out_symbol_exit;
        }
 
-       if (target_pid != -1)
-               target_tid = target_pid;
+       if (rec->opts.target_pid != -1)
+               rec->opts.target_tid = rec->opts.target_pid;
 
-       if (perf_evlist__create_maps(evsel_list, target_pid,
-                                    target_tid, cpu_list) < 0)
+       if (perf_evlist__create_maps(evsel_list, rec->opts.target_pid,
+                                    rec->opts.target_tid, rec->opts.cpu_list) < 0)
                usage_with_options(record_usage, record_options);
 
        list_for_each_entry(pos, &evsel_list->entries, node) {
-               if (perf_evsel__alloc_fd(pos, evsel_list->cpus->nr,
-                                        evsel_list->threads->nr) < 0)
-                       goto out_free_fd;
                if (perf_header__push_event(pos->attr.config, event_name(pos)))
                        goto out_free_fd;
        }
 
-       if (perf_evlist__alloc_pollfd(evsel_list) < 0)
-               goto out_free_fd;
-
-       if (user_interval != ULLONG_MAX)
-               default_interval = user_interval;
-       if (user_freq != UINT_MAX)
-               freq = user_freq;
+       if (rec->opts.user_interval != ULLONG_MAX)
+               rec->opts.default_interval = rec->opts.user_interval;
+       if (rec->opts.user_freq != UINT_MAX)
+               rec->opts.freq = rec->opts.user_freq;
 
        /*
         * User specified count overrides default frequency.
         */
-       if (default_interval)
-               freq = 0;
-       else if (freq) {
-               default_interval = freq;
+       if (rec->opts.default_interval)
+               rec->opts.freq = 0;
+       else if (rec->opts.freq) {
+               rec->opts.default_interval = rec->opts.freq;
        } else {
                fprintf(stderr, "frequency and count are zero, aborting\n");
                err = -EINVAL;
                goto out_free_fd;
        }
 
-       err = __cmd_record(argc, argv);
+       err = __cmd_record(&record, argc, argv);
 out_free_fd:
        perf_evlist__delete_maps(evsel_list);
 out_symbol_exit:
index 4d7c834..25d34d4 100644 (file)
@@ -25,6 +25,7 @@
 #include "util/evsel.h"
 #include "util/header.h"
 #include "util/session.h"
+#include "util/tool.h"
 
 #include "util/parse-options.h"
 #include "util/parse-events.h"
 
 #include <linux/bitmap.h>
 
-static char            const *input_name = "perf.data";
-
-static bool            force, use_tui, use_stdio;
-static bool            hide_unresolved;
-static bool            dont_use_callchains;
-static bool            show_full_info;
-
-static bool            show_threads;
-static struct perf_read_values show_threads_values;
-
-static const char      default_pretty_printing_style[] = "normal";
-static const char      *pretty_printing_style = default_pretty_printing_style;
-
-static char            callchain_default_opt[] = "fractal,0.5,callee";
-static bool            inverted_callchain;
-static symbol_filter_t annotate_init;
-
-static const char      *cpu_list;
-static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
+struct perf_report {
+       struct perf_tool        tool;
+       struct perf_session     *session;
+       char const              *input_name;
+       bool                    force, use_tui, use_stdio;
+       bool                    hide_unresolved;
+       bool                    dont_use_callchains;
+       bool                    show_full_info;
+       bool                    show_threads;
+       bool                    inverted_callchain;
+       struct perf_read_values show_threads_values;
+       const char              *pretty_printing_style;
+       symbol_filter_t         annotate_init;
+       const char              *cpu_list;
+       DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
+};
 
-static int perf_session__add_hist_entry(struct perf_session *session,
-                                       struct addr_location *al,
-                                       struct perf_sample *sample,
-                                       struct perf_evsel *evsel)
+static int perf_evsel__add_hist_entry(struct perf_evsel *evsel,
+                                     struct addr_location *al,
+                                     struct perf_sample *sample,
+                                     struct machine *machine)
 {
        struct symbol *parent = NULL;
        int err = 0;
        struct hist_entry *he;
 
        if ((sort__has_parent || symbol_conf.use_callchain) && sample->callchain) {
-               err = perf_session__resolve_callchain(session, al->thread,
-                                                     sample->callchain, &parent);
+               err = machine__resolve_callchain(machine, evsel, al->thread,
+                                                sample->callchain, &parent);
                if (err)
                        return err;
        }
@@ -76,7 +74,8 @@ static int perf_session__add_hist_entry(struct perf_session *session,
                return -ENOMEM;
 
        if (symbol_conf.use_callchain) {
-               err = callchain_append(he->callchain, &session->callchain_cursor,
+               err = callchain_append(he->callchain,
+                                      &evsel->hists.callchain_cursor,
                                       sample->period);
                if (err)
                        return err;
@@ -92,8 +91,7 @@ static int perf_session__add_hist_entry(struct perf_session *session,
                assert(evsel != NULL);
 
                err = -ENOMEM;
-               if (notes->src == NULL &&
-                   symbol__alloc_hist(he->ms.sym, session->evlist->nr_entries) < 0)
+               if (notes->src == NULL && symbol__alloc_hist(he->ms.sym) < 0)
                        goto out;
 
                err = hist_entry__inc_addr_samples(he, evsel->idx, al->addr);
@@ -106,30 +104,32 @@ out:
 }
 
 
-static int process_sample_event(union perf_event *event,
+static int process_sample_event(struct perf_tool *tool,
+                               union perf_event *event,
                                struct perf_sample *sample,
                                struct perf_evsel *evsel,
-                               struct perf_session *session)
+                               struct machine *machine)
 {
+       struct perf_report *rep = container_of(tool, struct perf_report, tool);
        struct addr_location al;
 
-       if (perf_event__preprocess_sample(event, session, &al, sample,
-                                         annotate_init) < 0) {
+       if (perf_event__preprocess_sample(event, machine, &al, sample,
+                                         rep->annotate_init) < 0) {
                fprintf(stderr, "problem processing %d event, skipping it.\n",
                        event->header.type);
                return -1;
        }
 
-       if (al.filtered || (hide_unresolved && al.sym == NULL))
+       if (al.filtered || (rep->hide_unresolved && al.sym == NULL))
                return 0;
 
-       if (cpu_list && !test_bit(sample->cpu, cpu_bitmap))
+       if (rep->cpu_list && !test_bit(sample->cpu, rep->cpu_bitmap))
                return 0;
 
        if (al.map != NULL)
                al.map->dso->hit = 1;
 
-       if (perf_session__add_hist_entry(session, &al, sample, evsel)) {
+       if (perf_evsel__add_hist_entry(evsel, &al, sample, machine)) {
                pr_debug("problem incrementing symbol period, skipping event\n");
                return -1;
        }
@@ -137,15 +137,17 @@ static int process_sample_event(union perf_event *event,
        return 0;
 }
 
-static int process_read_event(union perf_event *event,
+static int process_read_event(struct perf_tool *tool,
+                             union perf_event *event,
                              struct perf_sample *sample __used,
-                             struct perf_session *session)
+                             struct perf_evsel *evsel,
+                             struct machine *machine __used)
 {
-       struct perf_evsel *evsel = perf_evlist__id2evsel(session->evlist,
-                                                        event->read.id);
-       if (show_threads) {
+       struct perf_report *rep = container_of(tool, struct perf_report, tool);
+
+       if (rep->show_threads) {
                const char *name = evsel ? event_name(evsel) : "unknown";
-               perf_read_values_add_value(&show_threads_values,
+               perf_read_values_add_value(&rep->show_threads_values,
                                           event->read.pid, event->read.tid,
                                           event->read.id,
                                           name,
@@ -159,8 +161,10 @@ static int process_read_event(union perf_event *event,
        return 0;
 }
 
-static int perf_session__setup_sample_type(struct perf_session *self)
+static int perf_report__setup_sample_type(struct perf_report *rep)
 {
+       struct perf_session *self = rep->session;
+
        if (!(self->sample_type & PERF_SAMPLE_CALLCHAIN)) {
                if (sort__has_parent) {
                        ui__warning("Selected --sort parent, but no "
@@ -173,7 +177,8 @@ static int perf_session__setup_sample_type(struct perf_session *self)
                                    "you call 'perf record' without -g?\n");
                        return -1;
                }
-       } else if (!dont_use_callchains && callchain_param.mode != CHAIN_NONE &&
+       } else if (!rep->dont_use_callchains &&
+                  callchain_param.mode != CHAIN_NONE &&
                   !symbol_conf.use_callchain) {
                        symbol_conf.use_callchain = true;
                        if (callchain_register_param(&callchain_param) < 0) {
@@ -186,22 +191,6 @@ static int perf_session__setup_sample_type(struct perf_session *self)
        return 0;
 }
 
-static struct perf_event_ops event_ops = {
-       .sample          = process_sample_event,
-       .mmap            = perf_event__process_mmap,
-       .comm            = perf_event__process_comm,
-       .exit            = perf_event__process_task,
-       .fork            = perf_event__process_task,
-       .lost            = perf_event__process_lost,
-       .read            = process_read_event,
-       .attr            = perf_event__process_attr,
-       .event_type      = perf_event__process_event_type,
-       .tracing_data    = perf_event__process_tracing_data,
-       .build_id        = perf_event__process_build_id,
-       .ordered_samples = true,
-       .ordering_requires_timestamps = true,
-};
-
 extern volatile int session_done;
 
 static void sig_handler(int sig __used)
@@ -224,6 +213,7 @@ static size_t hists__fprintf_nr_sample_events(struct hists *self,
 }
 
 static int perf_evlist__tty_browse_hists(struct perf_evlist *evlist,
+                                        struct perf_report *rep,
                                         const char *help)
 {
        struct perf_evsel *pos;
@@ -241,18 +231,18 @@ static int perf_evlist__tty_browse_hists(struct perf_evlist *evlist,
            parent_pattern == default_parent_pattern) {
                fprintf(stdout, "#\n# (%s)\n#\n", help);
 
-               if (show_threads) {
-                       bool style = !strcmp(pretty_printing_style, "raw");
-                       perf_read_values_display(stdout, &show_threads_values,
+               if (rep->show_threads) {
+                       bool style = !strcmp(rep->pretty_printing_style, "raw");
+                       perf_read_values_display(stdout, &rep->show_threads_values,
                                                 style);
-                       perf_read_values_destroy(&show_threads_values);
+                       perf_read_values_destroy(&rep->show_threads_values);
                }
        }
 
        return 0;
 }
 
-static int __cmd_report(void)
+static int __cmd_report(struct perf_report *rep)
 {
        int ret = -EINVAL;
        u64 nr_samples;
@@ -264,27 +254,31 @@ static int __cmd_report(void)
 
        signal(SIGINT, sig_handler);
 
-       session = perf_session__new(input_name, O_RDONLY, force, false, &event_ops);
+       session = perf_session__new(rep->input_name, O_RDONLY,
+                                   rep->force, false, &rep->tool);
        if (session == NULL)
                return -ENOMEM;
 
-       if (cpu_list) {
-               ret = perf_session__cpu_bitmap(session, cpu_list, cpu_bitmap);
+       rep->session = session;
+
+       if (rep->cpu_list) {
+               ret = perf_session__cpu_bitmap(session, rep->cpu_list,
+                                              rep->cpu_bitmap);
                if (ret)
                        goto out_delete;
        }
 
        if (use_browser <= 0)
-               perf_session__fprintf_info(session, stdout, show_full_info);
+               perf_session__fprintf_info(session, stdout, rep->show_full_info);
 
-       if (show_threads)
-               perf_read_values_init(&show_threads_values);
+       if (rep->show_threads)
+               perf_read_values_init(&rep->show_threads_values);
 
-       ret = perf_session__setup_sample_type(session);
+       ret = perf_report__setup_sample_type(rep);
        if (ret)
                goto out_delete;
 
-       ret = perf_session__process_events(session, &event_ops);
+       ret = perf_session__process_events(session, &rep->tool);
        if (ret)
                goto out_delete;
 
@@ -327,7 +321,7 @@ static int __cmd_report(void)
        }
 
        if (nr_samples == 0) {
-               ui__warning("The %s file has no samples!\n", input_name);
+               ui__warning("The %s file has no samples!\n", session->filename);
                goto out_delete;
        }
 
@@ -335,7 +329,7 @@ static int __cmd_report(void)
                perf_evlist__tui_browse_hists(session->evlist, help,
                                              NULL, NULL, 0);
        } else
-               perf_evlist__tty_browse_hists(session->evlist, help);
+               perf_evlist__tty_browse_hists(session->evlist, rep, help);
 
 out_delete:
        /*
@@ -354,9 +348,9 @@ out_delete:
 }
 
 static int
-parse_callchain_opt(const struct option *opt __used, const char *arg,
-                   int unset)
+parse_callchain_opt(const struct option *opt, const char *arg, int unset)
 {
+       struct perf_report *rep = (struct perf_report *)opt->value;
        char *tok, *tok2;
        char *endptr;
 
@@ -364,7 +358,7 @@ parse_callchain_opt(const struct option *opt __used, const char *arg,
         * --no-call-graph
         */
        if (unset) {
-               dont_use_callchains = true;
+               rep->dont_use_callchains = true;
                return 0;
        }
 
@@ -412,7 +406,7 @@ parse_callchain_opt(const struct option *opt __used, const char *arg,
                goto setup;
 
        if (tok2[0] != 'c') {
-               callchain_param.print_limit = strtod(tok2, &endptr);
+               callchain_param.print_limit = strtoul(tok2, &endptr, 0);
                tok2 = strtok(NULL, ",");
                if (!tok2)
                        goto setup;
@@ -433,13 +427,34 @@ setup:
        return 0;
 }
 
-static const char * const report_usage[] = {
-       "perf report [<options>] <command>",
-       NULL
-};
-
-static const struct option options[] = {
-       OPT_STRING('i', "input", &input_name, "file",
+int cmd_report(int argc, const char **argv, const char *prefix __used)
+{
+       struct stat st;
+       char callchain_default_opt[] = "fractal,0.5,callee";
+       const char * const report_usage[] = {
+               "perf report [<options>]",
+               NULL
+       };
+       struct perf_report report = {
+               .tool = {
+                       .sample          = process_sample_event,
+                       .mmap            = perf_event__process_mmap,
+                       .comm            = perf_event__process_comm,
+                       .exit            = perf_event__process_task,
+                       .fork            = perf_event__process_task,
+                       .lost            = perf_event__process_lost,
+                       .read            = process_read_event,
+                       .attr            = perf_event__process_attr,
+                       .event_type      = perf_event__process_event_type,
+                       .tracing_data    = perf_event__process_tracing_data,
+                       .build_id        = perf_event__process_build_id,
+                       .ordered_samples = true,
+                       .ordering_requires_timestamps = true,
+               },
+               .pretty_printing_style   = "normal",
+       };
+       const struct option options[] = {
+       OPT_STRING('i', "input", &report.input_name, "file",
                    "input file name"),
        OPT_INCR('v', "verbose", &verbose,
                    "be more verbose (show symbol address, etc)"),
@@ -449,17 +464,18 @@ static const struct option options[] = {
                   "file", "vmlinux pathname"),
        OPT_STRING(0, "kallsyms", &symbol_conf.kallsyms_name,
                   "file", "kallsyms pathname"),
-       OPT_BOOLEAN('f', "force", &force, "don't complain, do it"),
+       OPT_BOOLEAN('f', "force", &report.force, "don't complain, do it"),
        OPT_BOOLEAN('m', "modules", &symbol_conf.use_modules,
                    "load module symbols - WARNING: use only with -k and LIVE kernel"),
        OPT_BOOLEAN('n', "show-nr-samples", &symbol_conf.show_nr_samples,
                    "Show a column with the number of samples"),
-       OPT_BOOLEAN('T', "threads", &show_threads,
+       OPT_BOOLEAN('T', "threads", &report.show_threads,
                    "Show per-thread event counters"),
-       OPT_STRING(0, "pretty", &pretty_printing_style, "key",
+       OPT_STRING(0, "pretty", &report.pretty_printing_style, "key",
                   "pretty printing style key: normal raw"),
-       OPT_BOOLEAN(0, "tui", &use_tui, "Use the TUI interface"),
-       OPT_BOOLEAN(0, "stdio", &use_stdio, "Use the stdio interface"),
+       OPT_BOOLEAN(0, "tui", &report.use_tui, "Use the TUI interface"),
+       OPT_BOOLEAN(0, "stdio", &report.use_stdio,
+                   "Use the stdio interface"),
        OPT_STRING('s', "sort", &sort_order, "key[,key2...]",
                   "sort by key(s): pid, comm, dso, symbol, parent"),
        OPT_BOOLEAN(0, "showcpuutilization", &symbol_conf.show_cpu_utilization,
@@ -468,13 +484,14 @@ static const struct option options[] = {
                   "regex filter to identify parent, see: '--sort parent'"),
        OPT_BOOLEAN('x', "exclude-other", &symbol_conf.exclude_other,
                    "Only display entries with parent-match"),
-       OPT_CALLBACK_DEFAULT('g', "call-graph", NULL, "output_type,min_percent, call_order",
-                    "Display callchains using output_type (graph, flat, fractal, or none) , min percent threshold and callchain order. "
+       OPT_CALLBACK_DEFAULT('g', "call-graph", &report, "output_type,min_percent[,print_limit],call_order",
+                    "Display callchains using output_type (graph, flat, fractal, or none) , min percent threshold, optional print limit and callchain order. "
                     "Default: fractal,0.5,callee", &parse_callchain_opt, callchain_default_opt),
-       OPT_BOOLEAN('G', "inverted", &inverted_callchain, "alias for inverted call graph"),
+       OPT_BOOLEAN('G', "inverted", &report.inverted_callchain,
+                   "alias for inverted call graph"),
        OPT_STRING('d', "dsos", &symbol_conf.dso_list_str, "dso[,dso...]",
                   "only consider symbols in these dsos"),
-       OPT_STRING('C', "comms", &symbol_conf.comm_list_str, "comm[,comm...]",
+       OPT_STRING('c', "comms", &symbol_conf.comm_list_str, "comm[,comm...]",
                   "only consider symbols in these comms"),
        OPT_STRING('S', "symbols", &symbol_conf.sym_list_str, "symbol[,symbol...]",
                   "only consider these symbols"),
@@ -484,12 +501,13 @@ static const struct option options[] = {
        OPT_STRING('t', "field-separator", &symbol_conf.field_sep, "separator",
                   "separator for columns, no spaces will be added between "
                   "columns '.' is reserved."),
-       OPT_BOOLEAN('U', "hide-unresolved", &hide_unresolved,
+       OPT_BOOLEAN('U', "hide-unresolved", &report.hide_unresolved,
                    "Only display entries resolved to a symbol"),
        OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory",
                    "Look for files with symbols relative to this directory"),
-       OPT_STRING('c', "cpu", &cpu_list, "cpu", "list of cpus to profile"),
-       OPT_BOOLEAN('I', "show-info", &show_full_info,
+       OPT_STRING('C', "cpu", &report.cpu_list, "cpu",
+                  "list of cpus to profile"),
+       OPT_BOOLEAN('I', "show-info", &report.show_full_info,
                    "Display extended information about perf.data file"),
        OPT_BOOLEAN(0, "source", &symbol_conf.annotate_src,
                    "Interleave source code with assembly code (default)"),
@@ -500,24 +518,30 @@ static const struct option options[] = {
        OPT_BOOLEAN(0, "show-total-period", &symbol_conf.show_total_period,
                    "Show a column with the sum of periods"),
        OPT_END()
-};
+       };
 
-int cmd_report(int argc, const char **argv, const char *prefix __used)
-{
        argc = parse_options(argc, argv, options, report_usage, 0);
 
-       if (use_stdio)
+       if (report.use_stdio)
                use_browser = 0;
-       else if (use_tui)
+       else if (report.use_tui)
                use_browser = 1;
 
-       if (inverted_callchain)
+       if (report.inverted_callchain)
                callchain_param.order = ORDER_CALLER;
 
-       if (strcmp(input_name, "-") != 0)
+       if (!report.input_name || !strlen(report.input_name)) {
+               if (!fstat(STDIN_FILENO, &st) && S_ISFIFO(st.st_mode))
+                       report.input_name = "-";
+               else
+                       report.input_name = "perf.data";
+       }
+
+       if (strcmp(report.input_name, "-") != 0)
                setup_browser(true);
        else
                use_browser = 0;
+
        /*
         * Only in the newt browser we are doing integrated annotation,
         * so don't allocate extra space that won't be used in the stdio
@@ -525,7 +549,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __used)
         */
        if (use_browser > 0) {
                symbol_conf.priv_size = sizeof(struct annotation);
-               annotate_init         = symbol__annotate_init;
+               report.annotate_init  = symbol__annotate_init;
                /*
                 * For searching by name on the "Browse map details".
                 * providing it only in verbose mode not to bloat too
@@ -572,5 +596,5 @@ int cmd_report(int argc, const char **argv, const char *prefix __used)
        sort_entry__setup_elide(&sort_comm, symbol_conf.comm_list, "comm", stdout);
        sort_entry__setup_elide(&sort_sym, symbol_conf.sym_list, "symbol", stdout);
 
-       return __cmd_report();
+       return __cmd_report(&report);
 }
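
Editorial note: a pattern worth calling out in the converted callbacks above is container_of(tool, struct perf_report, tool): the session layer hands the callbacks only the embedded struct perf_tool, and each builtin walks back from it to its enclosing per-command state. A minimal, self-contained illustration with made-up type names; the macro shown is the classic offsetof() formulation rather than perf's own header.

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* The generic callback interface knows only about this... */
struct tool {
	void (*sample)(struct tool *tool, int value);
};

/* ...while each command embeds it inside its own state. */
struct report {
	struct tool	tool;
	int		nr_samples;
};

static void report_sample(struct tool *tool, int value)
{
	/* Walk back from the embedded member to the enclosing struct. */
	struct report *rep = container_of(tool, struct report, tool);

	rep->nr_samples += value;
}

int main(void)
{
	struct report rep = { .tool = { .sample = report_sample } };

	rep.tool.sample(&rep.tool, 3);
	printf("%d\n", rep.nr_samples);		/* prints 3 */
	return 0;
}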
index 5177964..fb8b5f8 100644 (file)
@@ -2,11 +2,14 @@
 #include "perf.h"
 
 #include "util/util.h"
+#include "util/evlist.h"
 #include "util/cache.h"
+#include "util/evsel.h"
 #include "util/symbol.h"
 #include "util/thread.h"
 #include "util/header.h"
 #include "util/session.h"
+#include "util/tool.h"
 
 #include "util/parse-options.h"
 #include "util/trace-event.h"
@@ -19,7 +22,7 @@
 #include <pthread.h>
 #include <math.h>
 
-static char                    const *input_name = "perf.data";
+static const char              *input_name;
 
 static char                    default_sort_order[] = "avg, max, switch, runtime";
 static const char              *sort_order = default_sort_order;
@@ -723,21 +726,21 @@ struct trace_migrate_task_event {
 
 struct trace_sched_handler {
        void (*switch_event)(struct trace_switch_event *,
-                            struct perf_session *,
+                            struct machine *,
                             struct event *,
                             int cpu,
                             u64 timestamp,
                             struct thread *thread);
 
        void (*runtime_event)(struct trace_runtime_event *,
-                             struct perf_session *,
+                             struct machine *,
                              struct event *,
                              int cpu,
                              u64 timestamp,
                              struct thread *thread);
 
        void (*wakeup_event)(struct trace_wakeup_event *,
-                            struct perf_session *,
+                            struct machine *,
                             struct event *,
                             int cpu,
                             u64 timestamp,
@@ -750,7 +753,7 @@ struct trace_sched_handler {
                           struct thread *thread);
 
        void (*migrate_task_event)(struct trace_migrate_task_event *,
-                          struct perf_session *session,
+                          struct machine *machine,
                           struct event *,
                           int cpu,
                           u64 timestamp,
@@ -760,7 +763,7 @@ struct trace_sched_handler {
 
 static void
 replay_wakeup_event(struct trace_wakeup_event *wakeup_event,
-                   struct perf_session *session __used,
+                   struct machine *machine __used,
                    struct event *event,
                    int cpu __used,
                    u64 timestamp __used,
@@ -787,7 +790,7 @@ static u64 cpu_last_switched[MAX_CPUS];
 
 static void
 replay_switch_event(struct trace_switch_event *switch_event,
-                   struct perf_session *session __used,
+                   struct machine *machine __used,
                    struct event *event,
                    int cpu,
                    u64 timestamp,
@@ -1021,7 +1024,7 @@ add_sched_in_event(struct work_atoms *atoms, u64 timestamp)
 
 static void
 latency_switch_event(struct trace_switch_event *switch_event,
-                    struct perf_session *session,
+                    struct machine *machine,
                     struct event *event __used,
                     int cpu,
                     u64 timestamp,
@@ -1045,8 +1048,8 @@ latency_switch_event(struct trace_switch_event *switch_event,
                die("hm, delta: %" PRIu64 " < 0 ?\n", delta);
 
 
-       sched_out = perf_session__findnew(session, switch_event->prev_pid);
-       sched_in = perf_session__findnew(session, switch_event->next_pid);
+       sched_out = machine__findnew_thread(machine, switch_event->prev_pid);
+       sched_in = machine__findnew_thread(machine, switch_event->next_pid);
 
        out_events = thread_atoms_search(&atom_root, sched_out, &cmp_pid);
        if (!out_events) {
@@ -1074,13 +1077,13 @@ latency_switch_event(struct trace_switch_event *switch_event,
 
 static void
 latency_runtime_event(struct trace_runtime_event *runtime_event,
-                    struct perf_session *session,
+                    struct machine *machine,
                     struct event *event __used,
                     int cpu,
                     u64 timestamp,
                     struct thread *this_thread __used)
 {
-       struct thread *thread = perf_session__findnew(session, runtime_event->pid);
+       struct thread *thread = machine__findnew_thread(machine, runtime_event->pid);
        struct work_atoms *atoms = thread_atoms_search(&atom_root, thread, &cmp_pid);
 
        BUG_ON(cpu >= MAX_CPUS || cpu < 0);
@@ -1097,7 +1100,7 @@ latency_runtime_event(struct trace_runtime_event *runtime_event,
 
 static void
 latency_wakeup_event(struct trace_wakeup_event *wakeup_event,
-                    struct perf_session *session,
+                    struct machine *machine,
                     struct event *__event __used,
                     int cpu __used,
                     u64 timestamp,
@@ -1111,7 +1114,7 @@ latency_wakeup_event(struct trace_wakeup_event *wakeup_event,
        if (!wakeup_event->success)
                return;
 
-       wakee = perf_session__findnew(session, wakeup_event->pid);
+       wakee = machine__findnew_thread(machine, wakeup_event->pid);
        atoms = thread_atoms_search(&atom_root, wakee, &cmp_pid);
        if (!atoms) {
                thread_atoms_insert(wakee);
@@ -1145,7 +1148,7 @@ latency_wakeup_event(struct trace_wakeup_event *wakeup_event,
 
 static void
 latency_migrate_task_event(struct trace_migrate_task_event *migrate_task_event,
-                    struct perf_session *session,
+                    struct machine *machine,
                     struct event *__event __used,
                     int cpu __used,
                     u64 timestamp,
@@ -1161,7 +1164,7 @@ latency_migrate_task_event(struct trace_migrate_task_event *migrate_task_event,
        if (profile_cpu == -1)
                return;
 
-       migrant = perf_session__findnew(session, migrate_task_event->pid);
+       migrant = machine__findnew_thread(machine, migrate_task_event->pid);
        atoms = thread_atoms_search(&atom_root, migrant, &cmp_pid);
        if (!atoms) {
                thread_atoms_insert(migrant);
@@ -1356,12 +1359,13 @@ static void sort_lat(void)
 static struct trace_sched_handler *trace_handler;
 
 static void
-process_sched_wakeup_event(void *data, struct perf_session *session,
+process_sched_wakeup_event(struct perf_tool *tool __used,
                           struct event *event,
-                          int cpu __used,
-                          u64 timestamp __used,
-                          struct thread *thread __used)
+                          struct perf_sample *sample,
+                          struct machine *machine,
+                          struct thread *thread)
 {
+       void *data = sample->raw_data;
        struct trace_wakeup_event wakeup_event;
 
        FILL_COMMON_FIELDS(wakeup_event, event, data);
@@ -1373,8 +1377,8 @@ process_sched_wakeup_event(void *data, struct perf_session *session,
        FILL_FIELD(wakeup_event, cpu, event, data);
 
        if (trace_handler->wakeup_event)
-               trace_handler->wakeup_event(&wakeup_event, session, event,
-                                           cpu, timestamp, thread);
+               trace_handler->wakeup_event(&wakeup_event, machine, event,
+                                           sample->cpu, sample->time, thread);
 }
 
 /*
@@ -1392,7 +1396,7 @@ static char next_shortname2 = '0';
 
 static void
 map_switch_event(struct trace_switch_event *switch_event,
-                struct perf_session *session,
+                struct machine *machine,
                 struct event *event __used,
                 int this_cpu,
                 u64 timestamp,
@@ -1420,8 +1424,8 @@ map_switch_event(struct trace_switch_event *switch_event,
                die("hm, delta: %" PRIu64 " < 0 ?\n", delta);
 
 
-       sched_out = perf_session__findnew(session, switch_event->prev_pid);
-       sched_in = perf_session__findnew(session, switch_event->next_pid);
+       sched_out = machine__findnew_thread(machine, switch_event->prev_pid);
+       sched_in = machine__findnew_thread(machine, switch_event->next_pid);
 
        curr_thread[this_cpu] = sched_in;
 
@@ -1469,14 +1473,15 @@ map_switch_event(struct trace_switch_event *switch_event,
        }
 }
 
-
 static void
-process_sched_switch_event(void *data, struct perf_session *session,
+process_sched_switch_event(struct perf_tool *tool __used,
                           struct event *event,
-                          int this_cpu,
-                          u64 timestamp __used,
-                          struct thread *thread __used)
+                          struct perf_sample *sample,
+                          struct machine *machine,
+                          struct thread *thread)
 {
+       int this_cpu = sample->cpu;
+       void *data = sample->raw_data;
        struct trace_switch_event switch_event;
 
        FILL_COMMON_FIELDS(switch_event, event, data);
@@ -1498,19 +1503,20 @@ process_sched_switch_event(void *data, struct perf_session *session,
                        nr_context_switch_bugs++;
        }
        if (trace_handler->switch_event)
-               trace_handler->switch_event(&switch_event, session, event,
-                                           this_cpu, timestamp, thread);
+               trace_handler->switch_event(&switch_event, machine, event,
+                                           this_cpu, sample->time, thread);
 
        curr_pid[this_cpu] = switch_event.next_pid;
 }
 
 static void
-process_sched_runtime_event(void *data, struct perf_session *session,
-                          struct event *event,
-                          int cpu __used,
-                          u64 timestamp __used,
-                          struct thread *thread __used)
+process_sched_runtime_event(struct perf_tool *tool __used,
+                           struct event *event,
+                           struct perf_sample *sample,
+                           struct machine *machine,
+                           struct thread *thread)
 {
+       void *data = sample->raw_data;
        struct trace_runtime_event runtime_event;
 
        FILL_ARRAY(runtime_event, comm, event, data);
@@ -1519,16 +1525,18 @@ process_sched_runtime_event(void *data, struct perf_session *session,
        FILL_FIELD(runtime_event, vruntime, event, data);
 
        if (trace_handler->runtime_event)
-               trace_handler->runtime_event(&runtime_event, session, event, cpu, timestamp, thread);
+               trace_handler->runtime_event(&runtime_event, machine, event,
+                                            sample->cpu, sample->time, thread);
 }
 
 static void
-process_sched_fork_event(void *data,
+process_sched_fork_event(struct perf_tool *tool __used,
                         struct event *event,
-                        int cpu __used,
-                        u64 timestamp __used,
-                        struct thread *thread __used)
+                        struct perf_sample *sample,
+                        struct machine *machine __used,
+                        struct thread *thread)
 {
+       void *data = sample->raw_data;
        struct trace_fork_event fork_event;
 
        FILL_COMMON_FIELDS(fork_event, event, data);
@@ -1540,13 +1548,14 @@ process_sched_fork_event(void *data,
 
        if (trace_handler->fork_event)
                trace_handler->fork_event(&fork_event, event,
-                                         cpu, timestamp, thread);
+                                         sample->cpu, sample->time, thread);
 }
 
 static void
-process_sched_exit_event(struct event *event,
-                        int cpu __used,
-                        u64 timestamp __used,
+process_sched_exit_event(struct perf_tool *tool __used,
+                        struct event *event,
+                        struct perf_sample *sample __used,
+                        struct machine *machine __used,
                         struct thread *thread __used)
 {
        if (verbose)
@@ -1554,12 +1563,13 @@ process_sched_exit_event(struct event *event,
 }
 
 static void
-process_sched_migrate_task_event(void *data, struct perf_session *session,
-                          struct event *event,
-                          int cpu __used,
-                          u64 timestamp __used,
-                          struct thread *thread __used)
+process_sched_migrate_task_event(struct perf_tool *tool __used,
+                                struct event *event,
+                                struct perf_sample *sample,
+                                struct machine *machine,
+                                struct thread *thread)
 {
+       void *data = sample->raw_data;
        struct trace_migrate_task_event migrate_task_event;
 
        FILL_COMMON_FIELDS(migrate_task_event, event, data);
@@ -1570,67 +1580,47 @@ process_sched_migrate_task_event(void *data, struct perf_session *session,
        FILL_FIELD(migrate_task_event, cpu, event, data);
 
        if (trace_handler->migrate_task_event)
-               trace_handler->migrate_task_event(&migrate_task_event, session,
-                                                event, cpu, timestamp, thread);
+               trace_handler->migrate_task_event(&migrate_task_event, machine,
+                                                 event, sample->cpu,
+                                                 sample->time, thread);
 }
 
-static void process_raw_event(union perf_event *raw_event __used,
-                             struct perf_session *session, void *data, int cpu,
-                             u64 timestamp, struct thread *thread)
-{
-       struct event *event;
-       int type;
-
-
-       type = trace_parse_common_type(data);
-       event = trace_find_event(type);
-
-       if (!strcmp(event->name, "sched_switch"))
-               process_sched_switch_event(data, session, event, cpu, timestamp, thread);
-       if (!strcmp(event->name, "sched_stat_runtime"))
-               process_sched_runtime_event(data, session, event, cpu, timestamp, thread);
-       if (!strcmp(event->name, "sched_wakeup"))
-               process_sched_wakeup_event(data, session, event, cpu, timestamp, thread);
-       if (!strcmp(event->name, "sched_wakeup_new"))
-               process_sched_wakeup_event(data, session, event, cpu, timestamp, thread);
-       if (!strcmp(event->name, "sched_process_fork"))
-               process_sched_fork_event(data, event, cpu, timestamp, thread);
-       if (!strcmp(event->name, "sched_process_exit"))
-               process_sched_exit_event(event, cpu, timestamp, thread);
-       if (!strcmp(event->name, "sched_migrate_task"))
-               process_sched_migrate_task_event(data, session, event, cpu, timestamp, thread);
-}
+typedef void (*tracepoint_handler)(struct perf_tool *tool, struct event *event,
+                                  struct perf_sample *sample,
+                                  struct machine *machine,
+                                  struct thread *thread);
 
-static int process_sample_event(union perf_event *event,
-                               struct perf_sample *sample,
-                               struct perf_evsel *evsel __used,
-                               struct perf_session *session)
+static int perf_sched__process_tracepoint_sample(struct perf_tool *tool,
+                                                union perf_event *event __used,
+                                                struct perf_sample *sample,
+                                                struct perf_evsel *evsel,
+                                                struct machine *machine)
 {
-       struct thread *thread;
-
-       if (!(session->sample_type & PERF_SAMPLE_RAW))
-               return 0;
+       struct thread *thread = machine__findnew_thread(machine, sample->pid);
 
-       thread = perf_session__findnew(session, sample->pid);
        if (thread == NULL) {
-               pr_debug("problem processing %d event, skipping it.\n",
-                        event->header.type);
+               pr_debug("problem processing %s event, skipping it.\n",
+                        evsel->name);
                return -1;
        }
 
-       dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid);
+       evsel->hists.stats.total_period += sample->period;
+       hists__inc_nr_events(&evsel->hists, PERF_RECORD_SAMPLE);
 
-       if (profile_cpu != -1 && profile_cpu != (int)sample->cpu)
-               return 0;
+       if (evsel->handler.func != NULL) {
+               tracepoint_handler f = evsel->handler.func;
 
-       process_raw_event(event, session, sample->raw_data, sample->cpu,
-                         sample->time, thread);
+               if (evsel->handler.data == NULL)
+                       evsel->handler.data = trace_find_event(evsel->attr.config);
+
+               f(tool, evsel->handler.data, sample, machine, thread);
+       }
 
        return 0;
 }
 
-static struct perf_event_ops event_ops = {
-       .sample                 = process_sample_event,
+static struct perf_tool perf_sched = {
+       .sample                 = perf_sched__process_tracepoint_sample,
        .comm                   = perf_event__process_comm,
        .lost                   = perf_event__process_lost,
        .fork                   = perf_event__process_task,
@@ -1640,13 +1630,25 @@ static struct perf_event_ops event_ops = {
 static void read_events(bool destroy, struct perf_session **psession)
 {
        int err = -EINVAL;
+       const struct perf_evsel_str_handler handlers[] = {
+               { "sched:sched_switch",       process_sched_switch_event, },
+               { "sched:sched_stat_runtime", process_sched_runtime_event, },
+               { "sched:sched_wakeup",       process_sched_wakeup_event, },
+               { "sched:sched_wakeup_new",   process_sched_wakeup_event, },
+               { "sched:sched_process_fork", process_sched_fork_event, },
+               { "sched:sched_process_exit", process_sched_exit_event, },
+               { "sched:sched_migrate_task", process_sched_migrate_task_event, },
+       };
        struct perf_session *session = perf_session__new(input_name, O_RDONLY,
-                                                        0, false, &event_ops);
+                                                        0, false, &perf_sched);
        if (session == NULL)
                die("No Memory");
 
+       err = perf_evlist__set_tracepoints_handlers_array(session->evlist, handlers);
+       assert(err == 0);
+
        if (perf_session__has_traces(session, "record -R")) {
-               err = perf_session__process_events(session, &event_ops);
+               err = perf_session__process_events(session, &perf_sched);
                if (err)
                        die("Failed to process events, error %d", err);
 
index 2f62a29..fd1909a 100644 (file)
@@ -7,6 +7,7 @@
 #include "util/header.h"
 #include "util/parse-options.h"
 #include "util/session.h"
+#include "util/tool.h"
 #include "util/symbol.h"
 #include "util/thread.h"
 #include "util/trace-event.h"
@@ -23,6 +24,7 @@ static u64                    nr_unordered;
 extern const struct option     record_options[];
 static bool                    no_callchain;
 static bool                    show_full_info;
+static bool                    system_wide;
 static const char              *cpu_list;
 static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
 
@@ -315,7 +317,7 @@ static bool sample_addr_correlates_sym(struct perf_event_attr *attr)
 
 static void print_sample_addr(union perf_event *event,
                          struct perf_sample *sample,
-                         struct perf_session *session,
+                         struct machine *machine,
                          struct thread *thread,
                          struct perf_event_attr *attr)
 {
@@ -328,11 +330,11 @@ static void print_sample_addr(union perf_event *event,
        if (!sample_addr_correlates_sym(attr))
                return;
 
-       thread__find_addr_map(thread, session, cpumode, MAP__FUNCTION,
-                             event->ip.pid, sample->addr, &al);
+       thread__find_addr_map(thread, machine, cpumode, MAP__FUNCTION,
+                             sample->addr, &al);
        if (!al.map)
-               thread__find_addr_map(thread, session, cpumode, MAP__VARIABLE,
-                                     event->ip.pid, sample->addr, &al);
+               thread__find_addr_map(thread, machine, cpumode, MAP__VARIABLE,
+                                     sample->addr, &al);
 
        al.cpu = sample->cpu;
        al.sym = NULL;
@@ -362,7 +364,7 @@ static void print_sample_addr(union perf_event *event,
 static void process_event(union perf_event *event __unused,
                          struct perf_sample *sample,
                          struct perf_evsel *evsel,
-                         struct perf_session *session,
+                         struct machine *machine,
                          struct thread *thread)
 {
        struct perf_event_attr *attr = &evsel->attr;
@@ -377,15 +379,15 @@ static void process_event(union perf_event *event __unused,
                                  sample->raw_size);
 
        if (PRINT_FIELD(ADDR))
-               print_sample_addr(event, sample, session, thread, attr);
+               print_sample_addr(event, sample, machine, thread, attr);
 
        if (PRINT_FIELD(IP)) {
                if (!symbol_conf.use_callchain)
                        printf(" ");
                else
                        printf("\n");
-               perf_session__print_ip(event, sample, session,
-                                             PRINT_FIELD(SYM), PRINT_FIELD(DSO));
+               perf_event__print_ip(event, sample, machine, evsel,
+                                    PRINT_FIELD(SYM), PRINT_FIELD(DSO));
        }
 
        printf("\n");
@@ -432,14 +434,16 @@ static int cleanup_scripting(void)
        return scripting_ops->stop_script();
 }
 
-static char const              *input_name = "perf.data";
+static const char *input_name;
 
-static int process_sample_event(union perf_event *event,
+static int process_sample_event(struct perf_tool *tool __used,
+                               union perf_event *event,
                                struct perf_sample *sample,
                                struct perf_evsel *evsel,
-                               struct perf_session *session)
+                               struct machine *machine)
 {
-       struct thread *thread = perf_session__findnew(session, event->ip.pid);
+       struct addr_location al;
+       struct thread *thread = machine__findnew_thread(machine, event->ip.tid);
 
        if (thread == NULL) {
                pr_debug("problem processing %d event, skipping it.\n",
@@ -458,16 +462,25 @@ static int process_sample_event(union perf_event *event,
                return 0;
        }
 
+       if (perf_event__preprocess_sample(event, machine, &al, sample, 0) < 0) {
+               pr_err("problem processing %d event, skipping it.\n",
+                      event->header.type);
+               return -1;
+       }
+
+       if (al.filtered)
+               return 0;
+
        if (cpu_list && !test_bit(sample->cpu, cpu_bitmap))
                return 0;
 
-       scripting_ops->process_event(event, sample, evsel, session, thread);
+       scripting_ops->process_event(event, sample, evsel, machine, thread);
 
-       session->hists.stats.total_period += sample->period;
+       evsel->hists.stats.total_period += sample->period;
        return 0;
 }
 
-static struct perf_event_ops event_ops = {
+static struct perf_tool perf_script = {
        .sample          = process_sample_event,
        .mmap            = perf_event__process_mmap,
        .comm            = perf_event__process_comm,
@@ -494,7 +507,7 @@ static int __cmd_script(struct perf_session *session)
 
        signal(SIGINT, sig_handler);
 
-       ret = perf_session__process_events(session, &event_ops);
+       ret = perf_session__process_events(session, &perf_script);
 
        if (debug_mode)
                pr_err("Misordered timestamps: %" PRIu64 "\n", nr_unordered);
@@ -523,12 +536,6 @@ static struct script_spec *script_spec__new(const char *spec,
        return s;
 }
 
-static void script_spec__delete(struct script_spec *s)
-{
-       free(s->spec);
-       free(s);
-}
-
 static void script_spec__add(struct script_spec *s)
 {
        list_add_tail(&s->node, &script_specs);
@@ -554,16 +561,11 @@ static struct script_spec *script_spec__findnew(const char *spec,
 
        s = script_spec__new(spec, ops);
        if (!s)
-               goto out_delete_spec;
+               return NULL;
 
        script_spec__add(s);
 
        return s;
-
-out_delete_spec:
-       script_spec__delete(s);
-
-       return NULL;
 }
 
 int script_spec_register(const char *spec, struct scripting_ops *ops)
@@ -681,7 +683,8 @@ static int parse_output_fields(const struct option *opt __used,
                        type = PERF_TYPE_RAW;
                else {
                        fprintf(stderr, "Invalid event type in field string.\n");
-                       return -EINVAL;
+                       rc = -EINVAL;
+                       goto out;
                }
 
                if (output[type].user_set)
@@ -923,6 +926,24 @@ static int read_script_info(struct script_desc *desc, const char *filename)
        return 0;
 }
 
+static char *get_script_root(struct dirent *script_dirent, const char *suffix)
+{
+       char *script_root, *str;
+
+       script_root = strdup(script_dirent->d_name);
+       if (!script_root)
+               return NULL;
+
+       str = (char *)ends_with(script_root, suffix);
+       if (!str) {
+               free(script_root);
+               return NULL;
+       }
+
+       *str = '\0';
+       return script_root;
+}
+
 static int list_available_scripts(const struct option *opt __used,
                                  const char *s __used, int unset __used)
 {
@@ -934,7 +955,6 @@ static int list_available_scripts(const struct option *opt __used,
        struct script_desc *desc;
        char first_half[BUFSIZ];
        char *script_root;
-       char *str;
 
        snprintf(scripts_path, MAXPATHLEN, "%s/scripts", perf_exec_path());
 
@@ -950,16 +970,14 @@ static int list_available_scripts(const struct option *opt __used,
                        continue;
 
                for_each_script(lang_path, lang_dir, script_dirent, script_next) {
-                       script_root = strdup(script_dirent.d_name);
-                       str = (char *)ends_with(script_root, REPORT_SUFFIX);
-                       if (str) {
-                               *str = '\0';
+                       script_root = get_script_root(&script_dirent, REPORT_SUFFIX);
+                       if (script_root) {
                                desc = script_desc__findnew(script_root);
                                snprintf(script_path, MAXPATHLEN, "%s/%s",
                                         lang_path, script_dirent.d_name);
                                read_script_info(desc, script_path);
+                               free(script_root);
                        }
-                       free(script_root);
                }
        }
 
@@ -981,8 +999,7 @@ static char *get_script_path(const char *script_root, const char *suffix)
        char script_path[MAXPATHLEN];
        DIR *scripts_dir, *lang_dir;
        char lang_path[MAXPATHLEN];
-       char *str, *__script_root;
-       char *path = NULL;
+       char *__script_root;
 
        snprintf(scripts_path, MAXPATHLEN, "%s/scripts", perf_exec_path());
 
@@ -998,23 +1015,18 @@ static char *get_script_path(const char *script_root, const char *suffix)
                        continue;
 
                for_each_script(lang_path, lang_dir, script_dirent, script_next) {
-                       __script_root = strdup(script_dirent.d_name);
-                       str = (char *)ends_with(__script_root, suffix);
-                       if (str) {
-                               *str = '\0';
-                               if (strcmp(__script_root, script_root))
-                                       continue;
+                       __script_root = get_script_root(&script_dirent, suffix);
+                       if (__script_root && !strcmp(script_root, __script_root)) {
+                               free(__script_root);
                                snprintf(script_path, MAXPATHLEN, "%s/%s",
                                         lang_path, script_dirent.d_name);
-                               path = strdup(script_path);
-                               free(__script_root);
-                               break;
+                               return strdup(script_path);
                        }
                        free(__script_root);
                }
        }
 
-       return path;
+       return NULL;
 }
 
 static bool is_top_script(const char *script_path)
@@ -1083,7 +1095,11 @@ static const struct option options[] = {
        OPT_CALLBACK('f', "fields", NULL, "str",
                     "comma separated output fields prepend with 'type:'. Valid types: hw,sw,trace,raw. Fields: comm,tid,pid,time,cpu,event,trace,ip,sym,dso,addr",
                     parse_output_fields),
-       OPT_STRING('c', "cpu", &cpu_list, "cpu", "list of cpus to profile"),
+       OPT_BOOLEAN('a', "all-cpus", &system_wide,
+                    "system-wide collection from all CPUs"),
+       OPT_STRING('C', "cpu", &cpu_list, "cpu", "list of cpus to profile"),
+       OPT_STRING('c', "comms", &symbol_conf.comm_list_str, "comm[,comm...]",
+                  "only display events for these comms"),
        OPT_BOOLEAN('I', "show-info", &show_full_info,
                    "display extended information from perf.data file"),
        OPT_END()
@@ -1110,7 +1126,6 @@ int cmd_script(int argc, const char **argv, const char *prefix __used)
        struct perf_session *session;
        char *script_path = NULL;
        const char **__argv;
-       bool system_wide;
        int i, j, err;
 
        setup_scripting();
@@ -1178,15 +1193,17 @@ int cmd_script(int argc, const char **argv, const char *prefix __used)
                }
 
                if (!pid) {
-                       system_wide = true;
                        j = 0;
 
                        dup2(live_pipe[1], 1);
                        close(live_pipe[0]);
 
-                       if (!is_top_script(argv[0]))
+                       if (is_top_script(argv[0])) {
+                               system_wide = true;
+                       } else if (!system_wide) {
                                system_wide = !have_cmd(argc - rep_args,
                                                        &argv[rep_args]);
+                       }
 
                        __argv = malloc((argc + 6) * sizeof(const char *));
                        if (!__argv)
@@ -1234,10 +1251,11 @@ int cmd_script(int argc, const char **argv, const char *prefix __used)
                script_path = rep_script_path;
 
        if (script_path) {
-               system_wide = false;
                j = 0;
 
-               if (rec_script_path)
+               if (!rec_script_path)
+                       system_wide = false;
+               else if (!system_wide)
                        system_wide = !have_cmd(argc - 1, &argv[1]);
 
                __argv = malloc((argc + 2) * sizeof(const char *));
@@ -1261,7 +1279,7 @@ int cmd_script(int argc, const char **argv, const char *prefix __used)
        if (!script_name)
                setup_pager();
 
-       session = perf_session__new(input_name, O_RDONLY, 0, false, &event_ops);
+       session = perf_session__new(input_name, O_RDONLY, 0, false, &perf_script);
        if (session == NULL)
                return -ENOMEM;
 
@@ -1287,7 +1305,7 @@ int cmd_script(int argc, const char **argv, const char *prefix __used)
                        return -1;
                }
 
-               input = open(input_name, O_RDONLY);
+               input = open(session->filename, O_RDONLY);      /* input_name */
                if (input < 0) {
                        perror("failed to open file");
                        exit(-1);
index 955930e..f5d2a63 100644 (file)
@@ -578,6 +578,33 @@ static void nsec_printout(int cpu, struct perf_evsel *evsel, double avg)
                        avg / avg_stats(&walltime_nsecs_stats));
 }
 
+/* used for get_ratio_color() */
+enum grc_type {
+       GRC_STALLED_CYCLES_FE,
+       GRC_STALLED_CYCLES_BE,
+       GRC_CACHE_MISSES,
+       GRC_MAX_NR
+};
+
+static const char *get_ratio_color(enum grc_type type, double ratio)
+{
+       static const double grc_table[GRC_MAX_NR][3] = {
+               [GRC_STALLED_CYCLES_FE] = { 50.0, 30.0, 10.0 },
+               [GRC_STALLED_CYCLES_BE] = { 75.0, 50.0, 20.0 },
+               [GRC_CACHE_MISSES]      = { 20.0, 10.0, 5.0 },
+       };
+       const char *color = PERF_COLOR_NORMAL;
+
+       if (ratio > grc_table[type][0])
+               color = PERF_COLOR_RED;
+       else if (ratio > grc_table[type][1])
+               color = PERF_COLOR_MAGENTA;
+       else if (ratio > grc_table[type][2])
+               color = PERF_COLOR_YELLOW;
+
+       return color;
+}
+
 static void print_stalled_cycles_frontend(int cpu, struct perf_evsel *evsel __used, double avg)
 {
        double total, ratio = 0.0;
@@ -588,13 +615,7 @@ static void print_stalled_cycles_frontend(int cpu, struct perf_evsel *evsel __us
        if (total)
                ratio = avg / total * 100.0;
 
-       color = PERF_COLOR_NORMAL;
-       if (ratio > 50.0)
-               color = PERF_COLOR_RED;
-       else if (ratio > 30.0)
-               color = PERF_COLOR_MAGENTA;
-       else if (ratio > 10.0)
-               color = PERF_COLOR_YELLOW;
+       color = get_ratio_color(GRC_STALLED_CYCLES_FE, ratio);
 
        fprintf(output, " #  ");
        color_fprintf(output, color, "%6.2f%%", ratio);
@@ -611,13 +632,7 @@ static void print_stalled_cycles_backend(int cpu, struct perf_evsel *evsel __use
        if (total)
                ratio = avg / total * 100.0;
 
-       color = PERF_COLOR_NORMAL;
-       if (ratio > 75.0)
-               color = PERF_COLOR_RED;
-       else if (ratio > 50.0)
-               color = PERF_COLOR_MAGENTA;
-       else if (ratio > 20.0)
-               color = PERF_COLOR_YELLOW;
+       color = get_ratio_color(GRC_STALLED_CYCLES_BE, ratio);
 
        fprintf(output, " #  ");
        color_fprintf(output, color, "%6.2f%%", ratio);
@@ -634,13 +649,7 @@ static void print_branch_misses(int cpu, struct perf_evsel *evsel __used, double
        if (total)
                ratio = avg / total * 100.0;
 
-       color = PERF_COLOR_NORMAL;
-       if (ratio > 20.0)
-               color = PERF_COLOR_RED;
-       else if (ratio > 10.0)
-               color = PERF_COLOR_MAGENTA;
-       else if (ratio > 5.0)
-               color = PERF_COLOR_YELLOW;
+       color = get_ratio_color(GRC_CACHE_MISSES, ratio);
 
        fprintf(output, " #  ");
        color_fprintf(output, color, "%6.2f%%", ratio);
@@ -657,13 +666,7 @@ static void print_l1_dcache_misses(int cpu, struct perf_evsel *evsel __used, dou
        if (total)
                ratio = avg / total * 100.0;
 
-       color = PERF_COLOR_NORMAL;
-       if (ratio > 20.0)
-               color = PERF_COLOR_RED;
-       else if (ratio > 10.0)
-               color = PERF_COLOR_MAGENTA;
-       else if (ratio > 5.0)
-               color = PERF_COLOR_YELLOW;
+       color = get_ratio_color(GRC_CACHE_MISSES, ratio);
 
        fprintf(output, " #  ");
        color_fprintf(output, color, "%6.2f%%", ratio);
@@ -680,13 +683,7 @@ static void print_l1_icache_misses(int cpu, struct perf_evsel *evsel __used, dou
        if (total)
                ratio = avg / total * 100.0;
 
-       color = PERF_COLOR_NORMAL;
-       if (ratio > 20.0)
-               color = PERF_COLOR_RED;
-       else if (ratio > 10.0)
-               color = PERF_COLOR_MAGENTA;
-       else if (ratio > 5.0)
-               color = PERF_COLOR_YELLOW;
+       color = get_ratio_color(GRC_CACHE_MISSES, ratio);
 
        fprintf(output, " #  ");
        color_fprintf(output, color, "%6.2f%%", ratio);
@@ -703,13 +700,7 @@ static void print_dtlb_cache_misses(int cpu, struct perf_evsel *evsel __used, do
        if (total)
                ratio = avg / total * 100.0;
 
-       color = PERF_COLOR_NORMAL;
-       if (ratio > 20.0)
-               color = PERF_COLOR_RED;
-       else if (ratio > 10.0)
-               color = PERF_COLOR_MAGENTA;
-       else if (ratio > 5.0)
-               color = PERF_COLOR_YELLOW;
+       color = get_ratio_color(GRC_CACHE_MISSES, ratio);
 
        fprintf(output, " #  ");
        color_fprintf(output, color, "%6.2f%%", ratio);
@@ -726,13 +717,7 @@ static void print_itlb_cache_misses(int cpu, struct perf_evsel *evsel __used, do
        if (total)
                ratio = avg / total * 100.0;
 
-       color = PERF_COLOR_NORMAL;
-       if (ratio > 20.0)
-               color = PERF_COLOR_RED;
-       else if (ratio > 10.0)
-               color = PERF_COLOR_MAGENTA;
-       else if (ratio > 5.0)
-               color = PERF_COLOR_YELLOW;
+       color = get_ratio_color(GRC_CACHE_MISSES, ratio);
 
        fprintf(output, " #  ");
        color_fprintf(output, color, "%6.2f%%", ratio);
@@ -749,13 +734,7 @@ static void print_ll_cache_misses(int cpu, struct perf_evsel *evsel __used, doub
        if (total)
                ratio = avg / total * 100.0;
 
-       color = PERF_COLOR_NORMAL;
-       if (ratio > 20.0)
-               color = PERF_COLOR_RED;
-       else if (ratio > 10.0)
-               color = PERF_COLOR_MAGENTA;
-       else if (ratio > 5.0)
-               color = PERF_COLOR_YELLOW;
+       color = get_ratio_color(GRC_CACHE_MISSES, ratio);
 
        fprintf(output, " #  ");
        color_fprintf(output, color, "%6.2f%%", ratio);
@@ -1108,22 +1087,13 @@ static const struct option options[] = {
  */
 static int add_default_attributes(void)
 {
-       struct perf_evsel *pos;
-       size_t attr_nr = 0;
-       size_t c;
-
        /* Set attrs if no event is selected and !null_run: */
        if (null_run)
                return 0;
 
        if (!evsel_list->nr_entries) {
-               for (c = 0; c < ARRAY_SIZE(default_attrs); c++) {
-                       pos = perf_evsel__new(default_attrs + c, c + attr_nr);
-                       if (pos == NULL)
-                               return -1;
-                       perf_evlist__add(evsel_list, pos);
-               }
-               attr_nr += c;
+               if (perf_evlist__add_attrs_array(evsel_list, default_attrs) < 0)
+                       return -1;
        }
 
        /* Detailed events get appended to the event list: */
@@ -1132,38 +1102,21 @@ static int add_default_attributes(void)
                return 0;
 
        /* Append detailed run extra attributes: */
-       for (c = 0; c < ARRAY_SIZE(detailed_attrs); c++) {
-               pos = perf_evsel__new(detailed_attrs + c, c + attr_nr);
-               if (pos == NULL)
-                       return -1;
-               perf_evlist__add(evsel_list, pos);
-       }
-       attr_nr += c;
+       if (perf_evlist__add_attrs_array(evsel_list, detailed_attrs) < 0)
+               return -1;
 
        if (detailed_run < 2)
                return 0;
 
        /* Append very detailed run extra attributes: */
-       for (c = 0; c < ARRAY_SIZE(very_detailed_attrs); c++) {
-               pos = perf_evsel__new(very_detailed_attrs + c, c + attr_nr);
-               if (pos == NULL)
-                       return -1;
-               perf_evlist__add(evsel_list, pos);
-       }
+       if (perf_evlist__add_attrs_array(evsel_list, very_detailed_attrs) < 0)
+               return -1;
 
        if (detailed_run < 3)
                return 0;
 
        /* Append very, very detailed run extra attributes: */
-       for (c = 0; c < ARRAY_SIZE(very_very_detailed_attrs); c++) {
-               pos = perf_evsel__new(very_very_detailed_attrs + c, c + attr_nr);
-               if (pos == NULL)
-                       return -1;
-               perf_evlist__add(evsel_list, pos);
-       }
-
-
-       return 0;
+       return perf_evlist__add_attrs_array(evsel_list, very_very_detailed_attrs);
 }
 
 int cmd_stat(int argc, const char **argv, const char *prefix __used)
@@ -1267,8 +1220,7 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used)
 
        list_for_each_entry(pos, &evsel_list->entries, node) {
                if (perf_evsel__alloc_stat_priv(pos) < 0 ||
-                   perf_evsel__alloc_counts(pos, evsel_list->cpus->nr) < 0 ||
-                   perf_evsel__alloc_fd(pos, evsel_list->cpus->nr, evsel_list->threads->nr) < 0)
+                   perf_evsel__alloc_counts(pos, evsel_list->cpus->nr) < 0)
                        goto out_free_fd;
        }
 
index 831d1ba..2b9a7f4 100644 (file)
@@ -7,6 +7,7 @@
 
 #include "util/cache.h"
 #include "util/debug.h"
+#include "util/debugfs.h"
 #include "util/evlist.h"
 #include "util/parse-options.h"
 #include "util/parse-events.h"
@@ -14,8 +15,6 @@
 #include "util/thread_map.h"
 #include "../../include/linux/hw_breakpoint.h"
 
-static long page_size;
-
 static int vmlinux_matches_kallsyms_filter(struct map *map __used, struct symbol *sym)
 {
        bool *visited = symbol__priv(sym);
@@ -31,6 +30,7 @@ static int test__vmlinux_matches_kallsyms(void)
        struct map *kallsyms_map, *vmlinux_map;
        struct machine kallsyms, vmlinux;
        enum map_type type = MAP__FUNCTION;
+       long page_size = sysconf(_SC_PAGE_SIZE);
        struct ref_reloc_sym ref_reloc_sym = { .name = "_stext", };
 
        /*
@@ -247,7 +247,7 @@ static int trace_event__id(const char *evname)
 
        if (asprintf(&filename,
                     "%s/syscalls/%s/id",
-                    debugfs_path, evname) < 0)
+                    tracing_events_path, evname) < 0)
                return -1;
 
        fd = open(filename, O_RDONLY);
@@ -603,7 +603,7 @@ out_free_threads:
 
 #define TEST_ASSERT_VAL(text, cond) \
 do { \
-       if (!cond) { \
+       if (!(cond)) { \
                pr_debug("FAILED %s:%d %s\n", __FILE__, __LINE__, text); \
                return -1; \
        } \
@@ -759,6 +759,103 @@ static int test__checkevent_breakpoint_w(struct perf_evlist *evlist)
        return 0;
 }
 
+static int test__checkevent_tracepoint_modifier(struct perf_evlist *evlist)
+{
+       struct perf_evsel *evsel = list_entry(evlist->entries.next,
+                                             struct perf_evsel, node);
+
+       TEST_ASSERT_VAL("wrong exclude_user", evsel->attr.exclude_user);
+       TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->attr.exclude_kernel);
+       TEST_ASSERT_VAL("wrong exclude_hv", evsel->attr.exclude_hv);
+       TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip);
+
+       return test__checkevent_tracepoint(evlist);
+}
+
+static int
+test__checkevent_tracepoint_multi_modifier(struct perf_evlist *evlist)
+{
+       struct perf_evsel *evsel;
+
+       TEST_ASSERT_VAL("wrong number of entries", evlist->nr_entries > 1);
+
+       list_for_each_entry(evsel, &evlist->entries, node) {
+               TEST_ASSERT_VAL("wrong exclude_user",
+                               !evsel->attr.exclude_user);
+               TEST_ASSERT_VAL("wrong exclude_kernel",
+                               evsel->attr.exclude_kernel);
+               TEST_ASSERT_VAL("wrong exclude_hv", evsel->attr.exclude_hv);
+               TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip);
+       }
+
+       return test__checkevent_tracepoint_multi(evlist);
+}
+
+static int test__checkevent_raw_modifier(struct perf_evlist *evlist)
+{
+       struct perf_evsel *evsel = list_entry(evlist->entries.next,
+                                             struct perf_evsel, node);
+
+       TEST_ASSERT_VAL("wrong exclude_user", evsel->attr.exclude_user);
+       TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->attr.exclude_kernel);
+       TEST_ASSERT_VAL("wrong exclude_hv", evsel->attr.exclude_hv);
+       TEST_ASSERT_VAL("wrong precise_ip", evsel->attr.precise_ip);
+
+       return test__checkevent_raw(evlist);
+}
+
+static int test__checkevent_numeric_modifier(struct perf_evlist *evlist)
+{
+       struct perf_evsel *evsel = list_entry(evlist->entries.next,
+                                             struct perf_evsel, node);
+
+       TEST_ASSERT_VAL("wrong exclude_user", evsel->attr.exclude_user);
+       TEST_ASSERT_VAL("wrong exclude_kernel", evsel->attr.exclude_kernel);
+       TEST_ASSERT_VAL("wrong exclude_hv", !evsel->attr.exclude_hv);
+       TEST_ASSERT_VAL("wrong precise_ip", evsel->attr.precise_ip);
+
+       return test__checkevent_numeric(evlist);
+}
+
+static int test__checkevent_symbolic_name_modifier(struct perf_evlist *evlist)
+{
+       struct perf_evsel *evsel = list_entry(evlist->entries.next,
+                                             struct perf_evsel, node);
+
+       TEST_ASSERT_VAL("wrong exclude_user", evsel->attr.exclude_user);
+       TEST_ASSERT_VAL("wrong exclude_kernel", evsel->attr.exclude_kernel);
+       TEST_ASSERT_VAL("wrong exclude_hv", !evsel->attr.exclude_hv);
+       TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip);
+
+       return test__checkevent_symbolic_name(evlist);
+}
+
+static int test__checkevent_symbolic_alias_modifier(struct perf_evlist *evlist)
+{
+       struct perf_evsel *evsel = list_entry(evlist->entries.next,
+                                             struct perf_evsel, node);
+
+       TEST_ASSERT_VAL("wrong exclude_user", !evsel->attr.exclude_user);
+       TEST_ASSERT_VAL("wrong exclude_kernel", evsel->attr.exclude_kernel);
+       TEST_ASSERT_VAL("wrong exclude_hv", evsel->attr.exclude_hv);
+       TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip);
+
+       return test__checkevent_symbolic_alias(evlist);
+}
+
+static int test__checkevent_genhw_modifier(struct perf_evlist *evlist)
+{
+       struct perf_evsel *evsel = list_entry(evlist->entries.next,
+                                             struct perf_evsel, node);
+
+       TEST_ASSERT_VAL("wrong exclude_user", evsel->attr.exclude_user);
+       TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->attr.exclude_kernel);
+       TEST_ASSERT_VAL("wrong exclude_hv", evsel->attr.exclude_hv);
+       TEST_ASSERT_VAL("wrong precise_ip", evsel->attr.precise_ip);
+
+       return test__checkevent_genhw(evlist);
+}
+
 static struct test__event_st {
        const char *name;
        __u32 type;
@@ -808,6 +905,34 @@ static struct test__event_st {
                .name  = "mem:0:w",
                .check = test__checkevent_breakpoint_w,
        },
+       {
+               .name  = "syscalls:sys_enter_open:k",
+               .check = test__checkevent_tracepoint_modifier,
+       },
+       {
+               .name  = "syscalls:*:u",
+               .check = test__checkevent_tracepoint_multi_modifier,
+       },
+       {
+               .name  = "r1:kp",
+               .check = test__checkevent_raw_modifier,
+       },
+       {
+               .name  = "1:1:hp",
+               .check = test__checkevent_numeric_modifier,
+       },
+       {
+               .name  = "instructions:h",
+               .check = test__checkevent_symbolic_name_modifier,
+       },
+       {
+               .name  = "faults:u",
+               .check = test__checkevent_symbolic_alias_modifier,
+       },
+       {
+               .name  = "L1-dcache-load-miss:kp",
+               .check = test__checkevent_genhw_modifier,
+       },
 };
 
 #define TEST__EVENTS_CNT (sizeof(test__events) / sizeof(struct test__event_st))
@@ -841,6 +966,336 @@ static int test__parse_events(void)
 
        return ret;
 }
+
+static int sched__get_first_possible_cpu(pid_t pid, cpu_set_t **maskp,
+                                        size_t *sizep)
+{
+       cpu_set_t *mask;
+       size_t size;
+       int i, cpu = -1, nrcpus = 1024;
+realloc:
+       mask = CPU_ALLOC(nrcpus);
+       size = CPU_ALLOC_SIZE(nrcpus);
+       CPU_ZERO_S(size, mask);
+
+       if (sched_getaffinity(pid, size, mask) == -1) {
+               CPU_FREE(mask);
+               if (errno == EINVAL && nrcpus < (1024 << 8)) {
+                       nrcpus = nrcpus << 2;
+                       goto realloc;
+               }
+               perror("sched_getaffinity");
+                       return -1;
+       }
+
+       for (i = 0; i < nrcpus; i++) {
+               if (CPU_ISSET_S(i, size, mask)) {
+                       if (cpu == -1) {
+                               cpu = i;
+                               *maskp = mask;
+                               *sizep = size;
+                       } else
+                               CPU_CLR_S(i, size, mask);
+               }
+       }
+
+       if (cpu == -1)
+               CPU_FREE(mask);
+
+       return cpu;
+}
+
+static int test__PERF_RECORD(void)
+{
+       struct perf_record_opts opts = {
+               .target_pid = -1,
+               .target_tid = -1,
+               .no_delay   = true,
+               .freq       = 10,
+               .mmap_pages = 256,
+               .sample_id_all_avail = true,
+       };
+       cpu_set_t *cpu_mask = NULL;
+       size_t cpu_mask_size = 0;
+       struct perf_evlist *evlist = perf_evlist__new(NULL, NULL);
+       struct perf_evsel *evsel;
+       struct perf_sample sample;
+       const char *cmd = "sleep";
+       const char *argv[] = { cmd, "1", NULL, };
+       char *bname;
+       u64 sample_type, prev_time = 0;
+       bool found_cmd_mmap = false,
+            found_libc_mmap = false,
+            found_vdso_mmap = false,
+            found_ld_mmap = false;
+       int err = -1, errs = 0, i, wakeups = 0, sample_size;
+       u32 cpu;
+       int total_events = 0, nr_events[PERF_RECORD_MAX] = { 0, };
+
+       if (evlist == NULL || argv == NULL) {
+               pr_debug("Not enough memory to create evlist\n");
+               goto out;
+       }
+
+       /*
+        * We need at least one evsel in the evlist, use the default
+        * one: "cycles".
+        */
+       err = perf_evlist__add_default(evlist);
+       if (err < 0) {
+               pr_debug("Not enough memory to create evsel\n");
+               goto out_delete_evlist;
+       }
+
+       /*
+        * Create maps of threads and cpus to monitor. In this case
+        * we start with all threads and cpus (-1, -1) but then in
+        * perf_evlist__prepare_workload we'll fill in the only thread
+        * we're monitoring, the one forked there.
+        */
+       err = perf_evlist__create_maps(evlist, opts.target_pid,
+                                      opts.target_tid, opts.cpu_list);
+       if (err < 0) {
+               pr_debug("Not enough memory to create thread/cpu maps\n");
+               goto out_delete_evlist;
+       }
+
+       /*
+        * Prepare the workload in argv[] to run, it'll fork it, and then wait
+        * for perf_evlist__start_workload() to exec it. This is done this way
+        * so that we have time to open the evlist (calling sys_perf_event_open
+        * on all the fds) and then mmap them.
+        */
+       err = perf_evlist__prepare_workload(evlist, &opts, argv);
+       if (err < 0) {
+               pr_debug("Couldn't run the workload!\n");
+               goto out_delete_evlist;
+       }
+
+       /*
+        * Config the evsels, setting attr->comm on the first one, etc.
+        */
+       evsel = list_entry(evlist->entries.next, struct perf_evsel, node);
+       evsel->attr.sample_type |= PERF_SAMPLE_CPU;
+       evsel->attr.sample_type |= PERF_SAMPLE_TID;
+       evsel->attr.sample_type |= PERF_SAMPLE_TIME;
+       perf_evlist__config_attrs(evlist, &opts);
+
+       err = sched__get_first_possible_cpu(evlist->workload.pid, &cpu_mask,
+                                           &cpu_mask_size);
+       if (err < 0) {
+               pr_debug("sched__get_first_possible_cpu: %s\n", strerror(errno));
+               goto out_delete_evlist;
+       }
+
+       cpu = err;
+
+       /*
+        * So that we can check perf_sample.cpu on all the samples.
+        */
+       if (sched_setaffinity(evlist->workload.pid, cpu_mask_size, cpu_mask) < 0) {
+               pr_debug("sched_setaffinity: %s\n", strerror(errno));
+               goto out_free_cpu_mask;
+       }
+
+       /*
+        * Call sys_perf_event_open on all the fds on all the evsels,
+        * grouping them if asked to.
+        */
+       err = perf_evlist__open(evlist, opts.group);
+       if (err < 0) {
+               pr_debug("perf_evlist__open: %s\n", strerror(errno));
+               goto out_delete_evlist;
+       }
+
+       /*
+        * mmap the first fd on a given CPU and ask for events for the other
+        * fds in the same CPU to be injected in the same mmap ring buffer
+        * (using ioctl(PERF_EVENT_IOC_SET_OUTPUT)).
+        */
+       err = perf_evlist__mmap(evlist, opts.mmap_pages, false);
+       if (err < 0) {
+               pr_debug("perf_evlist__mmap: %s\n", strerror(errno));
+               goto out_delete_evlist;
+       }
+
+       /*
+        * We'll need these two to parse the PERF_SAMPLE_* fields in each
+        * event.
+        */
+       sample_type = perf_evlist__sample_type(evlist);
+       sample_size = __perf_evsel__sample_size(sample_type);
+
+       /*
+        * Now that all is properly set up, enable the events, they will
+        * count just on workload.pid, which will start...
+        */
+       perf_evlist__enable(evlist);
+
+       /*
+        * Now!
+        */
+       perf_evlist__start_workload(evlist);
+
+       while (1) {
+               int before = total_events;
+
+               for (i = 0; i < evlist->nr_mmaps; i++) {
+                       union perf_event *event;
+
+                       while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
+                               const u32 type = event->header.type;
+                               const char *name = perf_event__name(type);
+
+                               ++total_events;
+                               if (type < PERF_RECORD_MAX)
+                                       nr_events[type]++;
+
+                               err = perf_event__parse_sample(event, sample_type,
+                                                              sample_size, true,
+                                                              &sample, false);
+                               if (err < 0) {
+                                       if (verbose)
+                                               perf_event__fprintf(event, stderr);
+                                       pr_debug("Couldn't parse sample\n");
+                                       goto out_err;
+                               }
+
+                               if (verbose) {
+                                       pr_info("%" PRIu64" %d ", sample.time, sample.cpu);
+                                       perf_event__fprintf(event, stderr);
+                               }
+
+                               if (prev_time > sample.time) {
+                                       pr_debug("%s going backwards in time, prev=%" PRIu64 ", curr=%" PRIu64 "\n",
+                                                name, prev_time, sample.time);
+                                       ++errs;
+                               }
+
+                               prev_time = sample.time;
+
+                               if (sample.cpu != cpu) {
+                                       pr_debug("%s with unexpected cpu, expected %d, got %d\n",
+                                                name, cpu, sample.cpu);
+                                       ++errs;
+                               }
+
+                               if ((pid_t)sample.pid != evlist->workload.pid) {
+                                       pr_debug("%s with unexpected pid, expected %d, got %d\n",
+                                                name, evlist->workload.pid, sample.pid);
+                                       ++errs;
+                               }
+
+                               if ((pid_t)sample.tid != evlist->workload.pid) {
+                                       pr_debug("%s with unexpected tid, expected %d, got %d\n",
+                                                name, evlist->workload.pid, sample.tid);
+                                       ++errs;
+                               }
+
+                               if ((type == PERF_RECORD_COMM ||
+                                    type == PERF_RECORD_MMAP ||
+                                    type == PERF_RECORD_FORK ||
+                                    type == PERF_RECORD_EXIT) &&
+                                    (pid_t)event->comm.pid != evlist->workload.pid) {
+                                       pr_debug("%s with unexpected pid/tid\n", name);
+                                       ++errs;
+                               }
+
+                               if ((type == PERF_RECORD_COMM ||
+                                    type == PERF_RECORD_MMAP) &&
+                                    event->comm.pid != event->comm.tid) {
+                                       pr_debug("%s with different pid/tid!\n", name);
+                                       ++errs;
+                               }
+
+                               switch (type) {
+                               case PERF_RECORD_COMM:
+                                       if (strcmp(event->comm.comm, cmd)) {
+                                               pr_debug("%s with unexpected comm!\n", name);
+                                               ++errs;
+                                       }
+                                       break;
+                               case PERF_RECORD_EXIT:
+                                       goto found_exit;
+                               case PERF_RECORD_MMAP:
+                                       bname = strrchr(event->mmap.filename, '/');
+                                       if (bname != NULL) {
+                                               if (!found_cmd_mmap)
+                                                       found_cmd_mmap = !strcmp(bname + 1, cmd);
+                                               if (!found_libc_mmap)
+                                                       found_libc_mmap = !strncmp(bname + 1, "libc", 4);
+                                               if (!found_ld_mmap)
+                                                       found_ld_mmap = !strncmp(bname + 1, "ld", 2);
+                                       } else if (!found_vdso_mmap)
+                                               found_vdso_mmap = !strcmp(event->mmap.filename, "[vdso]");
+                                       break;
+
+                               case PERF_RECORD_SAMPLE:
+                                       /* Just ignore samples for now */
+                                       break;
+                               default:
+                                       pr_debug("Unexpected perf_event->header.type %d!\n",
+                                                type);
+                                       ++errs;
+                               }
+                       }
+               }
+
+               /*
+                * We don't use poll here because at least at 3.1 times the
+                * PERF_RECORD_{!SAMPLE} events don't honour
+                * perf_event_attr.wakeup_events, just PERF_EVENT_SAMPLE does.
+                */
+               if (total_events == before && false)
+                       poll(evlist->pollfd, evlist->nr_fds, -1);
+
+               sleep(1);
+               if (++wakeups > 5) {
+                       pr_debug("No PERF_RECORD_EXIT event!\n");
+                       break;
+               }
+       }
+
+found_exit:
+       if (nr_events[PERF_RECORD_COMM] > 1) {
+               pr_debug("Excessive number of PERF_RECORD_COMM events!\n");
+               ++errs;
+       }
+
+       if (nr_events[PERF_RECORD_COMM] == 0) {
+               pr_debug("Missing PERF_RECORD_COMM for %s!\n", cmd);
+               ++errs;
+       }
+
+       if (!found_cmd_mmap) {
+               pr_debug("PERF_RECORD_MMAP for %s missing!\n", cmd);
+               ++errs;
+       }
+
+       if (!found_libc_mmap) {
+               pr_debug("PERF_RECORD_MMAP for %s missing!\n", "libc");
+               ++errs;
+       }
+
+       if (!found_ld_mmap) {
+               pr_debug("PERF_RECORD_MMAP for %s missing!\n", "ld");
+               ++errs;
+       }
+
+       if (!found_vdso_mmap) {
+               pr_debug("PERF_RECORD_MMAP for %s missing!\n", "[vdso]");
+               ++errs;
+       }
+out_err:
+       perf_evlist__munmap(evlist);
+out_free_cpu_mask:
+       CPU_FREE(cpu_mask);
+out_delete_evlist:
+       perf_evlist__delete(evlist);
+out:
+       return (err < 0 || errs > 0) ? -1 : 0;
+}
+
 static struct test {
        const char *desc;
        int (*func)(void);
@@ -865,46 +1320,90 @@ static struct test {
                .desc = "parse events tests",
                .func = test__parse_events,
        },
+       {
+               .desc = "Validate PERF_RECORD_* events & perf_sample fields",
+               .func = test__PERF_RECORD,
+       },
        {
                .func = NULL,
        },
 };
 
-static int __cmd_test(void)
+static bool perf_test__matches(int curr, int argc, const char *argv[])
 {
-       int i = 0;
+       int i;
+
+       if (argc == 0)
+               return true;
 
-       page_size = sysconf(_SC_PAGE_SIZE);
+       for (i = 0; i < argc; ++i) {
+               char *end;
+               long nr = strtoul(argv[i], &end, 10);
+
+               if (*end == '\0') {
+                       if (nr == curr + 1)
+                               return true;
+                       continue;
+               }
+
+               if (strstr(tests[curr].desc, argv[i]))
+                       return true;
+       }
+
+       return false;
+}
+
+static int __cmd_test(int argc, const char *argv[])
+{
+       int i = 0;
 
        while (tests[i].func) {
-               int err;
-               pr_info("%2d: %s:", i + 1, tests[i].desc);
+               int curr = i++, err;
+
+               if (!perf_test__matches(curr, argc, argv))
+                       continue;
+
+               pr_info("%2d: %s:", i, tests[curr].desc);
                pr_debug("\n--- start ---\n");
-               err = tests[i].func();
-               pr_debug("---- end ----\n%s:", tests[i].desc);
+               err = tests[curr].func();
+               pr_debug("---- end ----\n%s:", tests[curr].desc);
                pr_info(" %s\n", err ? "FAILED!\n" : "Ok");
-               ++i;
        }
 
        return 0;
 }
 
-static const char * const test_usage[] = {
-       "perf test [<options>]",
-       NULL,
-};
+static int perf_test__list(int argc, const char **argv)
+{
+       int i = 0;
+
+       while (tests[i].func) {
+               int curr = i++;
 
-static const struct option test_options[] = {
+               if (argc > 1 && !strstr(tests[curr].desc, argv[1]))
+                       continue;
+
+               pr_info("%2d: %s\n", i, tests[curr].desc);
+       }
+
+       return 0;
+}
+
+int cmd_test(int argc, const char **argv, const char *prefix __used)
+{
+       const char * const test_usage[] = {
+       "perf test [<options>] [{list <test-name-fragment>|[<test-name-fragments>|<test-numbers>]}]",
+       NULL,
+       };
+       const struct option test_options[] = {
        OPT_INTEGER('v', "verbose", &verbose,
                    "be more verbose (show symbol address, etc)"),
        OPT_END()
-};
+       };
 
-int cmd_test(int argc, const char **argv, const char *prefix __used)
-{
        argc = parse_options(argc, argv, test_options, test_usage, 0);
-       if (argc)
-               usage_with_options(test_usage, test_options);
+       if (argc >= 1 && !strcmp(argv[0], "list"))
+               return perf_test__list(argc, argv);
 
        symbol_conf.priv_size = sizeof(int);
        symbol_conf.sort_by_name = true;
@@ -915,5 +1414,5 @@ int cmd_test(int argc, const char **argv, const char *prefix __used)
 
        setup_pager();
 
-       return __cmd_test();
+       return __cmd_test(argc, argv);
 }
index aa26f4d..3b75b2e 100644 (file)
@@ -19,6 +19,7 @@
 #include "util/color.h"
 #include <linux/list.h>
 #include "util/cache.h"
+#include "util/evsel.h"
 #include <linux/rbtree.h>
 #include "util/symbol.h"
 #include "util/callchain.h"
 #include "util/event.h"
 #include "util/session.h"
 #include "util/svghelper.h"
+#include "util/tool.h"
 
 #define SUPPORT_OLD_POWER_EVENTS 1
 #define PWR_EVENT_EXIT -1
 
 
-static char            const *input_name = "perf.data";
-static char            const *output_name = "output.svg";
+static const char      *input_name;
+static const char      *output_name = "output.svg";
 
 static unsigned int    numcpus;
 static u64             min_freq;       /* Lowest CPU frequency seen */
@@ -273,25 +275,28 @@ static int cpus_cstate_state[MAX_CPUS];
 static u64 cpus_pstate_start_times[MAX_CPUS];
 static u64 cpus_pstate_state[MAX_CPUS];
 
-static int process_comm_event(union perf_event *event,
+static int process_comm_event(struct perf_tool *tool __used,
+                             union perf_event *event,
                              struct perf_sample *sample __used,
-                             struct perf_session *session __used)
+                             struct machine *machine __used)
 {
        pid_set_comm(event->comm.tid, event->comm.comm);
        return 0;
 }
 
-static int process_fork_event(union perf_event *event,
+static int process_fork_event(struct perf_tool *tool __used,
+                             union perf_event *event,
                              struct perf_sample *sample __used,
-                             struct perf_session *session __used)
+                             struct machine *machine __used)
 {
        pid_fork(event->fork.pid, event->fork.ppid, event->fork.time);
        return 0;
 }
 
-static int process_exit_event(union perf_event *event,
+static int process_exit_event(struct perf_tool *tool __used,
+                             union perf_event *event,
                              struct perf_sample *sample __used,
-                             struct perf_session *session __used)
+                             struct machine *machine __used)
 {
        pid_exit(event->fork.pid, event->fork.time);
        return 0;
@@ -486,14 +491,15 @@ static void sched_switch(int cpu, u64 timestamp, struct trace_entry *te)
 }
 
 
-static int process_sample_event(union perf_event *event __used,
+static int process_sample_event(struct perf_tool *tool __used,
+                               union perf_event *event __used,
                                struct perf_sample *sample,
-                               struct perf_evsel *evsel __used,
-                               struct perf_session *session)
+                               struct perf_evsel *evsel,
+                               struct machine *machine __used)
 {
        struct trace_entry *te;
 
-       if (session->sample_type & PERF_SAMPLE_TIME) {
+       if (evsel->attr.sample_type & PERF_SAMPLE_TIME) {
                if (!first_time || first_time > sample->time)
                        first_time = sample->time;
                if (last_time < sample->time)
@@ -501,7 +507,7 @@ static int process_sample_event(union perf_event *event __used,
        }
 
        te = (void *)sample->raw_data;
-       if (session->sample_type & PERF_SAMPLE_RAW && sample->raw_size > 0) {
+       if ((evsel->attr.sample_type & PERF_SAMPLE_RAW) && sample->raw_size > 0) {
                char *event_str;
 #ifdef SUPPORT_OLD_POWER_EVENTS
                struct power_entry_old *peo;
@@ -974,7 +980,7 @@ static void write_svg_file(const char *filename)
        svg_close();
 }
 
-static struct perf_event_ops event_ops = {
+static struct perf_tool perf_timechart = {
        .comm                   = process_comm_event,
        .fork                   = process_fork_event,
        .exit                   = process_exit_event,
@@ -985,7 +991,7 @@ static struct perf_event_ops event_ops = {
 static int __cmd_timechart(void)
 {
        struct perf_session *session = perf_session__new(input_name, O_RDONLY,
-                                                        0, false, &event_ops);
+                                                        0, false, &perf_timechart);
        int ret = -EINVAL;
 
        if (session == NULL)
@@ -994,7 +1000,7 @@ static int __cmd_timechart(void)
        if (!perf_session__has_traces(session, "timechart record"))
                goto out_delete;
 
-       ret = perf_session__process_events(session, &event_ops);
+       ret = perf_session__process_events(session, &perf_timechart);
        if (ret)
                goto out_delete;
 
index c9cdedb..4f81eeb 100644
 #include <linux/unistd.h>
 #include <linux/types.h>
 
-static struct perf_top top = {
-       .count_filter           = 5,
-       .delay_secs             = 2,
-       .target_pid             = -1,
-       .target_tid             = -1,
-       .freq                   = 1000, /* 1 KHz */
-};
-
-static bool                    system_wide                     =  false;
-
-static bool                    use_tui, use_stdio;
-
-static bool                    sort_has_symbols;
-
-static bool                    dont_use_callchains;
-static char                    callchain_default_opt[]         = "fractal,0.5,callee";
-
-
-static int                     default_interval                =      0;
-
-static bool                    kptr_restrict_warned;
-static bool                    vmlinux_warned;
-static bool                    inherit                         =  false;
-static int                     realtime_prio                   =      0;
-static bool                    group                           =  false;
-static bool                    sample_id_all_avail             =   true;
-static unsigned int            mmap_pages                      =    128;
-
-static bool                    dump_symtab                     =  false;
-
-static struct winsize          winsize;
-
-static const char              *sym_filter                     =   NULL;
-static int                     sym_pcnt_filter                 =      5;
-
-/*
- * Source functions
- */
 
 void get_term_dimensions(struct winsize *ws)
 {
@@ -125,21 +87,23 @@ void get_term_dimensions(struct winsize *ws)
        ws->ws_col = 80;
 }
 
-static void update_print_entries(struct winsize *ws)
+static void perf_top__update_print_entries(struct perf_top *top)
 {
-       top.print_entries = ws->ws_row;
+       top->print_entries = top->winsize.ws_row;
 
-       if (top.print_entries > 9)
-               top.print_entries -= 9;
+       if (top->print_entries > 9)
+               top->print_entries -= 9;
 }
 
-static void sig_winch_handler(int sig __used)
+static void perf_top__sig_winch(int sig __used, siginfo_t *info __used, void *arg)
 {
-       get_term_dimensions(&winsize);
-       update_print_entries(&winsize);
+       struct perf_top *top = arg;
+
+       get_term_dimensions(&top->winsize);
+       perf_top__update_print_entries(top);
 }
 
-static int parse_source(struct hist_entry *he)
+static int perf_top__parse_source(struct perf_top *top, struct hist_entry *he)
 {
        struct symbol *sym;
        struct annotation *notes;
@@ -170,7 +134,7 @@ static int parse_source(struct hist_entry *he)
 
        pthread_mutex_lock(&notes->lock);
 
-       if (symbol__alloc_hist(sym, top.evlist->nr_entries) < 0) {
+       if (symbol__alloc_hist(sym) < 0) {
                pthread_mutex_unlock(&notes->lock);
                pr_err("Not enough memory for annotating '%s' symbol!\n",
                       sym->name);
@@ -181,7 +145,7 @@ static int parse_source(struct hist_entry *he)
        err = symbol__annotate(sym, map, 0);
        if (err == 0) {
 out_assign:
-               top.sym_filter_entry = he;
+               top->sym_filter_entry = he;
        }
 
        pthread_mutex_unlock(&notes->lock);
@@ -194,14 +158,16 @@ static void __zero_source_counters(struct hist_entry *he)
        symbol__annotate_zero_histograms(sym);
 }
 
-static void record_precise_ip(struct hist_entry *he, int counter, u64 ip)
+static void perf_top__record_precise_ip(struct perf_top *top,
+                                       struct hist_entry *he,
+                                       int counter, u64 ip)
 {
        struct annotation *notes;
        struct symbol *sym;
 
        if (he == NULL || he->ms.sym == NULL ||
-           ((top.sym_filter_entry == NULL ||
-             top.sym_filter_entry->ms.sym != he->ms.sym) && use_browser != 1))
+           ((top->sym_filter_entry == NULL ||
+             top->sym_filter_entry->ms.sym != he->ms.sym) && use_browser != 1))
                return;
 
        sym = he->ms.sym;
@@ -210,8 +176,7 @@ static void record_precise_ip(struct hist_entry *he, int counter, u64 ip)
        if (pthread_mutex_trylock(&notes->lock))
                return;
 
-       if (notes->src == NULL &&
-           symbol__alloc_hist(sym, top.evlist->nr_entries) < 0) {
+       if (notes->src == NULL && symbol__alloc_hist(sym) < 0) {
                pthread_mutex_unlock(&notes->lock);
                pr_err("Not enough memory for annotating '%s' symbol!\n",
                       sym->name);
@@ -225,8 +190,9 @@ static void record_precise_ip(struct hist_entry *he, int counter, u64 ip)
        pthread_mutex_unlock(&notes->lock);
 }
 
-static void show_details(struct hist_entry *he)
+static void perf_top__show_details(struct perf_top *top)
 {
+       struct hist_entry *he = top->sym_filter_entry;
        struct annotation *notes;
        struct symbol *symbol;
        int more;
@@ -242,15 +208,15 @@ static void show_details(struct hist_entry *he)
        if (notes->src == NULL)
                goto out_unlock;
 
-       printf("Showing %s for %s\n", event_name(top.sym_evsel), symbol->name);
-       printf("  Events  Pcnt (>=%d%%)\n", sym_pcnt_filter);
+       printf("Showing %s for %s\n", event_name(top->sym_evsel), symbol->name);
+       printf("  Events  Pcnt (>=%d%%)\n", top->sym_pcnt_filter);
 
-       more = symbol__annotate_printf(symbol, he->ms.map, top.sym_evsel->idx,
-                                      0, sym_pcnt_filter, top.print_entries, 4);
-       if (top.zero)
-               symbol__annotate_zero_histogram(symbol, top.sym_evsel->idx);
+       more = symbol__annotate_printf(symbol, he->ms.map, top->sym_evsel->idx,
+                                      0, top->sym_pcnt_filter, top->print_entries, 4);
+       if (top->zero)
+               symbol__annotate_zero_histogram(symbol, top->sym_evsel->idx);
        else
-               symbol__annotate_decay_histogram(symbol, top.sym_evsel->idx);
+               symbol__annotate_decay_histogram(symbol, top->sym_evsel->idx);
        if (more != 0)
                printf("%d lines not displayed, maybe increase display entries [e]\n", more);
 out_unlock:
@@ -259,11 +225,9 @@ out_unlock:
 
 static const char              CONSOLE_CLEAR[] = "\e[H\e[2J";
 
-static struct hist_entry *
-       perf_session__add_hist_entry(struct perf_session *session,
-                                    struct addr_location *al,
-                                    struct perf_sample *sample,
-                                    struct perf_evsel *evsel)
+static struct hist_entry *perf_evsel__add_hist_entry(struct perf_evsel *evsel,
+                                                    struct addr_location *al,
+                                                    struct perf_sample *sample)
 {
        struct hist_entry *he;
 
@@ -271,50 +235,51 @@ static struct hist_entry *
        if (he == NULL)
                return NULL;
 
-       session->hists.stats.total_period += sample->period;
+       evsel->hists.stats.total_period += sample->period;
        hists__inc_nr_events(&evsel->hists, PERF_RECORD_SAMPLE);
        return he;
 }
 
-static void print_sym_table(void)
+static void perf_top__print_sym_table(struct perf_top *top)
 {
        char bf[160];
        int printed = 0;
-       const int win_width = winsize.ws_col - 1;
+       const int win_width = top->winsize.ws_col - 1;
 
        puts(CONSOLE_CLEAR);
 
-       perf_top__header_snprintf(&top, bf, sizeof(bf));
+       perf_top__header_snprintf(top, bf, sizeof(bf));
        printf("%s\n", bf);
 
-       perf_top__reset_sample_counters(&top);
+       perf_top__reset_sample_counters(top);
 
        printf("%-*.*s\n", win_width, win_width, graph_dotted_line);
 
-       if (top.sym_evsel->hists.stats.nr_lost_warned !=
-           top.sym_evsel->hists.stats.nr_events[PERF_RECORD_LOST]) {
-               top.sym_evsel->hists.stats.nr_lost_warned =
-                       top.sym_evsel->hists.stats.nr_events[PERF_RECORD_LOST];
+       if (top->sym_evsel->hists.stats.nr_lost_warned !=
+           top->sym_evsel->hists.stats.nr_events[PERF_RECORD_LOST]) {
+               top->sym_evsel->hists.stats.nr_lost_warned =
+                       top->sym_evsel->hists.stats.nr_events[PERF_RECORD_LOST];
                color_fprintf(stdout, PERF_COLOR_RED,
                              "WARNING: LOST %d chunks, Check IO/CPU overload",
-                             top.sym_evsel->hists.stats.nr_lost_warned);
+                             top->sym_evsel->hists.stats.nr_lost_warned);
                ++printed;
        }
 
-       if (top.sym_filter_entry) {
-               show_details(top.sym_filter_entry);
+       if (top->sym_filter_entry) {
+               perf_top__show_details(top);
                return;
        }
 
-       hists__collapse_resort_threaded(&top.sym_evsel->hists);
-       hists__output_resort_threaded(&top.sym_evsel->hists);
-       hists__decay_entries_threaded(&top.sym_evsel->hists,
-                                     top.hide_user_symbols,
-                                     top.hide_kernel_symbols);
-       hists__output_recalc_col_len(&top.sym_evsel->hists, winsize.ws_row - 3);
+       hists__collapse_resort_threaded(&top->sym_evsel->hists);
+       hists__output_resort_threaded(&top->sym_evsel->hists);
+       hists__decay_entries_threaded(&top->sym_evsel->hists,
+                                     top->hide_user_symbols,
+                                     top->hide_kernel_symbols);
+       hists__output_recalc_col_len(&top->sym_evsel->hists,
+                                    top->winsize.ws_row - 3);
        putchar('\n');
-       hists__fprintf(&top.sym_evsel->hists, NULL, false, false,
-                      winsize.ws_row - 4 - printed, win_width, stdout);
+       hists__fprintf(&top->sym_evsel->hists, NULL, false, false,
+                      top->winsize.ws_row - 4 - printed, win_width, stdout);
 }
 
 static void prompt_integer(int *target, const char *msg)
@@ -352,17 +317,17 @@ static void prompt_percent(int *target, const char *msg)
                *target = tmp;
 }
 
-static void prompt_symbol(struct hist_entry **target, const char *msg)
+static void perf_top__prompt_symbol(struct perf_top *top, const char *msg)
 {
        char *buf = malloc(0), *p;
-       struct hist_entry *syme = *target, *n, *found = NULL;
+       struct hist_entry *syme = top->sym_filter_entry, *n, *found = NULL;
        struct rb_node *next;
        size_t dummy = 0;
 
        /* zero counters of active symbol */
        if (syme) {
                __zero_source_counters(syme);
-               *target = NULL;
+               top->sym_filter_entry = NULL;
        }
 
        fprintf(stdout, "\n%s: ", msg);
@@ -373,7 +338,7 @@ static void prompt_symbol(struct hist_entry **target, const char *msg)
        if (p)
                *p = 0;
 
-       next = rb_first(&top.sym_evsel->hists.entries);
+       next = rb_first(&top->sym_evsel->hists.entries);
        while (next) {
                n = rb_entry(next, struct hist_entry, rb_node);
                if (n->ms.sym && !strcmp(buf, n->ms.sym->name)) {
@@ -386,47 +351,46 @@ static void prompt_symbol(struct hist_entry **target, const char *msg)
        if (!found) {
                fprintf(stderr, "Sorry, %s is not active.\n", buf);
                sleep(1);
-               return;
        } else
-               parse_source(found);
+               perf_top__parse_source(top, found);
 
 out_free:
        free(buf);
 }
 
-static void print_mapped_keys(void)
+static void perf_top__print_mapped_keys(struct perf_top *top)
 {
        char *name = NULL;
 
-       if (top.sym_filter_entry) {
-               struct symbol *sym = top.sym_filter_entry->ms.sym;
+       if (top->sym_filter_entry) {
+               struct symbol *sym = top->sym_filter_entry->ms.sym;
                name = sym->name;
        }
 
        fprintf(stdout, "\nMapped keys:\n");
-       fprintf(stdout, "\t[d]     display refresh delay.             \t(%d)\n", top.delay_secs);
-       fprintf(stdout, "\t[e]     display entries (lines).           \t(%d)\n", top.print_entries);
+       fprintf(stdout, "\t[d]     display refresh delay.             \t(%d)\n", top->delay_secs);
+       fprintf(stdout, "\t[e]     display entries (lines).           \t(%d)\n", top->print_entries);
 
-       if (top.evlist->nr_entries > 1)
-               fprintf(stdout, "\t[E]     active event counter.              \t(%s)\n", event_name(top.sym_evsel));
+       if (top->evlist->nr_entries > 1)
+               fprintf(stdout, "\t[E]     active event counter.              \t(%s)\n", event_name(top->sym_evsel));
 
-       fprintf(stdout, "\t[f]     profile display filter (count).    \t(%d)\n", top.count_filter);
+       fprintf(stdout, "\t[f]     profile display filter (count).    \t(%d)\n", top->count_filter);
 
-       fprintf(stdout, "\t[F]     annotate display filter (percent). \t(%d%%)\n", sym_pcnt_filter);
+       fprintf(stdout, "\t[F]     annotate display filter (percent). \t(%d%%)\n", top->sym_pcnt_filter);
        fprintf(stdout, "\t[s]     annotate symbol.                   \t(%s)\n", name?: "NULL");
        fprintf(stdout, "\t[S]     stop annotation.\n");
 
        fprintf(stdout,
                "\t[K]     hide kernel_symbols symbols.     \t(%s)\n",
-               top.hide_kernel_symbols ? "yes" : "no");
+               top->hide_kernel_symbols ? "yes" : "no");
        fprintf(stdout,
                "\t[U]     hide user symbols.               \t(%s)\n",
-               top.hide_user_symbols ? "yes" : "no");
-       fprintf(stdout, "\t[z]     toggle sample zeroing.             \t(%d)\n", top.zero ? 1 : 0);
+               top->hide_user_symbols ? "yes" : "no");
+       fprintf(stdout, "\t[z]     toggle sample zeroing.             \t(%d)\n", top->zero ? 1 : 0);
        fprintf(stdout, "\t[qQ]    quit.\n");
 }
 
-static int key_mapped(int c)
+static int perf_top__key_mapped(struct perf_top *top, int c)
 {
        switch (c) {
                case 'd':
@@ -442,7 +406,7 @@ static int key_mapped(int c)
                case 'S':
                        return 1;
                case 'E':
-                       return top.evlist->nr_entries > 1 ? 1 : 0;
+                       return top->evlist->nr_entries > 1 ? 1 : 0;
                default:
                        break;
        }
@@ -450,13 +414,13 @@ static int key_mapped(int c)
        return 0;
 }
 
-static void handle_keypress(int c)
+static void perf_top__handle_keypress(struct perf_top *top, int c)
 {
-       if (!key_mapped(c)) {
+       if (!perf_top__key_mapped(top, c)) {
                struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
                struct termios tc, save;
 
-               print_mapped_keys();
+               perf_top__print_mapped_keys(top);
                fprintf(stdout, "\nEnter selection, or unmapped key to continue: ");
                fflush(stdout);
 
@@ -471,81 +435,86 @@ static void handle_keypress(int c)
                c = getc(stdin);
 
                tcsetattr(0, TCSAFLUSH, &save);
-               if (!key_mapped(c))
+               if (!perf_top__key_mapped(top, c))
                        return;
        }
 
        switch (c) {
                case 'd':
-                       prompt_integer(&top.delay_secs, "Enter display delay");
-                       if (top.delay_secs < 1)
-                               top.delay_secs = 1;
+                       prompt_integer(&top->delay_secs, "Enter display delay");
+                       if (top->delay_secs < 1)
+                               top->delay_secs = 1;
                        break;
                case 'e':
-                       prompt_integer(&top.print_entries, "Enter display entries (lines)");
-                       if (top.print_entries == 0) {
-                               sig_winch_handler(SIGWINCH);
-                               signal(SIGWINCH, sig_winch_handler);
+                       prompt_integer(&top->print_entries, "Enter display entries (lines)");
+                       if (top->print_entries == 0) {
+                               struct sigaction act = {
+                                       .sa_sigaction = perf_top__sig_winch,
+                                       .sa_flags     = SA_SIGINFO,
+                               };
+                               perf_top__sig_winch(SIGWINCH, NULL, top);
+                               sigaction(SIGWINCH, &act, NULL);
                        } else
                                signal(SIGWINCH, SIG_DFL);
                        break;
                case 'E':
-                       if (top.evlist->nr_entries > 1) {
+                       if (top->evlist->nr_entries > 1) {
                                /* Select 0 as the default event: */
                                int counter = 0;
 
                                fprintf(stderr, "\nAvailable events:");
 
-                               list_for_each_entry(top.sym_evsel, &top.evlist->entries, node)
-                                       fprintf(stderr, "\n\t%d %s", top.sym_evsel->idx, event_name(top.sym_evsel));
+                               list_for_each_entry(top->sym_evsel, &top->evlist->entries, node)
+                                       fprintf(stderr, "\n\t%d %s", top->sym_evsel->idx, event_name(top->sym_evsel));
 
                                prompt_integer(&counter, "Enter details event counter");
 
-                               if (counter >= top.evlist->nr_entries) {
-                                       top.sym_evsel = list_entry(top.evlist->entries.next, struct perf_evsel, node);
-                                       fprintf(stderr, "Sorry, no such event, using %s.\n", event_name(top.sym_evsel));
+                               if (counter >= top->evlist->nr_entries) {
+                                       top->sym_evsel = list_entry(top->evlist->entries.next, struct perf_evsel, node);
+                                       fprintf(stderr, "Sorry, no such event, using %s.\n", event_name(top->sym_evsel));
                                        sleep(1);
                                        break;
                                }
-                               list_for_each_entry(top.sym_evsel, &top.evlist->entries, node)
-                                       if (top.sym_evsel->idx == counter)
+                               list_for_each_entry(top->sym_evsel, &top->evlist->entries, node)
+                                       if (top->sym_evsel->idx == counter)
                                                break;
                        } else
-                               top.sym_evsel = list_entry(top.evlist->entries.next, struct perf_evsel, node);
+                               top->sym_evsel = list_entry(top->evlist->entries.next, struct perf_evsel, node);
                        break;
                case 'f':
-                       prompt_integer(&top.count_filter, "Enter display event count filter");
+                       prompt_integer(&top->count_filter, "Enter display event count filter");
                        break;
                case 'F':
-                       prompt_percent(&sym_pcnt_filter, "Enter details display event filter (percent)");
+                       prompt_percent(&top->sym_pcnt_filter,
+                                      "Enter details display event filter (percent)");
                        break;
                case 'K':
-                       top.hide_kernel_symbols = !top.hide_kernel_symbols;
+                       top->hide_kernel_symbols = !top->hide_kernel_symbols;
                        break;
                case 'q':
                case 'Q':
                        printf("exiting.\n");
-                       if (dump_symtab)
-                               perf_session__fprintf_dsos(top.session, stderr);
+                       if (top->dump_symtab)
+                               perf_session__fprintf_dsos(top->session, stderr);
                        exit(0);
                case 's':
-                       prompt_symbol(&top.sym_filter_entry, "Enter details symbol");
+                       perf_top__prompt_symbol(top, "Enter details symbol");
                        break;
                case 'S':
-                       if (!top.sym_filter_entry)
+                       if (!top->sym_filter_entry)
                                break;
                        else {
-                               struct hist_entry *syme = top.sym_filter_entry;
+                               struct hist_entry *syme = top->sym_filter_entry;
 
-                               top.sym_filter_entry = NULL;
+                               top->sym_filter_entry = NULL;
                                __zero_source_counters(syme);
                        }
                        break;
                case 'U':
-                       top.hide_user_symbols = !top.hide_user_symbols;
+                       top->hide_user_symbols = !top->hide_user_symbols;
                        break;
                case 'z':
-                       top.zero = !top.zero;
+                       top->zero = !top->zero;
                        break;
                default:
                        break;
@@ -563,28 +532,30 @@ static void perf_top__sort_new_samples(void *arg)
        hists__collapse_resort_threaded(&t->sym_evsel->hists);
        hists__output_resort_threaded(&t->sym_evsel->hists);
        hists__decay_entries_threaded(&t->sym_evsel->hists,
-                                     top.hide_user_symbols,
-                                     top.hide_kernel_symbols);
+                                     t->hide_user_symbols,
+                                     t->hide_kernel_symbols);
 }
 
-static void *display_thread_tui(void *arg __used)
+static void *display_thread_tui(void *arg)
 {
+       struct perf_top *top = arg;
        const char *help = "For a higher level overview, try: perf top --sort comm,dso";
 
-       perf_top__sort_new_samples(&top);
-       perf_evlist__tui_browse_hists(top.evlist, help,
+       perf_top__sort_new_samples(top);
+       perf_evlist__tui_browse_hists(top->evlist, help,
                                      perf_top__sort_new_samples,
-                                     &top, top.delay_secs);
+                                     top, top->delay_secs);
 
        exit_browser(0);
        exit(0);
        return NULL;
 }
 
-static void *display_thread(void *arg __used)
+static void *display_thread(void *arg)
 {
        struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
        struct termios tc, save;
+       struct perf_top *top = arg;
        int delay_msecs, c;
 
        tcgetattr(0, &save);
@@ -595,13 +566,13 @@ static void *display_thread(void *arg __used)
 
        pthread__unblock_sigwinch();
 repeat:
-       delay_msecs = top.delay_secs * 1000;
+       delay_msecs = top->delay_secs * 1000;
        tcsetattr(0, TCSANOW, &tc);
        /* trash return*/
        getc(stdin);
 
        while (1) {
-               print_sym_table();
+               perf_top__print_sym_table(top);
                /*
                 * Either timeout expired or we got an EINTR due to SIGWINCH,
                 * refresh screen in both cases.
@@ -621,7 +592,7 @@ process_hotkey:
        c = getc(stdin);
        tcsetattr(0, TCSAFLUSH, &save);
 
-       handle_keypress(c);
+       perf_top__handle_keypress(top, c);
        goto repeat;
 
        return NULL;
@@ -673,47 +644,17 @@ static int symbol_filter(struct map *map __used, struct symbol *sym)
        return 0;
 }
 
-static void perf_event__process_sample(const union perf_event *event,
+static void perf_event__process_sample(struct perf_tool *tool,
+                                      const union perf_event *event,
                                       struct perf_evsel *evsel,
                                       struct perf_sample *sample,
-                                      struct perf_session *session)
+                                      struct machine *machine)
 {
+       struct perf_top *top = container_of(tool, struct perf_top, tool);
        struct symbol *parent = NULL;
        u64 ip = event->ip.ip;
        struct addr_location al;
-       struct machine *machine;
        int err;
-       u8 origin = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
-
-       ++top.samples;
-
-       switch (origin) {
-       case PERF_RECORD_MISC_USER:
-               ++top.us_samples;
-               if (top.hide_user_symbols)
-                       return;
-               machine = perf_session__find_host_machine(session);
-               break;
-       case PERF_RECORD_MISC_KERNEL:
-               ++top.kernel_samples;
-               if (top.hide_kernel_symbols)
-                       return;
-               machine = perf_session__find_host_machine(session);
-               break;
-       case PERF_RECORD_MISC_GUEST_KERNEL:
-               ++top.guest_kernel_samples;
-               machine = perf_session__find_machine(session, event->ip.pid);
-               break;
-       case PERF_RECORD_MISC_GUEST_USER:
-               ++top.guest_us_samples;
-               /*
-                * TODO: we don't process guest user from host side
-                * except simple counting.
-                */
-               return;
-       default:
-               return;
-       }
 
        if (!machine && perf_guest) {
                pr_err("Can't find guest [%d]'s kernel information\n",
@@ -722,14 +663,14 @@ static void perf_event__process_sample(const union perf_event *event,
        }
 
        if (event->header.misc & PERF_RECORD_MISC_EXACT_IP)
-               top.exact_samples++;
+               top->exact_samples++;
 
-       if (perf_event__preprocess_sample(event, session, &al, sample,
+       if (perf_event__preprocess_sample(event, machine, &al, sample,
                                          symbol_filter) < 0 ||
            al.filtered)
                return;
 
-       if (!kptr_restrict_warned &&
+       if (!top->kptr_restrict_warned &&
            symbol_conf.kptr_restrict &&
            al.cpumode == PERF_RECORD_MISC_KERNEL) {
                ui__warning(
@@ -740,7 +681,7 @@ static void perf_event__process_sample(const union perf_event *event,
                          " modules" : "");
                if (use_browser <= 0)
                        sleep(5);
-               kptr_restrict_warned = true;
+               top->kptr_restrict_warned = true;
        }
 
        if (al.sym == NULL) {
@@ -756,7 +697,7 @@ static void perf_event__process_sample(const union perf_event *event,
                 * --hide-kernel-symbols, even if the user specifies an
                 * invalid --vmlinux ;-)
                 */
-               if (!kptr_restrict_warned && !vmlinux_warned &&
+               if (!top->kptr_restrict_warned && !top->vmlinux_warned &&
                    al.map == machine->vmlinux_maps[MAP__FUNCTION] &&
                    RB_EMPTY_ROOT(&al.map->dso->symbols[MAP__FUNCTION])) {
                        if (symbol_conf.vmlinux_name) {
@@ -769,7 +710,7 @@ static void perf_event__process_sample(const union perf_event *event,
 
                        if (use_browser <= 0)
                                sleep(5);
-                       vmlinux_warned = true;
+                       top->vmlinux_warned = true;
                }
        }
 
@@ -778,70 +719,109 @@ static void perf_event__process_sample(const union perf_event *event,
 
                if ((sort__has_parent || symbol_conf.use_callchain) &&
                    sample->callchain) {
-                       err = perf_session__resolve_callchain(session, al.thread,
-                                                             sample->callchain, &parent);
+                       err = machine__resolve_callchain(machine, evsel, al.thread,
+                                                        sample->callchain, &parent);
                        if (err)
                                return;
                }
 
-               he = perf_session__add_hist_entry(session, &al, sample, evsel);
+               he = perf_evsel__add_hist_entry(evsel, &al, sample);
                if (he == NULL) {
                        pr_err("Problem incrementing symbol period, skipping event\n");
                        return;
                }
 
                if (symbol_conf.use_callchain) {
-                       err = callchain_append(he->callchain, &session->callchain_cursor,
+                       err = callchain_append(he->callchain, &evsel->hists.callchain_cursor,
                                               sample->period);
                        if (err)
                                return;
                }
 
-               if (sort_has_symbols)
-                       record_precise_ip(he, evsel->idx, ip);
+               if (top->sort_has_symbols)
+                       perf_top__record_precise_ip(top, he, evsel->idx, ip);
        }
 
        return;
 }
 
-static void perf_session__mmap_read_idx(struct perf_session *self, int idx)
+static void perf_top__mmap_read_idx(struct perf_top *top, int idx)
 {
        struct perf_sample sample;
        struct perf_evsel *evsel;
+       struct perf_session *session = top->session;
        union perf_event *event;
+       struct machine *machine;
+       u8 origin;
        int ret;
 
-       while ((event = perf_evlist__mmap_read(top.evlist, idx)) != NULL) {
-               ret = perf_session__parse_sample(self, event, &sample);
+       while ((event = perf_evlist__mmap_read(top->evlist, idx)) != NULL) {
+               ret = perf_session__parse_sample(session, event, &sample);
                if (ret) {
                        pr_err("Can't parse sample, err = %d\n", ret);
                        continue;
                }
 
-               evsel = perf_evlist__id2evsel(self->evlist, sample.id);
+               evsel = perf_evlist__id2evsel(session->evlist, sample.id);
                assert(evsel != NULL);
 
+               origin = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
+
                if (event->header.type == PERF_RECORD_SAMPLE)
-                       perf_event__process_sample(event, evsel, &sample, self);
-               else if (event->header.type < PERF_RECORD_MAX) {
+                       ++top->samples;
+
+               switch (origin) {
+               case PERF_RECORD_MISC_USER:
+                       ++top->us_samples;
+                       if (top->hide_user_symbols)
+                               continue;
+                       machine = perf_session__find_host_machine(session);
+                       break;
+               case PERF_RECORD_MISC_KERNEL:
+                       ++top->kernel_samples;
+                       if (top->hide_kernel_symbols)
+                               continue;
+                       machine = perf_session__find_host_machine(session);
+                       break;
+               case PERF_RECORD_MISC_GUEST_KERNEL:
+                       ++top->guest_kernel_samples;
+                       machine = perf_session__find_machine(session, event->ip.pid);
+                       break;
+               case PERF_RECORD_MISC_GUEST_USER:
+                       ++top->guest_us_samples;
+                       /*
+                        * TODO: we don't process guest user from host side
+                        * except simple counting.
+                        */
+                       /* Fall thru */
+               default:
+                       continue;
+               }
+
+
+               if (event->header.type == PERF_RECORD_SAMPLE) {
+                       perf_event__process_sample(&top->tool, event, evsel,
+                                                  &sample, machine);
+               } else if (event->header.type < PERF_RECORD_MAX) {
                        hists__inc_nr_events(&evsel->hists, event->header.type);
-                       perf_event__process(event, &sample, self);
+                       perf_event__process(&top->tool, event, &sample, machine);
                } else
-                       ++self->hists.stats.nr_unknown_events;
+                       ++session->hists.stats.nr_unknown_events;
        }
 }
 
-static void perf_session__mmap_read(struct perf_session *self)
+static void perf_top__mmap_read(struct perf_top *top)
 {
        int i;
 
-       for (i = 0; i < top.evlist->nr_mmaps; i++)
-               perf_session__mmap_read_idx(self, i);
+       for (i = 0; i < top->evlist->nr_mmaps; i++)
+               perf_top__mmap_read_idx(top, i);
 }
 
-static void start_counters(struct perf_evlist *evlist)
+static void perf_top__start_counters(struct perf_top *top)
 {
        struct perf_evsel *counter, *first;
+       struct perf_evlist *evlist = top->evlist;
 
        first = list_entry(evlist->entries.next, struct perf_evsel, node);
 
@@ -849,15 +829,15 @@ static void start_counters(struct perf_evlist *evlist)
                struct perf_event_attr *attr = &counter->attr;
                struct xyarray *group_fd = NULL;
 
-               if (group && counter != first)
+               if (top->group && counter != first)
                        group_fd = first->fd;
 
                attr->sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID;
 
-               if (top.freq) {
+               if (top->freq) {
                        attr->sample_type |= PERF_SAMPLE_PERIOD;
                        attr->freq        = 1;
-                       attr->sample_freq = top.freq;
+                       attr->sample_freq = top->freq;
                }
 
                if (evlist->nr_entries > 1) {
@@ -870,23 +850,23 @@ static void start_counters(struct perf_evlist *evlist)
 
                attr->mmap = 1;
                attr->comm = 1;
-               attr->inherit = inherit;
+               attr->inherit = top->inherit;
 retry_sample_id:
-               attr->sample_id_all = sample_id_all_avail ? 1 : 0;
+               attr->sample_id_all = top->sample_id_all_avail ? 1 : 0;
 try_again:
-               if (perf_evsel__open(counter, top.evlist->cpus,
-                                    top.evlist->threads, group,
+               if (perf_evsel__open(counter, top->evlist->cpus,
+                                    top->evlist->threads, top->group,
                                     group_fd) < 0) {
                        int err = errno;
 
                        if (err == EPERM || err == EACCES) {
                                ui__error_paranoid();
                                goto out_err;
-                       } else if (err == EINVAL && sample_id_all_avail) {
+                       } else if (err == EINVAL && top->sample_id_all_avail) {
                                /*
                                 * Old kernel, no attr->sample_id_type_all field
                                 */
-                               sample_id_all_avail = false;
+                               top->sample_id_all_avail = false;
                                goto retry_sample_id;
                        }
                        /*
@@ -920,7 +900,7 @@ try_again:
                }
        }
 
-       if (perf_evlist__mmap(evlist, mmap_pages, false) < 0) {
+       if (perf_evlist__mmap(evlist, top->mmap_pages, false) < 0) {
                ui__warning("Failed to mmap with %d (%s)\n",
                            errno, strerror(errno));
                goto out_err;
@@ -933,14 +913,14 @@ out_err:
        exit(0);
 }
 
-static int setup_sample_type(void)
+static int perf_top__setup_sample_type(struct perf_top *top)
 {
-       if (!sort_has_symbols) {
+       if (!top->sort_has_symbols) {
                if (symbol_conf.use_callchain) {
                        ui__warning("Selected -g but \"sym\" not present in --sort/-s.");
                        return -EINVAL;
                }
-       } else if (!dont_use_callchains && callchain_param.mode != CHAIN_NONE) {
+       } else if (!top->dont_use_callchains && callchain_param.mode != CHAIN_NONE) {
                if (callchain_register_param(&callchain_param) < 0) {
                        ui__warning("Can't register callchain params.\n");
                        return -EINVAL;
@@ -950,7 +930,7 @@ static int setup_sample_type(void)
        return 0;
 }
 
-static int __cmd_top(void)
+static int __cmd_top(struct perf_top *top)
 {
        pthread_t thread;
        int ret;
@@ -958,39 +938,40 @@ static int __cmd_top(void)
         * FIXME: perf_session__new should allow passing a O_MMAP, so that all this
         * mmap reading, etc is encapsulated in it. Use O_WRONLY for now.
         */
-       top.session = perf_session__new(NULL, O_WRONLY, false, false, NULL);
-       if (top.session == NULL)
+       top->session = perf_session__new(NULL, O_WRONLY, false, false, NULL);
+       if (top->session == NULL)
                return -ENOMEM;
 
-       ret = setup_sample_type();
+       ret = perf_top__setup_sample_type(top);
        if (ret)
                goto out_delete;
 
-       if (top.target_tid != -1)
-               perf_event__synthesize_thread_map(top.evlist->threads,
-                                                 perf_event__process, top.session);
+       if (top->target_tid != -1)
+               perf_event__synthesize_thread_map(&top->tool, top->evlist->threads,
+                                                 perf_event__process,
+                                                 &top->session->host_machine);
        else
-               perf_event__synthesize_threads(perf_event__process, top.session);
-
-       start_counters(top.evlist);
-       top.session->evlist = top.evlist;
-       perf_session__update_sample_type(top.session);
+               perf_event__synthesize_threads(&top->tool, perf_event__process,
+                                              &top->session->host_machine);
+       perf_top__start_counters(top);
+       top->session->evlist = top->evlist;
+       perf_session__update_sample_type(top->session);
 
        /* Wait for a minimal set of events before starting the snapshot */
-       poll(top.evlist->pollfd, top.evlist->nr_fds, 100);
+       poll(top->evlist->pollfd, top->evlist->nr_fds, 100);
 
-       perf_session__mmap_read(top.session);
+       perf_top__mmap_read(top);
 
        if (pthread_create(&thread, NULL, (use_browser > 0 ? display_thread_tui :
-                                                            display_thread), NULL)) {
+                                                           display_thread), top)) {
                printf("Could not create display thread.\n");
                exit(-1);
        }
 
-       if (realtime_prio) {
+       if (top->realtime_prio) {
                struct sched_param param;
 
-               param.sched_priority = realtime_prio;
+               param.sched_priority = top->realtime_prio;
                if (sched_setscheduler(0, SCHED_FIFO, &param)) {
                        printf("Could not set realtime priority.\n");
                        exit(-1);
@@ -998,25 +979,25 @@ static int __cmd_top(void)
        }
 
        while (1) {
-               u64 hits = top.samples;
+               u64 hits = top->samples;
 
-               perf_session__mmap_read(top.session);
+               perf_top__mmap_read(top);
 
-               if (hits == top.samples)
-                       ret = poll(top.evlist->pollfd, top.evlist->nr_fds, 100);
+               if (hits == top->samples)
+                       ret = poll(top->evlist->pollfd, top->evlist->nr_fds, 100);
        }
 
 out_delete:
-       perf_session__delete(top.session);
-       top.session = NULL;
+       perf_session__delete(top->session);
+       top->session = NULL;
 
        return 0;
 }
 
 static int
-parse_callchain_opt(const struct option *opt __used, const char *arg,
-                   int unset)
+parse_callchain_opt(const struct option *opt, const char *arg, int unset)
 {
+       struct perf_top *top = (struct perf_top *)opt->value;
        char *tok, *tok2;
        char *endptr;
 
@@ -1024,7 +1005,7 @@ parse_callchain_opt(const struct option *opt __used, const char *arg,
         * --no-call-graph
         */
        if (unset) {
-               dont_use_callchains = true;
+               top->dont_use_callchains = true;
                return 0;
        }
 
@@ -1052,9 +1033,7 @@ parse_callchain_opt(const struct option *opt __used, const char *arg,
                symbol_conf.use_callchain = false;
 
                return 0;
-       }
-
-       else
+       } else
                return -1;
 
        /* get the min percentage */
@@ -1098,17 +1077,32 @@ static const char * const top_usage[] = {
        NULL
 };
 
-static const struct option options[] = {
+int cmd_top(int argc, const char **argv, const char *prefix __used)
+{
+       struct perf_evsel *pos;
+       int status = -ENOMEM;
+       struct perf_top top = {
+               .count_filter        = 5,
+               .delay_secs          = 2,
+               .target_pid          = -1,
+               .target_tid          = -1,
+               .freq                = 1000, /* 1 KHz */
+               .sample_id_all_avail = true,
+               .mmap_pages          = 128,
+               .sym_pcnt_filter     = 5,
+       };
+       char callchain_default_opt[] = "fractal,0.5,callee";
+       const struct option options[] = {
        OPT_CALLBACK('e', "event", &top.evlist, "event",
                     "event selector. use 'perf list' to list available events",
                     parse_events_option),
-       OPT_INTEGER('c', "count", &default_interval,
+       OPT_INTEGER('c', "count", &top.default_interval,
                    "event period to sample"),
        OPT_INTEGER('p', "pid", &top.target_pid,
                    "profile events on existing process id"),
        OPT_INTEGER('t', "tid", &top.target_tid,
                    "profile events on existing thread id"),
-       OPT_BOOLEAN('a', "all-cpus", &system_wide,
+       OPT_BOOLEAN('a', "all-cpus", &top.system_wide,
                            "system-wide collection from all CPUs"),
        OPT_STRING('C', "cpu", &top.cpu_list, "cpu",
                    "list of cpus to monitor"),
@@ -1116,20 +1110,20 @@ static const struct option options[] = {
                   "file", "vmlinux pathname"),
        OPT_BOOLEAN('K', "hide_kernel_symbols", &top.hide_kernel_symbols,
                    "hide kernel symbols"),
-       OPT_UINTEGER('m', "mmap-pages", &mmap_pages, "number of mmap data pages"),
-       OPT_INTEGER('r', "realtime", &realtime_prio,
+       OPT_UINTEGER('m', "mmap-pages", &top.mmap_pages, "number of mmap data pages"),
+       OPT_INTEGER('r', "realtime", &top.realtime_prio,
                    "collect data with this RT SCHED_FIFO priority"),
        OPT_INTEGER('d', "delay", &top.delay_secs,
                    "number of seconds to delay between refreshes"),
-       OPT_BOOLEAN('D', "dump-symtab", &dump_symtab,
+       OPT_BOOLEAN('D', "dump-symtab", &top.dump_symtab,
                            "dump the symbol table used for profiling"),
        OPT_INTEGER('f', "count-filter", &top.count_filter,
                    "only display functions with more events than this"),
-       OPT_BOOLEAN('g', "group", &group,
+       OPT_BOOLEAN('g', "group", &top.group,
                            "put the counters into a counter group"),
-       OPT_BOOLEAN('i', "inherit", &inherit,
+       OPT_BOOLEAN('i', "inherit", &top.inherit,
                    "child tasks inherit counters"),
-       OPT_STRING(0, "sym-annotate", &sym_filter, "symbol name",
+       OPT_STRING(0, "sym-annotate", &top.sym_filter, "symbol name",
                    "symbol to annotate"),
        OPT_BOOLEAN('z', "zero", &top.zero,
                    "zero history across updates"),
@@ -1139,15 +1133,15 @@ static const struct option options[] = {
                    "display this many functions"),
        OPT_BOOLEAN('U', "hide_user_symbols", &top.hide_user_symbols,
                    "hide user symbols"),
-       OPT_BOOLEAN(0, "tui", &use_tui, "Use the TUI interface"),
-       OPT_BOOLEAN(0, "stdio", &use_stdio, "Use the stdio interface"),
+       OPT_BOOLEAN(0, "tui", &top.use_tui, "Use the TUI interface"),
+       OPT_BOOLEAN(0, "stdio", &top.use_stdio, "Use the stdio interface"),
        OPT_INCR('v', "verbose", &verbose,
                    "be more verbose (show counter open errors, etc)"),
        OPT_STRING('s', "sort", &sort_order, "key[,key2...]",
                   "sort by key(s): pid, comm, dso, symbol, parent"),
        OPT_BOOLEAN('n', "show-nr-samples", &symbol_conf.show_nr_samples,
                    "Show a column with the number of samples"),
-       OPT_CALLBACK_DEFAULT('G', "call-graph", NULL, "output_type,min_percent, call_order",
+       OPT_CALLBACK_DEFAULT('G', "call-graph", &top, "output_type,min_percent, call_order",
                     "Display callchains using output_type (graph, flat, fractal, or none), min percent threshold and callchain order. "
                     "Default: fractal,0.5,callee", &parse_callchain_opt,
                     callchain_default_opt),
@@ -1166,12 +1160,7 @@ static const struct option options[] = {
        OPT_STRING('M', "disassembler-style", &disassembler_style, "disassembler style",
                   "Specify disassembler style (e.g. -M intel for intel syntax)"),
        OPT_END()
-};
-
-int cmd_top(int argc, const char **argv, const char *prefix __used)
-{
-       struct perf_evsel *pos;
-       int status = -ENOMEM;
+       };
 
        top.evlist = perf_evlist__new(NULL, NULL);
        if (top.evlist == NULL)
@@ -1188,9 +1177,9 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
 
        setup_sorting(top_usage, options);
 
-       if (use_stdio)
+       if (top.use_stdio)
                use_browser = 0;
-       else if (use_tui)
+       else if (top.use_tui)
                use_browser = 1;
 
        setup_browser(false);
@@ -1215,38 +1204,31 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
                return -ENOMEM;
        }
 
+       symbol_conf.nr_events = top.evlist->nr_entries;
+
        if (top.delay_secs < 1)
                top.delay_secs = 1;
 
        /*
         * User specified count overrides default frequency.
         */
-       if (default_interval)
+       if (top.default_interval)
                top.freq = 0;
        else if (top.freq) {
-               default_interval = top.freq;
+               top.default_interval = top.freq;
        } else {
                fprintf(stderr, "frequency and count are zero, aborting\n");
                exit(EXIT_FAILURE);
        }
 
        list_for_each_entry(pos, &top.evlist->entries, node) {
-               if (perf_evsel__alloc_fd(pos, top.evlist->cpus->nr,
-                                        top.evlist->threads->nr) < 0)
-                       goto out_free_fd;
                /*
                 * Fill in the ones not specifically initialized via -c:
                 */
-               if (pos->attr.sample_period)
-                       continue;
-
-               pos->attr.sample_period = default_interval;
+               if (!pos->attr.sample_period)
+                       pos->attr.sample_period = top.default_interval;
        }
 
-       if (perf_evlist__alloc_pollfd(top.evlist) < 0 ||
-           perf_evlist__alloc_mmap(top.evlist) < 0)
-               goto out_free_fd;
-
        top.sym_evsel = list_entry(top.evlist->entries.next, struct perf_evsel, node);
 
        symbol_conf.priv_size = sizeof(struct annotation);
@@ -1263,16 +1245,20 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
         * Avoid annotation data structures overhead when symbols aren't on the
         * sort list.
         */
-       sort_has_symbols = sort_sym.list.next != NULL;
+       top.sort_has_symbols = sort_sym.list.next != NULL;
 
-       get_term_dimensions(&winsize);
+       get_term_dimensions(&top.winsize);
        if (top.print_entries == 0) {
-               update_print_entries(&winsize);
-               signal(SIGWINCH, sig_winch_handler);
+               struct sigaction act = {
+                       .sa_sigaction = perf_top__sig_winch,
+                       .sa_flags     = SA_SIGINFO,
+               };
+               perf_top__update_print_entries(&top);
+               sigaction(SIGWINCH, &act, NULL);
        }
 
-       status = __cmd_top();
-out_free_fd:
+       status = __cmd_top(&top);
+
        perf_evlist__delete(top.evlist);
 
        return status;
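
The builtin-top.c changes above drop the file-scope globals (system_wide, mmap_pages, realtime_prio, ...) and move that state into struct perf_top, which perf_event__process_sample() recovers from its embedded struct perf_tool with container_of(). A minimal standalone sketch of that embedding idiom, using invented stand-in types rather than the perf structures:

#include <stddef.h>
#include <stdio.h>

/* the same offsetof() idiom as the kernel/perf container_of() macro */
#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct tool {                   /* stand-in for struct perf_tool */
        int placeholder;
};

struct top {                    /* stand-in for struct perf_top */
        int samples;
        struct tool tool;       /* embedded; only this is handed to callbacks */
};

static void handle_sample(struct tool *t)
{
        /* recover the enclosing object from the embedded member */
        struct top *top = container_of(t, struct top, tool);

        top->samples++;
}

int main(void)
{
        struct top top = { .samples = 0 };

        handle_sample(&top.tool);
        printf("samples = %d\n", top.samples); /* prints: samples = 1 */
        return 0;
}

Passing only the embedded tool keeps the callback signatures generic across commands while still giving each command access to its own state without globals.
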
index 73d0cac..2b2e225 100644
@@ -29,8 +29,6 @@ struct pager_config {
        int val;
 };
 
-static char debugfs_mntpt[MAXPATHLEN];
-
 static int pager_command_config(const char *var, const char *value, void *data)
 {
        struct pager_config *c = data;
@@ -81,15 +79,6 @@ static void commit_pager_choice(void)
        }
 }
 
-static void set_debugfs_path(void)
-{
-       char *path;
-
-       path = getenv(PERF_DEBUGFS_ENVIRONMENT);
-       snprintf(debugfs_path, MAXPATHLEN, "%s/%s", path ?: debugfs_mntpt,
-                "tracing/events");
-}
-
 static int handle_options(const char ***argv, int *argc, int *envchanged)
 {
        int handled = 0;
@@ -161,15 +150,14 @@ static int handle_options(const char ***argv, int *argc, int *envchanged)
                                fprintf(stderr, "No directory given for --debugfs-dir.\n");
                                usage(perf_usage_string);
                        }
-                       strncpy(debugfs_mntpt, (*argv)[1], MAXPATHLEN);
-                       debugfs_mntpt[MAXPATHLEN - 1] = '\0';
+                       debugfs_set_path((*argv)[1]);
                        if (envchanged)
                                *envchanged = 1;
                        (*argv)++;
                        (*argc)--;
                } else if (!prefixcmp(cmd, CMD_DEBUGFS_DIR)) {
-                       strncpy(debugfs_mntpt, cmd + strlen(CMD_DEBUGFS_DIR), MAXPATHLEN);
-                       debugfs_mntpt[MAXPATHLEN - 1] = '\0';
+                       debugfs_set_path(cmd + strlen(CMD_DEBUGFS_DIR));
+                       fprintf(stderr, "dir: %s\n", debugfs_mountpoint);
                        if (envchanged)
                                *envchanged = 1;
                } else {
@@ -281,7 +269,6 @@ static int run_builtin(struct cmd_struct *p, int argc, const char **argv)
        if (use_pager == -1 && p->option & USE_PAGER)
                use_pager = 1;
        commit_pager_choice();
-       set_debugfs_path();
 
        status = p->fn(argc, argv, prefix);
        exit_browser(status);
@@ -416,17 +403,6 @@ static int run_argv(int *argcp, const char ***argv)
        return done_alias;
 }
 
-/* mini /proc/mounts parser: searching for "^blah /mount/point debugfs" */
-static void get_debugfs_mntpt(void)
-{
-       const char *path = debugfs_mount(NULL);
-
-       if (path)
-               strncpy(debugfs_mntpt, path, sizeof(debugfs_mntpt));
-       else
-               debugfs_mntpt[0] = '\0';
-}
-
 static void pthread__block_sigwinch(void)
 {
        sigset_t set;
@@ -453,7 +429,7 @@ int main(int argc, const char **argv)
        if (!cmd)
                cmd = "perf-help";
        /* get debugfs mount point from /proc/mounts */
-       get_debugfs_mntpt();
+       debugfs_mount(NULL);
        /*
         * "perf-xxxx" is the same as "perf xxxx", but we obviously:
         *
@@ -476,7 +452,6 @@ int main(int argc, const char **argv)
        argc--;
        handle_options(&argv, &argc, NULL);
        commit_pager_choice();
-       set_debugfs_path();
        set_buildid_dir();
 
        if (argc > 0) {
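
The perf.c hunks above retire the local debugfs_mntpt buffer, the mini /proc/mounts parser and set_debugfs_path() in favour of the shared debugfs_mount()/debugfs_set_path() helpers. As a hedged approximation of the scan the removed get_debugfs_mntpt() relied on, one way to locate a debugfs mount via /proc/mounts with glibc's getmntent() (not the perf implementation itself):

#include <mntent.h>
#include <stdio.h>
#include <string.h>

/* scan /proc/mounts for a filesystem of type "debugfs" and report where it
 * is mounted; error handling trimmed for brevity */
static int find_debugfs(char *buf, size_t len)
{
        FILE *fp = setmntent("/proc/mounts", "r");
        struct mntent *m;
        int found = 0;

        if (fp == NULL)
                return 0;

        while ((m = getmntent(fp)) != NULL) {
                if (strcmp(m->mnt_type, "debugfs") == 0) {
                        snprintf(buf, len, "%s", m->mnt_dir);
                        found = 1;
                        break;
                }
        }
        endmntent(fp);
        return found;
}

int main(void)
{
        char path[256];

        if (find_debugfs(path, sizeof(path)))
                printf("debugfs mounted at %s\n", path);
        else
                printf("debugfs not mounted\n");
        return 0;
}
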
index 914c895..64f8bee 100644
@@ -185,4 +185,28 @@ extern const char perf_version_string[];
 
 void pthread__unblock_sigwinch(void);
 
+struct perf_record_opts {
+       pid_t        target_pid;
+       pid_t        target_tid;
+       bool         call_graph;
+       bool         group;
+       bool         inherit_stat;
+       bool         no_delay;
+       bool         no_inherit;
+       bool         no_samples;
+       bool         pipe_output;
+       bool         raw_samples;
+       bool         sample_address;
+       bool         sample_time;
+       bool         sample_id_all_avail;
+       bool         system_wide;
+       bool         period;
+       unsigned int freq;
+       unsigned int mmap_pages;
+       unsigned int user_freq;
+       u64          default_interval;
+       u64          user_interval;
+       const char   *cpu_list;
+};
+
 #endif
index 119e996..011ed26 100644
@@ -25,17 +25,17 @@ int symbol__annotate_init(struct map *map __used, struct symbol *sym)
        return 0;
 }
 
-int symbol__alloc_hist(struct symbol *sym, int nevents)
+int symbol__alloc_hist(struct symbol *sym)
 {
        struct annotation *notes = symbol__annotation(sym);
        size_t sizeof_sym_hist = (sizeof(struct sym_hist) +
                                  (sym->end - sym->start) * sizeof(u64));
 
-       notes->src = zalloc(sizeof(*notes->src) + nevents * sizeof_sym_hist);
+       notes->src = zalloc(sizeof(*notes->src) + symbol_conf.nr_events * sizeof_sym_hist);
        if (notes->src == NULL)
                return -1;
        notes->src->sizeof_sym_hist = sizeof_sym_hist;
-       notes->src->nr_histograms   = nevents;
+       notes->src->nr_histograms   = symbol_conf.nr_events;
        INIT_LIST_HEAD(&notes->src->source);
        return 0;
 }
@@ -334,7 +334,7 @@ fallback:
                 disassembler_style ? "-M " : "",
                 disassembler_style ? disassembler_style : "",
                 map__rip_2objdump(map, sym->start),
-                map__rip_2objdump(map, sym->end),
+                map__rip_2objdump(map, sym->end+1),
                 symbol_conf.annotate_asm_raw ? "" : "--no-show-raw",
                 symbol_conf.annotate_src ? "-S" : "",
                 symfs_filename, filename);
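
symbol__alloc_hist() now sizes the annotation buffer from symbol_conf.nr_events instead of taking an nevents argument, but the layout is unchanged: one header followed by nr_events variable-sized histograms, each with a counter per byte of the symbol. A standalone sketch of that sizing and indexing idiom, with invented stand-in types in place of sym_hist and the annotated-source header:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct hist {                   /* stand-in for struct sym_hist */
        uint64_t nr_samples;
        uint64_t addr[];        /* one counter per byte of the symbol */
};

struct source {                 /* stand-in for the notes->src header */
        size_t sizeof_hist;
        int    nr_histograms;
        char   histograms[];    /* nr_histograms blocks of sizeof_hist bytes */
};

static struct source *alloc_hist(size_t sym_len, int nr_events)
{
        size_t sizeof_hist = sizeof(struct hist) + sym_len * sizeof(uint64_t);
        struct source *src = calloc(1, sizeof(*src) + nr_events * sizeof_hist);

        if (src == NULL)
                return NULL;
        src->sizeof_hist   = sizeof_hist;
        src->nr_histograms = nr_events;
        return src;
}

static struct hist *histogram(struct source *src, int evidx)
{
        /* index into the flat buffer, one variable-sized slot per event */
        return (struct hist *)(src->histograms + evidx * src->sizeof_hist);
}

int main(void)
{
        struct source *src = alloc_hist(64, 2); /* 64-byte symbol, 2 events */

        if (src == NULL)
                return 1;
        histogram(src, 1)->nr_samples = 3;
        printf("%llu\n", (unsigned long long)histogram(src, 1)->nr_samples);
        free(src);
        return 0;
}

Keeping sizeof_hist in the header lets the indexing helper walk the flat buffer without knowing the symbol length at each call site.
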
index d907252..efa5dc8 100644
@@ -72,7 +72,7 @@ static inline struct annotation *symbol__annotation(struct symbol *sym)
 
 int symbol__inc_addr_samples(struct symbol *sym, struct map *map,
                             int evidx, u64 addr);
-int symbol__alloc_hist(struct symbol *sym, int nevents);
+int symbol__alloc_hist(struct symbol *sym);
 void symbol__annotate_zero_histograms(struct symbol *sym);
 
 int symbol__annotate(struct symbol *sym, struct map *map, size_t privsize);
@@ -99,8 +99,7 @@ static inline int symbol__tui_annotate(struct symbol *sym __used,
 }
 #else
 int symbol__tui_annotate(struct symbol *sym, struct map *map, int evidx,
-                        int nr_events, void(*timer)(void *arg), void *arg,
-                        int delay_secs);
+                        void(*timer)(void *arg), void *arg, int delay_secs);
 #endif
 
 extern const char      *disassembler_style;
index a91cd99..dff9c7a 100644
 #include "symbol.h"
 #include <linux/kernel.h>
 #include "debug.h"
+#include "session.h"
+#include "tool.h"
 
-static int build_id__mark_dso_hit(union perf_event *event,
+static int build_id__mark_dso_hit(struct perf_tool *tool __used,
+                                 union perf_event *event,
                                  struct perf_sample *sample __used,
                                  struct perf_evsel *evsel __used,
-                                 struct perf_session *session)
+                                 struct machine *machine)
 {
        struct addr_location al;
        u8 cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
-       struct thread *thread = perf_session__findnew(session, event->ip.pid);
+       struct thread *thread = machine__findnew_thread(machine, event->ip.pid);
 
        if (thread == NULL) {
                pr_err("problem processing %d event, skipping it.\n",
@@ -29,8 +32,8 @@ static int build_id__mark_dso_hit(union perf_event *event,
                return -1;
        }
 
-       thread__find_addr_map(thread, session, cpumode, MAP__FUNCTION,
-                             event->ip.pid, event->ip.ip, &al);
+       thread__find_addr_map(thread, machine, cpumode, MAP__FUNCTION,
+                             event->ip.ip, &al);
 
        if (al.map != NULL)
                al.map->dso->hit = 1;
@@ -38,25 +41,26 @@ static int build_id__mark_dso_hit(union perf_event *event,
        return 0;
 }
 
-static int perf_event__exit_del_thread(union perf_event *event,
+static int perf_event__exit_del_thread(struct perf_tool *tool __used,
+                                      union perf_event *event,
                                       struct perf_sample *sample __used,
-                                      struct perf_session *session)
+                                      struct machine *machine)
 {
-       struct thread *thread = perf_session__findnew(session, event->fork.tid);
+       struct thread *thread = machine__findnew_thread(machine, event->fork.tid);
 
        dump_printf("(%d:%d):(%d:%d)\n", event->fork.pid, event->fork.tid,
                    event->fork.ppid, event->fork.ptid);
 
        if (thread) {
-               rb_erase(&thread->rb_node, &session->threads);
-               session->last_match = NULL;
+               rb_erase(&thread->rb_node, &machine->threads);
+               machine->last_match = NULL;
                thread__delete(thread);
        }
 
        return 0;
 }
 
-struct perf_event_ops build_id__mark_dso_hit_ops = {
+struct perf_tool build_id__mark_dso_hit_ops = {
        .sample = build_id__mark_dso_hit,
        .mmap   = perf_event__process_mmap,
        .fork   = perf_event__process_task,
index 5dafb00..a993ba8 100644 (file)
@@ -3,7 +3,7 @@
 
 #include "session.h"
 
-extern struct perf_event_ops build_id__mark_dso_hit_ops;
+extern struct perf_tool build_id__mark_dso_hit_ops;
 
 char *dso__build_id_filename(struct dso *self, char *bf, size_t size);
 
index 9b4ff16..7f9c0f1 100644 (file)
@@ -101,6 +101,9 @@ int callchain_append(struct callchain_root *root,
 int callchain_merge(struct callchain_cursor *cursor,
                    struct callchain_root *dst, struct callchain_root *src);
 
+struct ip_callchain;
+union perf_event;
+
 bool ip_callchain__valid(struct ip_callchain *chain,
                         const union perf_event *event);
 /*
index 96bee5c..dbe2f16 100644 (file)
@@ -3,7 +3,6 @@
 #include "parse-options.h"
 #include "evsel.h"
 #include "cgroup.h"
-#include "debugfs.h" /* MAX_PATH, STR() */
 #include "evlist.h"
 
 int nr_cgroups;
@@ -12,7 +11,7 @@ static int
 cgroupfs_find_mountpoint(char *buf, size_t maxlen)
 {
        FILE *fp;
-       char mountpoint[MAX_PATH+1], tokens[MAX_PATH+1], type[MAX_PATH+1];
+       char mountpoint[PATH_MAX + 1], tokens[PATH_MAX + 1], type[PATH_MAX + 1];
        char *token, *saved_ptr = NULL;
        int found = 0;
 
@@ -25,8 +24,8 @@ cgroupfs_find_mountpoint(char *buf, size_t maxlen)
         * and inspect every cgroupfs mount point to find one that has
         * perf_event subsystem
         */
-       while (fscanf(fp, "%*s %"STR(MAX_PATH)"s %"STR(MAX_PATH)"s %"
-                               STR(MAX_PATH)"s %*d %*d\n",
+       while (fscanf(fp, "%*s %"STR(PATH_MAX)"s %"STR(PATH_MAX)"s %"
+                               STR(PATH_MAX)"s %*d %*d\n",
                                mountpoint, type, tokens) == 3) {
 
                if (!strcmp(type, "cgroup")) {
@@ -57,15 +56,15 @@ cgroupfs_find_mountpoint(char *buf, size_t maxlen)
 
 static int open_cgroup(char *name)
 {
-       char path[MAX_PATH+1];
-       char mnt[MAX_PATH+1];
+       char path[PATH_MAX + 1];
+       char mnt[PATH_MAX + 1];
        int fd;
 
 
-       if (cgroupfs_find_mountpoint(mnt, MAX_PATH+1))
+       if (cgroupfs_find_mountpoint(mnt, PATH_MAX + 1))
                return -1;
 
-       snprintf(path, MAX_PATH, "%s/%s", mnt, name);
+       snprintf(path, PATH_MAX, "%s/%s", mnt, name);
 
        fd = open(path, O_RDONLY);
        if (fd == -1)
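
The MAX_PATH -> PATH_MAX conversion above keeps the fscanf() conversions bounded by stringifying the limit into the format with the STR() macro. Below is a minimal standalone sketch of that /proc/mounts scan, not the perf helper itself; find_perf_cgroup_mount() and the main() driver are made up for the example.

#include <limits.h>     /* PATH_MAX */
#include <stdio.h>
#include <string.h>

#ifndef STR
# define _STR(x) #x
# define STR(x) _STR(x)
#endif

/* Scan /proc/mounts for a cgroup mount whose options mention perf_event. */
static int find_perf_cgroup_mount(char *buf, size_t maxlen)
{
        char mountpoint[PATH_MAX + 1], type[PATH_MAX + 1], tokens[PATH_MAX + 1];
        FILE *fp = fopen("/proc/mounts", "r");
        int found = 0;

        if (fp == NULL)
                return -1;

        /* STR(PATH_MAX) expands to e.g. "4096", bounding each %s read */
        while (fscanf(fp, "%*s %" STR(PATH_MAX) "s %" STR(PATH_MAX) "s %"
                      STR(PATH_MAX) "s %*d %*d\n",
                      mountpoint, type, tokens) == 3) {
                if (!strcmp(type, "cgroup") && strstr(tokens, "perf_event")) {
                        found = 1;
                        break;
                }
        }
        fclose(fp);

        if (!found)
                return -1;
        snprintf(buf, maxlen, "%s", mountpoint);
        return 0;
}

int main(void)
{
        char mnt[PATH_MAX + 1];

        if (!find_perf_cgroup_mount(mnt, sizeof(mnt)))
                printf("perf_event cgroup mounted at %s\n", mnt);
        return 0;
}

The real helper tokenizes the options field rather than substring-matching it; the sketch only keeps the bounded-fscanf idea.
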
index 80d9598..0deac6a 100644 (file)
@@ -1,5 +1,8 @@
 /*
- * GIT - The information manager from hell
+ * config.c
+ *
+ * Helper functions for parsing config items.
+ * Originally copied from GIT source.
  *
  * Copyright (C) Linus Torvalds, 2005
  * Copyright (C) Johannes Schindelin, 2005
index a88fefc..ffc35e7 100644 (file)
@@ -2,8 +2,12 @@
 #include "debugfs.h"
 #include "cache.h"
 
+#include <linux/kernel.h>
+#include <sys/mount.h>
+
 static int debugfs_premounted;
-static char debugfs_mountpoint[MAX_PATH+1];
+char debugfs_mountpoint[PATH_MAX + 1] = "/sys/kernel/debug";
+char tracing_events_path[PATH_MAX + 1] = "/sys/kernel/debug/tracing/events";
 
 static const char *debugfs_known_mountpoints[] = {
        "/sys/kernel/debug/",
@@ -62,11 +66,9 @@ const char *debugfs_find_mountpoint(void)
        /* give up and parse /proc/mounts */
        fp = fopen("/proc/mounts", "r");
        if (fp == NULL)
-               die("Can't open /proc/mounts for read");
+               return NULL;
 
-       while (fscanf(fp, "%*s %"
-                     STR(MAX_PATH)
-                     "s %99s %*s %*d %*d\n",
+       while (fscanf(fp, "%*s %" STR(PATH_MAX) "s %99s %*s %*d %*d\n",
                      debugfs_mountpoint, type) == 2) {
                if (strcmp(type, "debugfs") == 0)
                        break;
@@ -106,6 +108,12 @@ int debugfs_valid_entry(const char *path)
        return 0;
 }
 
+static void debugfs_set_tracing_events_path(const char *mountpoint)
+{
+       snprintf(tracing_events_path, sizeof(tracing_events_path), "%s/%s",
+                mountpoint, "tracing/events");
+}
+
 /* mount the debugfs somewhere if it's not mounted */
 
 char *debugfs_mount(const char *mountpoint)
@@ -113,7 +121,7 @@ char *debugfs_mount(const char *mountpoint)
        /* see if it's already mounted */
        if (debugfs_find_mountpoint()) {
                debugfs_premounted = 1;
-               return debugfs_mountpoint;
+               goto out;
        }
 
        /* if not mounted and no argument */
@@ -129,12 +137,19 @@ char *debugfs_mount(const char *mountpoint)
                return NULL;
 
        /* save the mountpoint */
-       strncpy(debugfs_mountpoint, mountpoint, sizeof(debugfs_mountpoint));
        debugfs_found = 1;
-
+       strncpy(debugfs_mountpoint, mountpoint, sizeof(debugfs_mountpoint));
+out:
+       debugfs_set_tracing_events_path(debugfs_mountpoint);
        return debugfs_mountpoint;
 }
 
+void debugfs_set_path(const char *mountpoint)
+{
+       snprintf(debugfs_mountpoint, sizeof(debugfs_mountpoint), "%s", mountpoint);
+       debugfs_set_tracing_events_path(mountpoint);
+}
+
 /* umount the debugfs */
 
 int debugfs_umount(void)
@@ -158,7 +173,7 @@ int debugfs_umount(void)
 
 int debugfs_write(const char *entry, const char *value)
 {
-       char path[MAX_PATH+1];
+       char path[PATH_MAX + 1];
        int ret, count;
        int fd;
 
@@ -203,7 +218,7 @@ int debugfs_write(const char *entry, const char *value)
  */
 int debugfs_read(const char *entry, char *buffer, size_t size)
 {
-       char path[MAX_PATH+1];
+       char path[PATH_MAX + 1];
        int ret;
        int fd;
 
index 83a0287..4a878f7 100644 (file)
@@ -1,25 +1,18 @@
 #ifndef __DEBUGFS_H__
 #define __DEBUGFS_H__
 
-#include <sys/mount.h>
+const char *debugfs_find_mountpoint(void);
+int debugfs_valid_mountpoint(const char *debugfs);
+int debugfs_valid_entry(const char *path);
+char *debugfs_mount(const char *mountpoint);
+int debugfs_umount(void);
+void debugfs_set_path(const char *mountpoint);
+int debugfs_write(const char *entry, const char *value);
+int debugfs_read(const char *entry, char *buffer, size_t size);
+void debugfs_force_cleanup(void);
+int debugfs_make_path(const char *element, char *buffer, int size);
 
-#ifndef MAX_PATH
-# define MAX_PATH 256
-#endif
-
-#ifndef STR
-# define _STR(x) #x
-# define STR(x) _STR(x)
-#endif
-
-extern const char *debugfs_find_mountpoint(void);
-extern int debugfs_valid_mountpoint(const char *debugfs);
-extern int debugfs_valid_entry(const char *path);
-extern char *debugfs_mount(const char *mountpoint);
-extern int debugfs_umount(void);
-extern int debugfs_write(const char *entry, const char *value);
-extern int debugfs_read(const char *entry, char *buffer, size_t size);
-extern void debugfs_force_cleanup(void);
-extern int debugfs_make_path(const char *element, char *buffer, int size);
+extern char debugfs_mountpoint[];
+extern char tracing_events_path[];
 
 #endif /* __DEBUGFS_H__ */
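
The tracing_events_path exported above is what later helpers (for instance the new trace_event__id() in evlist.c further down) use to turn an event name into its numeric id by reading <tracing_events_path>/<sys>/<event>/id. A rough standalone sketch of that lookup, assuming the default debugfs mount point; trace_event_id() and the main() driver are illustrative only.

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

static const char tracing_events_path[] = "/sys/kernel/debug/tracing/events";

/* Resolve "sched:sched_switch" (or "sched/sched_switch") to its event id. */
static int trace_event_id(const char *evname)
{
        char filename[4096], id[16], *colon;
        ssize_t n;
        int fd, err = -1;

        snprintf(filename, sizeof(filename), "%s/%s/id",
                 tracing_events_path, evname);

        /* accept the "sys:name" spelling by turning the colon into a slash */
        colon = strrchr(filename, ':');
        if (colon != NULL)
                *colon = '/';

        fd = open(filename, O_RDONLY);
        if (fd >= 0) {
                n = read(fd, id, sizeof(id) - 1);
                if (n > 0) {
                        id[n] = '\0';
                        err = atoi(id);
                }
                close(fd);
        }
        return err;
}

int main(int argc, char **argv)
{
        const char *name = argc > 1 ? argv[1] : "sched:sched_switch";

        printf("%s -> id %d\n", name, trace_event_id(name));
        return 0;
}
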
index 437f8ca..73ddaf0 100644 (file)
@@ -1,7 +1,6 @@
 #include <linux/types.h>
 #include "event.h"
 #include "debug.h"
-#include "session.h"
 #include "sort.h"
 #include "string.h"
 #include "strlist.h"
@@ -44,36 +43,27 @@ static struct perf_sample synth_sample = {
        .period    = 1,
 };
 
-static pid_t perf_event__synthesize_comm(union perf_event *event, pid_t pid,
-                                        int full, perf_event__handler_t process,
-                                        struct perf_session *session)
+static pid_t perf_event__get_comm_tgid(pid_t pid, char *comm, size_t len)
 {
        char filename[PATH_MAX];
        char bf[BUFSIZ];
        FILE *fp;
        size_t size = 0;
-       DIR *tasks;
-       struct dirent dirent, *next;
-       pid_t tgid = 0;
+       pid_t tgid = -1;
 
        snprintf(filename, sizeof(filename), "/proc/%d/status", pid);
 
        fp = fopen(filename, "r");
        if (fp == NULL) {
-out_race:
-               /*
-                * We raced with a task exiting - just return:
-                */
                pr_debug("couldn't open %s\n", filename);
                return 0;
        }
 
-       memset(&event->comm, 0, sizeof(event->comm));
-
-       while (!event->comm.comm[0] || !event->comm.pid) {
+       while (!comm[0] || (tgid < 0)) {
                if (fgets(bf, sizeof(bf), fp) == NULL) {
-                       pr_warning("couldn't get COMM and pgid, malformed %s\n", filename);
-                       goto out;
+                       pr_warning("couldn't get COMM and tgid, malformed %s\n",
+                                  filename);
+                       break;
                }
 
                if (memcmp(bf, "Name:", 5) == 0) {
@@ -81,33 +71,65 @@ out_race:
                        while (*name && isspace(*name))
                                ++name;
                        size = strlen(name) - 1;
-                       memcpy(event->comm.comm, name, size++);
+                       if (size >= len)
+                               size = len - 1;
+                       memcpy(comm, name, size);
+
                } else if (memcmp(bf, "Tgid:", 5) == 0) {
                        char *tgids = bf + 5;
                        while (*tgids && isspace(*tgids))
                                ++tgids;
-                       tgid = event->comm.pid = atoi(tgids);
+                       tgid = atoi(tgids);
                }
        }
 
+       fclose(fp);
+
+       return tgid;
+}
+
+static pid_t perf_event__synthesize_comm(struct perf_tool *tool,
+                                        union perf_event *event, pid_t pid,
+                                        int full,
+                                        perf_event__handler_t process,
+                                        struct machine *machine)
+{
+       char filename[PATH_MAX];
+       size_t size;
+       DIR *tasks;
+       struct dirent dirent, *next;
+       pid_t tgid;
+
+       memset(&event->comm, 0, sizeof(event->comm));
+
+       tgid = perf_event__get_comm_tgid(pid, event->comm.comm,
+                                        sizeof(event->comm.comm));
+       if (tgid < 0)
+               goto out;
+
+       event->comm.pid = tgid;
        event->comm.header.type = PERF_RECORD_COMM;
+
+       size = strlen(event->comm.comm) + 1;
        size = ALIGN(size, sizeof(u64));
-       memset(event->comm.comm + size, 0, session->id_hdr_size);
+       memset(event->comm.comm + size, 0, machine->id_hdr_size);
        event->comm.header.size = (sizeof(event->comm) -
                                (sizeof(event->comm.comm) - size) +
-                               session->id_hdr_size);
+                               machine->id_hdr_size);
        if (!full) {
                event->comm.tid = pid;
 
-               process(event, &synth_sample, session);
+               process(tool, event, &synth_sample, machine);
                goto out;
        }
 
        snprintf(filename, sizeof(filename), "/proc/%d/task", pid);
 
        tasks = opendir(filename);
-       if (tasks == NULL)
-               goto out_race;
+       if (tasks == NULL) {
+               pr_debug("couldn't open %s\n", filename);
+               return 0;
+       }
 
        while (!readdir_r(tasks, &dirent, &next) && next) {
                char *end;
@@ -115,22 +137,32 @@ out_race:
                if (*end)
                        continue;
 
+               /* already have tgid; just want to update the comm */
+               (void) perf_event__get_comm_tgid(pid, event->comm.comm,
+                                        sizeof(event->comm.comm));
+
+               size = strlen(event->comm.comm) + 1;
+               size = ALIGN(size, sizeof(u64));
+               memset(event->comm.comm + size, 0, machine->id_hdr_size);
+               event->comm.header.size = (sizeof(event->comm) -
+                                         (sizeof(event->comm.comm) - size) +
+                                         machine->id_hdr_size);
+
                event->comm.tid = pid;
 
-               process(event, &synth_sample, session);
+               process(tool, event, &synth_sample, machine);
        }
 
        closedir(tasks);
 out:
-       fclose(fp);
-
        return tgid;
 }
 
-static int perf_event__synthesize_mmap_events(union perf_event *event,
+static int perf_event__synthesize_mmap_events(struct perf_tool *tool,
+                                             union perf_event *event,
                                              pid_t pid, pid_t tgid,
                                              perf_event__handler_t process,
-                                             struct perf_session *session)
+                                             struct machine *machine)
 {
        char filename[PATH_MAX];
        FILE *fp;
@@ -193,12 +225,12 @@ static int perf_event__synthesize_mmap_events(union perf_event *event,
                        event->mmap.len -= event->mmap.start;
                        event->mmap.header.size = (sizeof(event->mmap) -
                                                (sizeof(event->mmap.filename) - size));
-                       memset(event->mmap.filename + size, 0, session->id_hdr_size);
-                       event->mmap.header.size += session->id_hdr_size;
+                       memset(event->mmap.filename + size, 0, machine->id_hdr_size);
+                       event->mmap.header.size += machine->id_hdr_size;
                        event->mmap.pid = tgid;
                        event->mmap.tid = pid;
 
-                       process(event, &synth_sample, session);
+                       process(tool, event, &synth_sample, machine);
                }
        }
 
@@ -206,14 +238,14 @@ static int perf_event__synthesize_mmap_events(union perf_event *event,
        return 0;
 }
 
-int perf_event__synthesize_modules(perf_event__handler_t process,
-                                  struct perf_session *session,
+int perf_event__synthesize_modules(struct perf_tool *tool,
+                                  perf_event__handler_t process,
                                   struct machine *machine)
 {
        struct rb_node *nd;
        struct map_groups *kmaps = &machine->kmaps;
        union perf_event *event = zalloc((sizeof(event->mmap) +
-                                         session->id_hdr_size));
+                                         machine->id_hdr_size));
        if (event == NULL) {
                pr_debug("Not enough memory synthesizing mmap event "
                         "for kernel modules\n");
@@ -243,15 +275,15 @@ int perf_event__synthesize_modules(perf_event__handler_t process,
                event->mmap.header.type = PERF_RECORD_MMAP;
                event->mmap.header.size = (sizeof(event->mmap) -
                                        (sizeof(event->mmap.filename) - size));
-               memset(event->mmap.filename + size, 0, session->id_hdr_size);
-               event->mmap.header.size += session->id_hdr_size;
+               memset(event->mmap.filename + size, 0, machine->id_hdr_size);
+               event->mmap.header.size += machine->id_hdr_size;
                event->mmap.start = pos->start;
                event->mmap.len   = pos->end - pos->start;
                event->mmap.pid   = machine->pid;
 
                memcpy(event->mmap.filename, pos->dso->long_name,
                       pos->dso->long_name_len + 1);
-               process(event, &synth_sample, session);
+               process(tool, event, &synth_sample, machine);
        }
 
        free(event);
@@ -260,40 +292,69 @@ int perf_event__synthesize_modules(perf_event__handler_t process,
 
 static int __event__synthesize_thread(union perf_event *comm_event,
                                      union perf_event *mmap_event,
-                                     pid_t pid, perf_event__handler_t process,
-                                     struct perf_session *session)
+                                     pid_t pid, int full,
+                                     perf_event__handler_t process,
+                                     struct perf_tool *tool,
+                                     struct machine *machine)
 {
-       pid_t tgid = perf_event__synthesize_comm(comm_event, pid, 1, process,
-                                           session);
+       pid_t tgid = perf_event__synthesize_comm(tool, comm_event, pid, full,
+                                                process, machine);
        if (tgid == -1)
                return -1;
-       return perf_event__synthesize_mmap_events(mmap_event, pid, tgid,
-                                            process, session);
+       return perf_event__synthesize_mmap_events(tool, mmap_event, pid, tgid,
+                                                 process, machine);
 }
 
-int perf_event__synthesize_thread_map(struct thread_map *threads,
+int perf_event__synthesize_thread_map(struct perf_tool *tool,
+                                     struct thread_map *threads,
                                      perf_event__handler_t process,
-                                     struct perf_session *session)
+                                     struct machine *machine)
 {
        union perf_event *comm_event, *mmap_event;
-       int err = -1, thread;
+       int err = -1, thread, j;
 
-       comm_event = malloc(sizeof(comm_event->comm) + session->id_hdr_size);
+       comm_event = malloc(sizeof(comm_event->comm) + machine->id_hdr_size);
        if (comm_event == NULL)
                goto out;
 
-       mmap_event = malloc(sizeof(mmap_event->mmap) + session->id_hdr_size);
+       mmap_event = malloc(sizeof(mmap_event->mmap) + machine->id_hdr_size);
        if (mmap_event == NULL)
                goto out_free_comm;
 
        err = 0;
        for (thread = 0; thread < threads->nr; ++thread) {
                if (__event__synthesize_thread(comm_event, mmap_event,
-                                              threads->map[thread],
-                                              process, session)) {
+                                              threads->map[thread], 0,
+                                              process, tool, machine)) {
                        err = -1;
                        break;
                }
+
+               /*
+                * comm.pid is set to thread group id by
+                * perf_event__synthesize_comm
+                */
+               if ((int) comm_event->comm.pid != threads->map[thread]) {
+                       bool need_leader = true;
+
+                       /* is thread group leader in thread_map? */
+                       for (j = 0; j < threads->nr; ++j) {
+                               if ((int) comm_event->comm.pid == threads->map[j]) {
+                                       need_leader = false;
+                                       break;
+                               }
+                       }
+
+                       /* if not, generate events for it */
+                       if (need_leader &&
+                           __event__synthesize_thread(comm_event,
+                                                     mmap_event,
+                                                     comm_event->comm.pid, 0,
+                                                     process, tool, machine)) {
+                               err = -1;
+                               break;
+                       }
+               }
        }
        free(mmap_event);
 out_free_comm:
@@ -302,19 +363,20 @@ out:
        return err;
 }
 
-int perf_event__synthesize_threads(perf_event__handler_t process,
-                                  struct perf_session *session)
+int perf_event__synthesize_threads(struct perf_tool *tool,
+                                  perf_event__handler_t process,
+                                  struct machine *machine)
 {
        DIR *proc;
        struct dirent dirent, *next;
        union perf_event *comm_event, *mmap_event;
        int err = -1;
 
-       comm_event = malloc(sizeof(comm_event->comm) + session->id_hdr_size);
+       comm_event = malloc(sizeof(comm_event->comm) + machine->id_hdr_size);
        if (comm_event == NULL)
                goto out;
 
-       mmap_event = malloc(sizeof(mmap_event->mmap) + session->id_hdr_size);
+       mmap_event = malloc(sizeof(mmap_event->mmap) + machine->id_hdr_size);
        if (mmap_event == NULL)
                goto out_free_comm;
 
@@ -329,8 +391,8 @@ int perf_event__synthesize_threads(perf_event__handler_t process,
                if (*end) /* only interested in proper numerical dirents */
                        continue;
 
-               __event__synthesize_thread(comm_event, mmap_event, pid,
-                                          process, session);
+               __event__synthesize_thread(comm_event, mmap_event, pid, 1,
+                                          process, tool, machine);
        }
 
        closedir(proc);
@@ -365,8 +427,8 @@ static int find_symbol_cb(void *arg, const char *name, char type,
        return 1;
 }
 
-int perf_event__synthesize_kernel_mmap(perf_event__handler_t process,
-                                      struct perf_session *session,
+int perf_event__synthesize_kernel_mmap(struct perf_tool *tool,
+                                      perf_event__handler_t process,
                                       struct machine *machine,
                                       const char *symbol_name)
 {
@@ -383,7 +445,7 @@ int perf_event__synthesize_kernel_mmap(perf_event__handler_t process,
         */
        struct process_symbol_args args = { .name = symbol_name, };
        union perf_event *event = zalloc((sizeof(event->mmap) +
-                                         session->id_hdr_size));
+                                         machine->id_hdr_size));
        if (event == NULL) {
                pr_debug("Not enough memory synthesizing mmap event "
                         "for kernel modules\n");
@@ -417,25 +479,32 @@ int perf_event__synthesize_kernel_mmap(perf_event__handler_t process,
        size = ALIGN(size, sizeof(u64));
        event->mmap.header.type = PERF_RECORD_MMAP;
        event->mmap.header.size = (sizeof(event->mmap) -
-                       (sizeof(event->mmap.filename) - size) + session->id_hdr_size);
+                       (sizeof(event->mmap.filename) - size) + machine->id_hdr_size);
        event->mmap.pgoff = args.start;
        event->mmap.start = map->start;
        event->mmap.len   = map->end - event->mmap.start;
        event->mmap.pid   = machine->pid;
 
-       err = process(event, &synth_sample, session);
+       err = process(tool, event, &synth_sample, machine);
        free(event);
 
        return err;
 }
 
-int perf_event__process_comm(union perf_event *event,
+size_t perf_event__fprintf_comm(union perf_event *event, FILE *fp)
+{
+       return fprintf(fp, ": %s:%d\n", event->comm.comm, event->comm.tid);
+}
+
+int perf_event__process_comm(struct perf_tool *tool __used,
+                            union perf_event *event,
                             struct perf_sample *sample __used,
-                            struct perf_session *session)
+                            struct machine *machine)
 {
-       struct thread *thread = perf_session__findnew(session, event->comm.tid);
+       struct thread *thread = machine__findnew_thread(machine, event->comm.tid);
 
-       dump_printf(": %s:%d\n", event->comm.comm, event->comm.tid);
+       if (dump_trace)
+               perf_event__fprintf_comm(event, stdout);
 
        if (thread == NULL || thread__set_comm(thread, event->comm.comm)) {
                dump_printf("problem processing PERF_RECORD_COMM, skipping event.\n");
@@ -445,13 +514,13 @@ int perf_event__process_comm(union perf_event *event,
        return 0;
 }
 
-int perf_event__process_lost(union perf_event *event,
+int perf_event__process_lost(struct perf_tool *tool __used,
+                            union perf_event *event,
                             struct perf_sample *sample __used,
-                            struct perf_session *session)
+                            struct machine *machine __used)
 {
        dump_printf(": id:%" PRIu64 ": lost:%" PRIu64 "\n",
                    event->lost.id, event->lost.lost);
-       session->hists.stats.total_lost += event->lost.lost;
        return 0;
 }
 
@@ -468,21 +537,15 @@ static void perf_event__set_kernel_mmap_len(union perf_event *event,
                maps[MAP__FUNCTION]->end = ~0ULL;
 }
 
-static int perf_event__process_kernel_mmap(union perf_event *event,
-                                          struct perf_session *session)
+static int perf_event__process_kernel_mmap(struct perf_tool *tool __used,
+                                          union perf_event *event,
+                                          struct machine *machine)
 {
        struct map *map;
        char kmmap_prefix[PATH_MAX];
-       struct machine *machine;
        enum dso_kernel_type kernel_type;
        bool is_kernel_mmap;
 
-       machine = perf_session__findnew_machine(session, event->mmap.pid);
-       if (!machine) {
-               pr_err("Can't find id %d's machine\n", event->mmap.pid);
-               goto out_problem;
-       }
-
        machine__mmap_name(machine, kmmap_prefix, sizeof(kmmap_prefix));
        if (machine__is_host(machine))
                kernel_type = DSO_TYPE_KERNEL;
@@ -549,9 +612,9 @@ static int perf_event__process_kernel_mmap(union perf_event *event,
                 * time /proc/sys/kernel/kptr_restrict was non zero.
                 */
                if (event->mmap.pgoff != 0) {
-                       perf_session__set_kallsyms_ref_reloc_sym(machine->vmlinux_maps,
-                                                                symbol_name,
-                                                                event->mmap.pgoff);
+                       maps__set_kallsyms_ref_reloc_sym(machine->vmlinux_maps,
+                                                        symbol_name,
+                                                        event->mmap.pgoff);
                }
 
                if (machine__is_default_guest(machine)) {
@@ -567,32 +630,35 @@ out_problem:
        return -1;
 }
 
-int perf_event__process_mmap(union perf_event *event,
+size_t perf_event__fprintf_mmap(union perf_event *event, FILE *fp)
+{
+       return fprintf(fp, " %d/%d: [%#" PRIx64 "(%#" PRIx64 ") @ %#" PRIx64 "]: %s\n",
+                      event->mmap.pid, event->mmap.tid, event->mmap.start,
+                      event->mmap.len, event->mmap.pgoff, event->mmap.filename);
+}
+
+int perf_event__process_mmap(struct perf_tool *tool,
+                            union perf_event *event,
                             struct perf_sample *sample __used,
-                            struct perf_session *session)
+                            struct machine *machine)
 {
-       struct machine *machine;
        struct thread *thread;
        struct map *map;
        u8 cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
        int ret = 0;
 
-       dump_printf(" %d/%d: [%#" PRIx64 "(%#" PRIx64 ") @ %#" PRIx64 "]: %s\n",
-                       event->mmap.pid, event->mmap.tid, event->mmap.start,
-                       event->mmap.len, event->mmap.pgoff, event->mmap.filename);
+       if (dump_trace)
+               perf_event__fprintf_mmap(event, stdout);
 
        if (cpumode == PERF_RECORD_MISC_GUEST_KERNEL ||
            cpumode == PERF_RECORD_MISC_KERNEL) {
-               ret = perf_event__process_kernel_mmap(event, session);
+               ret = perf_event__process_kernel_mmap(tool, event, machine);
                if (ret < 0)
                        goto out_problem;
                return 0;
        }
 
-       machine = perf_session__find_host_machine(session);
-       if (machine == NULL)
-               goto out_problem;
-       thread = perf_session__findnew(session, event->mmap.pid);
+       thread = machine__findnew_thread(machine, event->mmap.pid);
        if (thread == NULL)
                goto out_problem;
        map = map__new(&machine->user_dsos, event->mmap.start,
@@ -610,18 +676,26 @@ out_problem:
        return 0;
 }
 
-int perf_event__process_task(union perf_event *event,
+size_t perf_event__fprintf_task(union perf_event *event, FILE *fp)
+{
+       return fprintf(fp, "(%d:%d):(%d:%d)\n",
+                      event->fork.pid, event->fork.tid,
+                      event->fork.ppid, event->fork.ptid);
+}
+
+int perf_event__process_task(struct perf_tool *tool __used,
+                            union perf_event *event,
                             struct perf_sample *sample __used,
-                            struct perf_session *session)
+                             struct machine *machine)
 {
-       struct thread *thread = perf_session__findnew(session, event->fork.tid);
-       struct thread *parent = perf_session__findnew(session, event->fork.ptid);
+       struct thread *thread = machine__findnew_thread(machine, event->fork.tid);
+       struct thread *parent = machine__findnew_thread(machine, event->fork.ptid);
 
-       dump_printf("(%d:%d):(%d:%d)\n", event->fork.pid, event->fork.tid,
-                   event->fork.ppid, event->fork.ptid);
+       if (dump_trace)
+               perf_event__fprintf_task(event, stdout);
 
        if (event->header.type == PERF_RECORD_EXIT) {
-               perf_session__remove_thread(session, thread);
+               machine__remove_thread(machine, thread);
                return 0;
        }
 
@@ -634,22 +708,45 @@ int perf_event__process_task(union perf_event *event,
        return 0;
 }
 
-int perf_event__process(union perf_event *event, struct perf_sample *sample,
-                       struct perf_session *session)
+size_t perf_event__fprintf(union perf_event *event, FILE *fp)
+{
+       size_t ret = fprintf(fp, "PERF_RECORD_%s",
+                            perf_event__name(event->header.type));
+
+       switch (event->header.type) {
+       case PERF_RECORD_COMM:
+               ret += perf_event__fprintf_comm(event, fp);
+               break;
+       case PERF_RECORD_FORK:
+       case PERF_RECORD_EXIT:
+               ret += perf_event__fprintf_task(event, fp);
+               break;
+       case PERF_RECORD_MMAP:
+               ret += perf_event__fprintf_mmap(event, fp);
+               break;
+       default:
+               ret += fprintf(fp, "\n");
+       }
+
+       return ret;
+}
+
+int perf_event__process(struct perf_tool *tool, union perf_event *event,
+                       struct perf_sample *sample, struct machine *machine)
 {
        switch (event->header.type) {
        case PERF_RECORD_COMM:
-               perf_event__process_comm(event, sample, session);
+               perf_event__process_comm(tool, event, sample, machine);
                break;
        case PERF_RECORD_MMAP:
-               perf_event__process_mmap(event, sample, session);
+               perf_event__process_mmap(tool, event, sample, machine);
                break;
        case PERF_RECORD_FORK:
        case PERF_RECORD_EXIT:
-               perf_event__process_task(event, sample, session);
+               perf_event__process_task(tool, event, sample, machine);
                break;
        case PERF_RECORD_LOST:
-               perf_event__process_lost(event, sample, session);
+               perf_event__process_lost(tool, event, sample, machine);
        default:
                break;
        }
@@ -658,36 +755,29 @@ int perf_event__process(union perf_event *event, struct perf_sample *sample,
 }
 
 void thread__find_addr_map(struct thread *self,
-                          struct perf_session *session, u8 cpumode,
-                          enum map_type type, pid_t pid, u64 addr,
+                          struct machine *machine, u8 cpumode,
+                          enum map_type type, u64 addr,
                           struct addr_location *al)
 {
        struct map_groups *mg = &self->mg;
-       struct machine *machine = NULL;
 
        al->thread = self;
        al->addr = addr;
        al->cpumode = cpumode;
        al->filtered = false;
 
+       if (machine == NULL) {
+               al->map = NULL;
+               return;
+       }
+
        if (cpumode == PERF_RECORD_MISC_KERNEL && perf_host) {
                al->level = 'k';
-               machine = perf_session__find_host_machine(session);
-               if (machine == NULL) {
-                       al->map = NULL;
-                       return;
-               }
                mg = &machine->kmaps;
        } else if (cpumode == PERF_RECORD_MISC_USER && perf_host) {
                al->level = '.';
-               machine = perf_session__find_host_machine(session);
        } else if (cpumode == PERF_RECORD_MISC_GUEST_KERNEL && perf_guest) {
                al->level = 'g';
-               machine = perf_session__find_machine(session, pid);
-               if (machine == NULL) {
-                       al->map = NULL;
-                       return;
-               }
                mg = &machine->kmaps;
        } else {
                /*
@@ -733,13 +823,12 @@ try_again:
                al->addr = al->map->map_ip(al->map, al->addr);
 }
 
-void thread__find_addr_location(struct thread *self,
-                               struct perf_session *session, u8 cpumode,
-                               enum map_type type, pid_t pid, u64 addr,
+void thread__find_addr_location(struct thread *thread, struct machine *machine,
+                               u8 cpumode, enum map_type type, u64 addr,
                                struct addr_location *al,
                                symbol_filter_t filter)
 {
-       thread__find_addr_map(self, session, cpumode, type, pid, addr, al);
+       thread__find_addr_map(thread, machine, cpumode, type, addr, al);
        if (al->map != NULL)
                al->sym = map__find_symbol(al->map, al->addr, filter);
        else
@@ -747,13 +836,13 @@ void thread__find_addr_location(struct thread *self,
 }
 
 int perf_event__preprocess_sample(const union perf_event *event,
-                                 struct perf_session *session,
+                                 struct machine *machine,
                                  struct addr_location *al,
                                  struct perf_sample *sample,
                                  symbol_filter_t filter)
 {
        u8 cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
-       struct thread *thread = perf_session__findnew(session, event->ip.pid);
+       struct thread *thread = machine__findnew_thread(machine, event->ip.pid);
 
        if (thread == NULL)
                return -1;
@@ -764,18 +853,18 @@ int perf_event__preprocess_sample(const union perf_event *event,
 
        dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid);
        /*
-        * Have we already created the kernel maps for the host machine?
+        * Have we already created the kernel maps for this machine?
         *
         * This should have happened earlier, when we processed the kernel MMAP
         * events, but for older perf.data files there was no such thing, so do
         * it now.
         */
        if (cpumode == PERF_RECORD_MISC_KERNEL &&
-           session->host_machine.vmlinux_maps[MAP__FUNCTION] == NULL)
-               machine__create_kernel_maps(&session->host_machine);
+           machine->vmlinux_maps[MAP__FUNCTION] == NULL)
+               machine__create_kernel_maps(machine);
 
-       thread__find_addr_map(thread, session, cpumode, MAP__FUNCTION,
-                             event->ip.pid, event->ip.ip, al);
+       thread__find_addr_map(thread, machine, cpumode, MAP__FUNCTION,
+                             event->ip.ip, al);
        dump_printf(" ...... dso: %s\n",
                    al->map ? al->map->dso->long_name :
                        al->level == 'H' ? "[hypervisor]" : "<not found>");
@@ -783,13 +872,14 @@ int perf_event__preprocess_sample(const union perf_event *event,
        al->cpu = sample->cpu;
 
        if (al->map) {
+               struct dso *dso = al->map->dso;
+
                if (symbol_conf.dso_list &&
-                   (!al->map || !al->map->dso ||
-                    !(strlist__has_entry(symbol_conf.dso_list,
-                                         al->map->dso->short_name) ||
-                      (al->map->dso->short_name != al->map->dso->long_name &&
-                       strlist__has_entry(symbol_conf.dso_list,
-                                          al->map->dso->long_name)))))
+                   (!dso || !(strlist__has_entry(symbol_conf.dso_list,
+                                                 dso->short_name) ||
+                              (dso->short_name != dso->long_name &&
+                               strlist__has_entry(symbol_conf.dso_list,
+                                                  dso->long_name)))))
                        goto out_filtered;
 
                al->sym = map__find_symbol(al->map, al->addr, filter);
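
The refactor above splits the /proc/<pid>/status parsing out of perf_event__synthesize_comm() into perf_event__get_comm_tgid(), which scans the file for the Name: and Tgid: lines. A minimal standalone sketch of that parsing loop (not the perf helper itself; buffer sizes and the main() driver are arbitrary):

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>

/* Return the thread group id of @pid and copy its comm into @comm. */
static pid_t get_comm_tgid(pid_t pid, char *comm, size_t len)
{
        char path[64], line[BUFSIZ];
        pid_t tgid = -1;
        FILE *fp;

        snprintf(path, sizeof(path), "/proc/%d/status", (int)pid);
        fp = fopen(path, "r");
        if (fp == NULL)
                return -1;

        comm[0] = '\0';
        /* stop once both the comm and the tgid have been seen */
        while ((comm[0] == '\0' || tgid < 0) &&
               fgets(line, sizeof(line), fp) != NULL) {
                if (memcmp(line, "Name:", 5) == 0) {
                        char *name = line + 5;

                        while (*name && isspace((unsigned char)*name))
                                ++name;
                        name[strcspn(name, "\n")] = '\0';
                        snprintf(comm, len, "%s", name);
                } else if (memcmp(line, "Tgid:", 5) == 0) {
                        tgid = atoi(line + 5);
                }
        }
        fclose(fp);
        return tgid;
}

int main(int argc, char **argv)
{
        char comm[64];
        pid_t pid = argc > 1 ? atoi(argv[1]) : getpid();
        pid_t tgid = get_comm_tgid(pid, comm, sizeof(comm));

        printf("pid %d: comm '%s', tgid %d\n", (int)pid, comm, (int)tgid);
        return tgid < 0;
}

Unlike the sketch, the perf code returns 0 when the status file cannot be opened, so callers treat a task that exited mid-scan as non-fatal.
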
index 357a85b..cbdeaad 100644 (file)
@@ -2,6 +2,7 @@
 #define __PERF_RECORD_H
 
 #include <limits.h>
+#include <stdio.h>
 
 #include "../perf.h"
 #include "map.h"
@@ -141,43 +142,54 @@ union perf_event {
 
 void perf_event__print_totals(void);
 
-struct perf_session;
+struct perf_tool;
 struct thread_map;
 
-typedef int (*perf_event__handler_synth_t)(union perf_event *event, 
-                                          struct perf_session *session);
-typedef int (*perf_event__handler_t)(union perf_event *event,
+typedef int (*perf_event__handler_t)(struct perf_tool *tool,
+                                    union perf_event *event,
                                     struct perf_sample *sample,
-                                     struct perf_session *session);
+                                    struct machine *machine);
 
-int perf_event__synthesize_thread_map(struct thread_map *threads,
+int perf_event__synthesize_thread_map(struct perf_tool *tool,
+                                     struct thread_map *threads,
                                      perf_event__handler_t process,
-                                     struct perf_session *session);
-int perf_event__synthesize_threads(perf_event__handler_t process,
-                                  struct perf_session *session);
-int perf_event__synthesize_kernel_mmap(perf_event__handler_t process,
-                                      struct perf_session *session,
+                                     struct machine *machine);
+int perf_event__synthesize_threads(struct perf_tool *tool,
+                                  perf_event__handler_t process,
+                                  struct machine *machine);
+int perf_event__synthesize_kernel_mmap(struct perf_tool *tool,
+                                      perf_event__handler_t process,
                                       struct machine *machine,
                                       const char *symbol_name);
 
-int perf_event__synthesize_modules(perf_event__handler_t process,
-                                  struct perf_session *session,
+int perf_event__synthesize_modules(struct perf_tool *tool,
+                                  perf_event__handler_t process,
                                   struct machine *machine);
 
-int perf_event__process_comm(union perf_event *event, struct perf_sample *sample,
-                            struct perf_session *session);
-int perf_event__process_lost(union perf_event *event, struct perf_sample *sample,
-                            struct perf_session *session);
-int perf_event__process_mmap(union perf_event *event, struct perf_sample *sample,
-                            struct perf_session *session);
-int perf_event__process_task(union perf_event *event, struct perf_sample *sample,
-                            struct perf_session *session);
-int perf_event__process(union perf_event *event, struct perf_sample *sample,
-                       struct perf_session *session);
+int perf_event__process_comm(struct perf_tool *tool,
+                            union perf_event *event,
+                            struct perf_sample *sample,
+                            struct machine *machine);
+int perf_event__process_lost(struct perf_tool *tool,
+                            union perf_event *event,
+                            struct perf_sample *sample,
+                            struct machine *machine);
+int perf_event__process_mmap(struct perf_tool *tool,
+                            union perf_event *event,
+                            struct perf_sample *sample,
+                            struct machine *machine);
+int perf_event__process_task(struct perf_tool *tool,
+                            union perf_event *event,
+                            struct perf_sample *sample,
+                            struct machine *machine);
+int perf_event__process(struct perf_tool *tool,
+                       union perf_event *event,
+                       struct perf_sample *sample,
+                       struct machine *machine);
 
 struct addr_location;
 int perf_event__preprocess_sample(const union perf_event *self,
-                                 struct perf_session *session,
+                                 struct machine *machine,
                                  struct addr_location *al,
                                  struct perf_sample *sample,
                                  symbol_filter_t filter);
@@ -187,5 +199,13 @@ const char *perf_event__name(unsigned int id);
 int perf_event__parse_sample(const union perf_event *event, u64 type,
                             int sample_size, bool sample_id_all,
                             struct perf_sample *sample, bool swapped);
+int perf_event__synthesize_sample(union perf_event *event, u64 type,
+                                 const struct perf_sample *sample,
+                                 bool swapped);
+
+size_t perf_event__fprintf_comm(union perf_event *event, FILE *fp);
+size_t perf_event__fprintf_mmap(union perf_event *event, FILE *fp);
+size_t perf_event__fprintf_task(union perf_event *event, FILE *fp);
+size_t perf_event__fprintf(union perf_event *event, FILE *fp);
 
 #endif /* __PERF_RECORD_H */
index fbb4b4a..fa18370 100644 (file)
@@ -6,12 +6,16 @@
  *
  * Released under the GPL v2. (and only v2, not any later version)
  */
+#include "util.h"
+#include "debugfs.h"
 #include <poll.h>
 #include "cpumap.h"
 #include "thread_map.h"
 #include "evlist.h"
 #include "evsel.h"
-#include "util.h"
+#include <unistd.h>
+
+#include "parse-events.h"
 
 #include <sys/mman.h>
 
@@ -30,6 +34,7 @@ void perf_evlist__init(struct perf_evlist *evlist, struct cpu_map *cpus,
                INIT_HLIST_HEAD(&evlist->heads[i]);
        INIT_LIST_HEAD(&evlist->entries);
        perf_evlist__set_maps(evlist, cpus, threads);
+       evlist->workload.pid = -1;
 }
 
 struct perf_evlist *perf_evlist__new(struct cpu_map *cpus,
@@ -43,6 +48,22 @@ struct perf_evlist *perf_evlist__new(struct cpu_map *cpus,
        return evlist;
 }
 
+void perf_evlist__config_attrs(struct perf_evlist *evlist,
+                              struct perf_record_opts *opts)
+{
+       struct perf_evsel *evsel;
+
+       if (evlist->cpus->map[0] < 0)
+               opts->no_inherit = true;
+
+       list_for_each_entry(evsel, &evlist->entries, node) {
+               perf_evsel__config(evsel, opts);
+
+               if (evlist->nr_entries > 1)
+                       evsel->attr.sample_type |= PERF_SAMPLE_ID;
+       }
+}
+
 static void perf_evlist__purge(struct perf_evlist *evlist)
 {
        struct perf_evsel *pos, *n;
@@ -76,6 +97,14 @@ void perf_evlist__add(struct perf_evlist *evlist, struct perf_evsel *entry)
        ++evlist->nr_entries;
 }
 
+static void perf_evlist__splice_list_tail(struct perf_evlist *evlist,
+                                         struct list_head *list,
+                                         int nr_entries)
+{
+       list_splice_tail(list, &evlist->entries);
+       evlist->nr_entries += nr_entries;
+}
+
 int perf_evlist__add_default(struct perf_evlist *evlist)
 {
        struct perf_event_attr attr = {
@@ -100,6 +129,126 @@ error:
        return -ENOMEM;
 }
 
+int perf_evlist__add_attrs(struct perf_evlist *evlist,
+                          struct perf_event_attr *attrs, size_t nr_attrs)
+{
+       struct perf_evsel *evsel, *n;
+       LIST_HEAD(head);
+       size_t i;
+
+       for (i = 0; i < nr_attrs; i++) {
+               evsel = perf_evsel__new(attrs + i, evlist->nr_entries + i);
+               if (evsel == NULL)
+                       goto out_delete_partial_list;
+               list_add_tail(&evsel->node, &head);
+       }
+
+       perf_evlist__splice_list_tail(evlist, &head, nr_attrs);
+
+       return 0;
+
+out_delete_partial_list:
+       list_for_each_entry_safe(evsel, n, &head, node)
+               perf_evsel__delete(evsel);
+       return -1;
+}
+
+static int trace_event__id(const char *evname)
+{
+       char *filename, *colon;
+       int err = -1, fd;
+
+       if (asprintf(&filename, "%s/%s/id", tracing_events_path, evname) < 0)
+               return -1;
+
+       colon = strrchr(filename, ':');
+       if (colon != NULL)
+               *colon = '/';
+
+       fd = open(filename, O_RDONLY);
+       if (fd >= 0) {
+               char id[16];
+               if (read(fd, id, sizeof(id)) > 0)
+                       err = atoi(id);
+               close(fd);
+       }
+
+       free(filename);
+       return err;
+}
+
+int perf_evlist__add_tracepoints(struct perf_evlist *evlist,
+                                const char *tracepoints[],
+                                size_t nr_tracepoints)
+{
+       int err;
+       size_t i;
+       struct perf_event_attr *attrs = zalloc(nr_tracepoints * sizeof(*attrs));
+
+       if (attrs == NULL)
+               return -1;
+
+       for (i = 0; i < nr_tracepoints; i++) {
+               err = trace_event__id(tracepoints[i]);
+
+               if (err < 0)
+                       goto out_free_attrs;
+
+               attrs[i].type          = PERF_TYPE_TRACEPOINT;
+               attrs[i].config        = err;
+               attrs[i].sample_type   = (PERF_SAMPLE_RAW | PERF_SAMPLE_TIME |
+                                         PERF_SAMPLE_CPU);
+               attrs[i].sample_period = 1;
+       }
+
+       err = perf_evlist__add_attrs(evlist, attrs, nr_tracepoints);
+out_free_attrs:
+       free(attrs);
+       return err;
+}
+
+static struct perf_evsel *
+       perf_evlist__find_tracepoint_by_id(struct perf_evlist *evlist, int id)
+{
+       struct perf_evsel *evsel;
+
+       list_for_each_entry(evsel, &evlist->entries, node) {
+               if (evsel->attr.type   == PERF_TYPE_TRACEPOINT &&
+                   (int)evsel->attr.config == id)
+                       return evsel;
+       }
+
+       return NULL;
+}
+
+int perf_evlist__set_tracepoints_handlers(struct perf_evlist *evlist,
+                                         const struct perf_evsel_str_handler *assocs,
+                                         size_t nr_assocs)
+{
+       struct perf_evsel *evsel;
+       int err;
+       size_t i;
+
+       for (i = 0; i < nr_assocs; i++) {
+               err = trace_event__id(assocs[i].name);
+               if (err < 0)
+                       goto out;
+
+               evsel = perf_evlist__find_tracepoint_by_id(evlist, err);
+               if (evsel == NULL)
+                       continue;
+
+               err = -EEXIST;
+               if (evsel->handler.func != NULL)
+                       goto out;
+               evsel->handler.func = assocs[i].handler;
+       }
+
+       err = 0;
+out:
+       return err;
+}
+
 void perf_evlist__disable(struct perf_evlist *evlist)
 {
        int cpu, thread;
@@ -126,7 +275,7 @@ void perf_evlist__enable(struct perf_evlist *evlist)
        }
 }
 
-int perf_evlist__alloc_pollfd(struct perf_evlist *evlist)
+static int perf_evlist__alloc_pollfd(struct perf_evlist *evlist)
 {
        int nfds = evlist->cpus->nr * evlist->threads->nr * evlist->nr_entries;
        evlist->pollfd = malloc(sizeof(struct pollfd) * nfds);
@@ -282,7 +431,7 @@ void perf_evlist__munmap(struct perf_evlist *evlist)
        evlist->mmap = NULL;
 }
 
-int perf_evlist__alloc_mmap(struct perf_evlist *evlist)
+static int perf_evlist__alloc_mmap(struct perf_evlist *evlist)
 {
        evlist->nr_mmaps = evlist->cpus->nr;
        if (evlist->cpus->map[0] == -1)
@@ -298,8 +447,10 @@ static int __perf_evlist__mmap(struct perf_evlist *evlist,
        evlist->mmap[idx].mask = mask;
        evlist->mmap[idx].base = mmap(NULL, evlist->mmap_len, prot,
                                      MAP_SHARED, fd, 0);
-       if (evlist->mmap[idx].base == MAP_FAILED)
+       if (evlist->mmap[idx].base == MAP_FAILED) {
+               evlist->mmap[idx].base = NULL;
                return -1;
+       }
 
        perf_evlist__add_pollfd(evlist, fd);
        return 0;
@@ -400,14 +551,22 @@ out_unmap:
  *
  * Using perf_evlist__read_on_cpu does this automatically.
  */
-int perf_evlist__mmap(struct perf_evlist *evlist, int pages, bool overwrite)
+int perf_evlist__mmap(struct perf_evlist *evlist, unsigned int pages,
+                     bool overwrite)
 {
        unsigned int page_size = sysconf(_SC_PAGE_SIZE);
-       int mask = pages * page_size - 1;
        struct perf_evsel *evsel;
        const struct cpu_map *cpus = evlist->cpus;
        const struct thread_map *threads = evlist->threads;
-       int prot = PROT_READ | (overwrite ? 0 : PROT_WRITE);
+       int prot = PROT_READ | (overwrite ? 0 : PROT_WRITE), mask;
+
+       /* 512 kiB: default amount of unprivileged mlocked memory */
+       if (pages == UINT_MAX)
+               pages = (512 * 1024) / page_size;
+       else if (!is_power_of_2(pages))
+               return -EINVAL;
+
+       mask = pages * page_size - 1;
 
        if (evlist->mmap == NULL && perf_evlist__alloc_mmap(evlist) < 0)
                return -ENOMEM;
@@ -512,6 +671,38 @@ u64 perf_evlist__sample_type(const struct perf_evlist *evlist)
        return first->attr.sample_type;
 }
 
+u16 perf_evlist__id_hdr_size(const struct perf_evlist *evlist)
+{
+       struct perf_evsel *first;
+       struct perf_sample *data;
+       u64 sample_type;
+       u16 size = 0;
+
+       first = list_entry(evlist->entries.next, struct perf_evsel, node);
+
+       if (!first->attr.sample_id_all)
+               goto out;
+
+       sample_type = first->attr.sample_type;
+
+       if (sample_type & PERF_SAMPLE_TID)
+               size += sizeof(data->tid) * 2;
+
+       if (sample_type & PERF_SAMPLE_TIME)
+               size += sizeof(data->time);
+
+       if (sample_type & PERF_SAMPLE_ID)
+               size += sizeof(data->id);
+
+       if (sample_type & PERF_SAMPLE_STREAM_ID)
+               size += sizeof(data->stream_id);
+
+       if (sample_type & PERF_SAMPLE_CPU)
+               size += sizeof(data->cpu) * 2;
+out:
+       return size;
+}
+
 bool perf_evlist__valid_sample_id_all(const struct perf_evlist *evlist)
 {
        struct perf_evsel *pos, *first;
@@ -569,3 +760,97 @@ out_err:
 
        return err;
 }
+
+int perf_evlist__prepare_workload(struct perf_evlist *evlist,
+                                 struct perf_record_opts *opts,
+                                 const char *argv[])
+{
+       int child_ready_pipe[2], go_pipe[2];
+       char bf;
+
+       if (pipe(child_ready_pipe) < 0) {
+               perror("failed to create 'ready' pipe");
+               return -1;
+       }
+
+       if (pipe(go_pipe) < 0) {
+               perror("failed to create 'go' pipe");
+               goto out_close_ready_pipe;
+       }
+
+       evlist->workload.pid = fork();
+       if (evlist->workload.pid < 0) {
+               perror("failed to fork");
+               goto out_close_pipes;
+       }
+
+       if (!evlist->workload.pid) {
+               if (opts->pipe_output)
+                       dup2(2, 1);
+
+               close(child_ready_pipe[0]);
+               close(go_pipe[1]);
+               fcntl(go_pipe[0], F_SETFD, FD_CLOEXEC);
+
+               /*
+                * Do a dummy execvp to get the PLT entry resolved,
+                * so we avoid the resolver overhead on the real
+                * execvp call.
+                */
+               execvp("", (char **)argv);
+
+               /*
+                * Tell the parent we're ready to go
+                */
+               close(child_ready_pipe[1]);
+
+               /*
+                * Wait until the parent tells us to go.
+                */
+               if (read(go_pipe[0], &bf, 1) == -1)
+                       perror("unable to read pipe");
+
+               execvp(argv[0], (char **)argv);
+
+               perror(argv[0]);
+               kill(getppid(), SIGUSR1);
+               exit(-1);
+       }
+
+       if (!opts->system_wide && opts->target_tid == -1 && opts->target_pid == -1)
+               evlist->threads->map[0] = evlist->workload.pid;
+
+       close(child_ready_pipe[1]);
+       close(go_pipe[0]);
+       /*
+        * wait for child to settle
+        */
+       if (read(child_ready_pipe[0], &bf, 1) == -1) {
+               perror("unable to read pipe");
+               goto out_close_pipes;
+       }
+
+       evlist->workload.cork_fd = go_pipe[1];
+       close(child_ready_pipe[0]);
+       return 0;
+
+out_close_pipes:
+       close(go_pipe[0]);
+       close(go_pipe[1]);
+out_close_ready_pipe:
+       close(child_ready_pipe[0]);
+       close(child_ready_pipe[1]);
+       return -1;
+}
+
+int perf_evlist__start_workload(struct perf_evlist *evlist)
+{
+       if (evlist->workload.cork_fd > 0) {
+               /*
+                * Remove the cork, let it rip!
+                */
+               return close(evlist->workload.cork_fd);
+       }
+
+       return 0;
+}
index 1779ffe..8922aee 100644 (file)
@@ -2,12 +2,16 @@
 #define __PERF_EVLIST_H 1
 
 #include <linux/list.h>
+#include <stdio.h>
 #include "../perf.h"
 #include "event.h"
+#include "util.h"
+#include <unistd.h>
 
 struct pollfd;
 struct thread_map;
 struct cpu_map;
+struct perf_record_opts;
 
 #define PERF_EVLIST__HLIST_BITS 8
 #define PERF_EVLIST__HLIST_SIZE (1 << PERF_EVLIST__HLIST_BITS)
@@ -19,6 +23,10 @@ struct perf_evlist {
        int              nr_fds;
        int              nr_mmaps;
        int              mmap_len;
+       struct {
+               int     cork_fd;
+               pid_t   pid;
+       } workload;
        bool             overwrite;
        union perf_event event_copy;
        struct perf_mmap *mmap;
@@ -28,6 +36,11 @@ struct perf_evlist {
        struct perf_evsel *selected;
 };
 
+struct perf_evsel_str_handler {
+       const char *name;
+       void       *handler;
+};
+
 struct perf_evsel;
 
 struct perf_evlist *perf_evlist__new(struct cpu_map *cpus,
@@ -39,11 +52,26 @@ void perf_evlist__delete(struct perf_evlist *evlist);
 
 void perf_evlist__add(struct perf_evlist *evlist, struct perf_evsel *entry);
 int perf_evlist__add_default(struct perf_evlist *evlist);
+int perf_evlist__add_attrs(struct perf_evlist *evlist,
+                          struct perf_event_attr *attrs, size_t nr_attrs);
+int perf_evlist__add_tracepoints(struct perf_evlist *evlist,
+                                const char *tracepoints[], size_t nr_tracepoints);
+int perf_evlist__set_tracepoints_handlers(struct perf_evlist *evlist,
+                                         const struct perf_evsel_str_handler *assocs,
+                                         size_t nr_assocs);
+
+#define perf_evlist__add_attrs_array(evlist, array) \
+       perf_evlist__add_attrs(evlist, array, ARRAY_SIZE(array))
+
+#define perf_evlist__add_tracepoints_array(evlist, array) \
+       perf_evlist__add_tracepoints(evlist, array, ARRAY_SIZE(array))
+
+#define perf_evlist__set_tracepoints_handlers_array(evlist, array) \
+       perf_evlist__set_tracepoints_handlers(evlist, array, ARRAY_SIZE(array))
 
 void perf_evlist__id_add(struct perf_evlist *evlist, struct perf_evsel *evsel,
                         int cpu, int thread, u64 id);
 
-int perf_evlist__alloc_pollfd(struct perf_evlist *evlist);
 void perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd);
 
 struct perf_evsel *perf_evlist__id2evsel(struct perf_evlist *evlist, u64 id);
@@ -52,8 +80,16 @@ union perf_event *perf_evlist__mmap_read(struct perf_evlist *self, int idx);
 
 int perf_evlist__open(struct perf_evlist *evlist, bool group);
 
-int perf_evlist__alloc_mmap(struct perf_evlist *evlist);
-int perf_evlist__mmap(struct perf_evlist *evlist, int pages, bool overwrite);
+void perf_evlist__config_attrs(struct perf_evlist *evlist,
+                              struct perf_record_opts *opts);
+
+int perf_evlist__prepare_workload(struct perf_evlist *evlist,
+                                 struct perf_record_opts *opts,
+                                 const char *argv[]);
+int perf_evlist__start_workload(struct perf_evlist *evlist);
+
+int perf_evlist__mmap(struct perf_evlist *evlist, unsigned int pages,
+                     bool overwrite);
 void perf_evlist__munmap(struct perf_evlist *evlist);
 
 void perf_evlist__disable(struct perf_evlist *evlist);
@@ -77,6 +113,7 @@ int perf_evlist__set_filters(struct perf_evlist *evlist);
 
 u64 perf_evlist__sample_type(const struct perf_evlist *evlist);
 bool perf_evlist__sample_id_all(const struct perf_evlist *evlist);
+u16 perf_evlist__id_hdr_size(const struct perf_evlist *evlist);
 
 bool perf_evlist__valid_sample_type(const struct perf_evlist *evlist);
 bool perf_evlist__valid_sample_id_all(const struct perf_evlist *evlist);
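
A short sketch of the bulk helpers declared above; the attribute array, tracepoint list and handler function are hypothetical, only the evlist calls and macros come from this header:

	struct perf_event_attr attrs[] = {
		{ .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES },
		{ .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK },
	};
	const char *tracepoints[] = { "sched:sched_switch" };
	const struct perf_evsel_str_handler handlers[] = {
		{ "sched:sched_switch", process_sched_switch },	/* hypothetical handler */
	};

	if (perf_evlist__add_attrs_array(evlist, attrs) < 0 ||
	    perf_evlist__add_tracepoints_array(evlist, tracepoints) < 0 ||
	    perf_evlist__set_tracepoints_handlers_array(evlist, handlers) < 0)
		return -1;
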
index d7915d4..667f3b7 100644 (file)
@@ -63,6 +63,79 @@ struct perf_evsel *perf_evsel__new(struct perf_event_attr *attr, int idx)
        return evsel;
 }
 
+void perf_evsel__config(struct perf_evsel *evsel, struct perf_record_opts *opts)
+{
+       struct perf_event_attr *attr = &evsel->attr;
+       int track = !evsel->idx; /* only the first counter needs these */
+
+       attr->sample_id_all = opts->sample_id_all_avail ? 1 : 0;
+       attr->inherit       = !opts->no_inherit;
+       attr->read_format   = PERF_FORMAT_TOTAL_TIME_ENABLED |
+                             PERF_FORMAT_TOTAL_TIME_RUNNING |
+                             PERF_FORMAT_ID;
+
+       attr->sample_type  |= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
+
+       /*
+        * We default some events to a sample period of 1, but keep
+        * it as a weak assumption that the user can override.
+        */
+       if (!attr->sample_period || (opts->user_freq != UINT_MAX &&
+                                    opts->user_interval != ULLONG_MAX)) {
+               if (opts->freq) {
+                       attr->sample_type       |= PERF_SAMPLE_PERIOD;
+                       attr->freq              = 1;
+                       attr->sample_freq       = opts->freq;
+               } else {
+                       attr->sample_period = opts->default_interval;
+               }
+       }
+
+       if (opts->no_samples)
+               attr->sample_freq = 0;
+
+       if (opts->inherit_stat)
+               attr->inherit_stat = 1;
+
+       if (opts->sample_address) {
+               attr->sample_type       |= PERF_SAMPLE_ADDR;
+               attr->mmap_data = track;
+       }
+
+       if (opts->call_graph)
+               attr->sample_type       |= PERF_SAMPLE_CALLCHAIN;
+
+       if (opts->system_wide)
+               attr->sample_type       |= PERF_SAMPLE_CPU;
+
+       if (opts->period)
+               attr->sample_type       |= PERF_SAMPLE_PERIOD;
+
+       if (opts->sample_id_all_avail &&
+           (opts->sample_time || opts->system_wide ||
+            !opts->no_inherit || opts->cpu_list))
+               attr->sample_type       |= PERF_SAMPLE_TIME;
+
+       if (opts->raw_samples) {
+               attr->sample_type       |= PERF_SAMPLE_TIME;
+               attr->sample_type       |= PERF_SAMPLE_RAW;
+               attr->sample_type       |= PERF_SAMPLE_CPU;
+       }
+
+       if (opts->no_delay) {
+               attr->watermark = 0;
+               attr->wakeup_events = 1;
+       }
+
+       attr->mmap = track;
+       attr->comm = track;
+
+       if (opts->target_pid == -1 && opts->target_tid == -1 && !opts->system_wide) {
+               attr->disabled = 1;
+               attr->enable_on_exec = 1;
+       }
+}
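
A sketch of how this per-evsel setup is applied across a whole event list (this is essentially what perf_evlist__config_attrs(), declared in the evlist.h hunk above, is for); the opts values shown are illustrative:

	struct perf_record_opts opts = {
		.freq			= 4000,		/* sample at 4 kHz */
		.sample_id_all_avail	= true,
		.target_pid		= -1,
		.target_tid		= -1,
	};
	struct perf_evsel *evsel;

	list_for_each_entry(evsel, &evlist->entries, node)
		perf_evsel__config(evsel, &opts);
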
+
 int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads)
 {
        int cpu, thread;
@@ -387,7 +460,7 @@ int perf_event__parse_sample(const union perf_event *event, u64 type,
                u32 val32[2];
        } u;
 
-
+       memset(data, 0, sizeof(*data));
        data->cpu = data->pid = data->tid = -1;
        data->stream_id = data->id = data->time = -1ULL;
 
@@ -504,3 +577,82 @@ int perf_event__parse_sample(const union perf_event *event, u64 type,
 
        return 0;
 }
+
+int perf_event__synthesize_sample(union perf_event *event, u64 type,
+                                 const struct perf_sample *sample,
+                                 bool swapped)
+{
+       u64 *array;
+
+       /*
+        * used for cross-endian analysis. See git commit 65014ab3
+        * for why this goofiness is needed.
+        */
+       union {
+               u64 val64;
+               u32 val32[2];
+       } u;
+
+       array = event->sample.array;
+
+       if (type & PERF_SAMPLE_IP) {
+               event->ip.ip = sample->ip;
+               array++;
+       }
+
+       if (type & PERF_SAMPLE_TID) {
+               u.val32[0] = sample->pid;
+               u.val32[1] = sample->tid;
+               if (swapped) {
+                       /*
+                        * Inverse of what is done in perf_event__parse_sample
+                        */
+                       u.val32[0] = bswap_32(u.val32[0]);
+                       u.val32[1] = bswap_32(u.val32[1]);
+                       u.val64 = bswap_64(u.val64);
+               }
+
+               *array = u.val64;
+               array++;
+       }
+
+       if (type & PERF_SAMPLE_TIME) {
+               *array = sample->time;
+               array++;
+       }
+
+       if (type & PERF_SAMPLE_ADDR) {
+               *array = sample->addr;
+               array++;
+       }
+
+       if (type & PERF_SAMPLE_ID) {
+               *array = sample->id;
+               array++;
+       }
+
+       if (type & PERF_SAMPLE_STREAM_ID) {
+               *array = sample->stream_id;
+               array++;
+       }
+
+       if (type & PERF_SAMPLE_CPU) {
+               u.val32[0] = sample->cpu;
+               if (swapped) {
+                       /*
+                        * Inverse of what is done in perf_event__parse_sample
+                        */
+                       u.val32[0] = bswap_32(u.val32[0]);
+                       u.val64 = bswap_64(u.val64);
+               }
+               *array = u.val64;
+               array++;
+       }
+
+       if (type & PERF_SAMPLE_PERIOD) {
+               *array = sample->period;
+               array++;
+       }
+
+       return 0;
+}
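
Since the parser and the synthesizer above walk the PERF_SAMPLE_* bits in the same fixed order, the size of the synthesized body is fully determined by the sample type. A small helper sketch (not part of the patch) that mirrors the bits handled here, each of which consumes one u64 slot (TID and CPU pack two u32 values into theirs):

	static size_t synth_sample_size(u64 type)
	{
		const u64 handled = PERF_SAMPLE_IP | PERF_SAMPLE_TID |
				    PERF_SAMPLE_TIME | PERF_SAMPLE_ADDR |
				    PERF_SAMPLE_ID | PERF_SAMPLE_STREAM_ID |
				    PERF_SAMPLE_CPU | PERF_SAMPLE_PERIOD;

		return hweight64(type & handled) * sizeof(u64);
	}
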
index b1d15e6..326b8e4 100644 (file)
@@ -61,12 +61,17 @@ struct perf_evsel {
                off_t           id_offset;
        };
        struct cgroup_sel       *cgrp;
+       struct {
+               void            *func;
+               void            *data;
+       } handler;
        bool                    supported;
 };
 
 struct cpu_map;
 struct thread_map;
 struct perf_evlist;
+struct perf_record_opts;
 
 struct perf_evsel *perf_evsel__new(struct perf_event_attr *attr, int idx);
 void perf_evsel__init(struct perf_evsel *evsel,
@@ -74,6 +79,9 @@ void perf_evsel__init(struct perf_evsel *evsel,
 void perf_evsel__exit(struct perf_evsel *evsel);
 void perf_evsel__delete(struct perf_evsel *evsel);
 
+void perf_evsel__config(struct perf_evsel *evsel,
+                       struct perf_record_opts *opts);
+
 int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads);
 int perf_evsel__alloc_id(struct perf_evsel *evsel, int ncpus, int nthreads);
 int perf_evsel__alloc_counts(struct perf_evsel *evsel, int ncpus);
index 33c17a2..3e7e0b0 100644 (file)
@@ -8,6 +8,7 @@
 #include <stdlib.h>
 #include <linux/list.h>
 #include <linux/kernel.h>
+#include <linux/bitops.h>
 #include <sys/utsname.h>
 
 #include "evlist.h"
@@ -28,9 +29,6 @@ static struct perf_trace_event_type *events;
 static u32 header_argc;
 static const char **header_argv;
 
-static int dsos__write_buildid_table(struct perf_header *header, int fd);
-static int perf_session__cache_build_ids(struct perf_session *session);
-
 int perf_header__push_event(u64 id, const char *name)
 {
        if (strlen(name) > MAX_EVENT_NAME)
@@ -187,6 +185,252 @@ perf_header__set_cmdline(int argc, const char **argv)
        return 0;
 }
 
+#define dsos__for_each_with_build_id(pos, head)        \
+       list_for_each_entry(pos, head, node)    \
+               if (!pos->has_build_id)         \
+                       continue;               \
+               else
+
+static int __dsos__write_buildid_table(struct list_head *head, pid_t pid,
+                               u16 misc, int fd)
+{
+       struct dso *pos;
+
+       dsos__for_each_with_build_id(pos, head) {
+               int err;
+               struct build_id_event b;
+               size_t len;
+
+               if (!pos->hit)
+                       continue;
+               len = pos->long_name_len + 1;
+               len = ALIGN(len, NAME_ALIGN);
+               memset(&b, 0, sizeof(b));
+               memcpy(&b.build_id, pos->build_id, sizeof(pos->build_id));
+               b.pid = pid;
+               b.header.misc = misc;
+               b.header.size = sizeof(b) + len;
+               err = do_write(fd, &b, sizeof(b));
+               if (err < 0)
+                       return err;
+               err = write_padded(fd, pos->long_name,
+                                  pos->long_name_len + 1, len);
+               if (err < 0)
+                       return err;
+       }
+
+       return 0;
+}
+
+static int machine__write_buildid_table(struct machine *machine, int fd)
+{
+       int err;
+       u16 kmisc = PERF_RECORD_MISC_KERNEL,
+           umisc = PERF_RECORD_MISC_USER;
+
+       if (!machine__is_host(machine)) {
+               kmisc = PERF_RECORD_MISC_GUEST_KERNEL;
+               umisc = PERF_RECORD_MISC_GUEST_USER;
+       }
+
+       err = __dsos__write_buildid_table(&machine->kernel_dsos, machine->pid,
+                                         kmisc, fd);
+       if (err == 0)
+               err = __dsos__write_buildid_table(&machine->user_dsos,
+                                                 machine->pid, umisc, fd);
+       return err;
+}
+
+static int dsos__write_buildid_table(struct perf_header *header, int fd)
+{
+       struct perf_session *session = container_of(header,
+                       struct perf_session, header);
+       struct rb_node *nd;
+       int err = machine__write_buildid_table(&session->host_machine, fd);
+
+       if (err)
+               return err;
+
+       for (nd = rb_first(&session->machines); nd; nd = rb_next(nd)) {
+               struct machine *pos = rb_entry(nd, struct machine, rb_node);
+               err = machine__write_buildid_table(pos, fd);
+               if (err)
+                       break;
+       }
+       return err;
+}
+
+int build_id_cache__add_s(const char *sbuild_id, const char *debugdir,
+                         const char *name, bool is_kallsyms)
+{
+       const size_t size = PATH_MAX;
+       char *realname, *filename = zalloc(size),
+            *linkname = zalloc(size), *targetname;
+       int len, err = -1;
+
+       if (is_kallsyms) {
+               if (symbol_conf.kptr_restrict) {
+                       pr_debug("Not caching a kptr_restrict'ed /proc/kallsyms\n");
+                       return 0;
+               }
+               realname = (char *)name;
+       } else
+               realname = realpath(name, NULL);
+
+       if (realname == NULL || filename == NULL || linkname == NULL)
+               goto out_free;
+
+       len = snprintf(filename, size, "%s%s%s",
+                      debugdir, is_kallsyms ? "/" : "", realname);
+       if (mkdir_p(filename, 0755))
+               goto out_free;
+
+       snprintf(filename + len, size - len, "/%s", sbuild_id);
+
+       if (access(filename, F_OK)) {
+               if (is_kallsyms) {
+                        if (copyfile("/proc/kallsyms", filename))
+                               goto out_free;
+               } else if (link(realname, filename) && copyfile(name, filename))
+                       goto out_free;
+       }
+
+       len = snprintf(linkname, size, "%s/.build-id/%.2s",
+                      debugdir, sbuild_id);
+
+       if (access(linkname, X_OK) && mkdir_p(linkname, 0755))
+               goto out_free;
+
+       snprintf(linkname + len, size - len, "/%s", sbuild_id + 2);
+       targetname = filename + strlen(debugdir) - 5;
+       memcpy(targetname, "../..", 5);
+
+       if (symlink(targetname, linkname) == 0)
+               err = 0;
+out_free:
+       if (!is_kallsyms)
+               free(realname);
+       free(filename);
+       free(linkname);
+       return err;
+}
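
For orientation, a hypothetical call and the layout it produces; the debugdir is normally something like $HOME/.debug, and the paths shown are illustrative:

	/*
	 * Caches /usr/lib/libfoo.so in the build-id cache:
	 *
	 *   <debugdir>/usr/lib/libfoo.so/<sbuild_id>        copy or hard link of the DSO
	 *   <debugdir>/.build-id/<xx>/<rest of sbuild_id>   symlink via the "../.."
	 *                                                   target back to the entry above
	 */
	build_id_cache__add_s(sbuild_id, "/home/user/.debug",
			      "/usr/lib/libfoo.so", false);
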
+
+static int build_id_cache__add_b(const u8 *build_id, size_t build_id_size,
+                                const char *name, const char *debugdir,
+                                bool is_kallsyms)
+{
+       char sbuild_id[BUILD_ID_SIZE * 2 + 1];
+
+       build_id__sprintf(build_id, build_id_size, sbuild_id);
+
+       return build_id_cache__add_s(sbuild_id, debugdir, name, is_kallsyms);
+}
+
+int build_id_cache__remove_s(const char *sbuild_id, const char *debugdir)
+{
+       const size_t size = PATH_MAX;
+       char *filename = zalloc(size),
+            *linkname = zalloc(size);
+       int err = -1;
+
+       if (filename == NULL || linkname == NULL)
+               goto out_free;
+
+       snprintf(linkname, size, "%s/.build-id/%.2s/%s",
+                debugdir, sbuild_id, sbuild_id + 2);
+
+       if (access(linkname, F_OK))
+               goto out_free;
+
+       if (readlink(linkname, filename, size - 1) < 0)
+               goto out_free;
+
+       if (unlink(linkname))
+               goto out_free;
+
+       /*
+        * Since the link is relative, we must make it absolute:
+        */
+       snprintf(linkname, size, "%s/.build-id/%.2s/%s",
+                debugdir, sbuild_id, filename);
+
+       if (unlink(linkname))
+               goto out_free;
+
+       err = 0;
+out_free:
+       free(filename);
+       free(linkname);
+       return err;
+}
+
+static int dso__cache_build_id(struct dso *dso, const char *debugdir)
+{
+       bool is_kallsyms = dso->kernel && dso->long_name[0] != '/';
+
+       return build_id_cache__add_b(dso->build_id, sizeof(dso->build_id),
+                                    dso->long_name, debugdir, is_kallsyms);
+}
+
+static int __dsos__cache_build_ids(struct list_head *head, const char *debugdir)
+{
+       struct dso *pos;
+       int err = 0;
+
+       dsos__for_each_with_build_id(pos, head)
+               if (dso__cache_build_id(pos, debugdir))
+                       err = -1;
+
+       return err;
+}
+
+static int machine__cache_build_ids(struct machine *machine, const char *debugdir)
+{
+       int ret = __dsos__cache_build_ids(&machine->kernel_dsos, debugdir);
+       ret |= __dsos__cache_build_ids(&machine->user_dsos, debugdir);
+       return ret;
+}
+
+static int perf_session__cache_build_ids(struct perf_session *session)
+{
+       struct rb_node *nd;
+       int ret;
+       char debugdir[PATH_MAX];
+
+       snprintf(debugdir, sizeof(debugdir), "%s", buildid_dir);
+
+       if (mkdir(debugdir, 0755) != 0 && errno != EEXIST)
+               return -1;
+
+       ret = machine__cache_build_ids(&session->host_machine, debugdir);
+
+       for (nd = rb_first(&session->machines); nd; nd = rb_next(nd)) {
+               struct machine *pos = rb_entry(nd, struct machine, rb_node);
+               ret |= machine__cache_build_ids(pos, debugdir);
+       }
+       return ret ? -1 : 0;
+}
+
+static bool machine__read_build_ids(struct machine *machine, bool with_hits)
+{
+       bool ret = __dsos__read_build_ids(&machine->kernel_dsos, with_hits);
+       ret |= __dsos__read_build_ids(&machine->user_dsos, with_hits);
+       return ret;
+}
+
+static bool perf_session__read_build_ids(struct perf_session *session, bool with_hits)
+{
+       struct rb_node *nd;
+       bool ret = machine__read_build_ids(&session->host_machine, with_hits);
+
+       for (nd = rb_first(&session->machines); nd; nd = rb_next(nd)) {
+               struct machine *pos = rb_entry(nd, struct machine, rb_node);
+               ret |= machine__read_build_ids(pos, with_hits);
+       }
+
+       return ret;
+}
+
 static int write_trace_info(int fd, struct perf_header *h __used,
                            struct perf_evlist *evlist)
 {
@@ -202,6 +446,9 @@ static int write_build_id(int fd, struct perf_header *h,
 
        session = container_of(h, struct perf_session, header);
 
+       if (!perf_session__read_build_ids(session, true))
+               return -1;
+
        err = dsos__write_buildid_table(h, fd);
        if (err < 0) {
                pr_debug("failed to write buildid table\n");
@@ -1065,26 +1312,30 @@ struct feature_ops {
        bool full_only;
 };
 
-#define FEAT_OPA(n, w, p) \
-       [n] = { .name = #n, .write = w, .print = p }
-#define FEAT_OPF(n, w, p) \
-       [n] = { .name = #n, .write = w, .print = p, .full_only = true }
+#define FEAT_OPA(n, func) \
+       [n] = { .name = #n, .write = write_##func, .print = print_##func }
+#define FEAT_OPF(n, func) \
+       [n] = { .name = #n, .write = write_##func, .print = print_##func, .full_only = true }
+
+/* feature_ops not implemented: */
+#define print_trace_info               NULL
+#define print_build_id                 NULL
 
 static const struct feature_ops feat_ops[HEADER_LAST_FEATURE] = {
-       FEAT_OPA(HEADER_TRACE_INFO, write_trace_info, NULL),
-       FEAT_OPA(HEADER_BUILD_ID, write_build_id, NULL),
-       FEAT_OPA(HEADER_HOSTNAME, write_hostname, print_hostname),
-       FEAT_OPA(HEADER_OSRELEASE, write_osrelease, print_osrelease),
-       FEAT_OPA(HEADER_VERSION, write_version, print_version),
-       FEAT_OPA(HEADER_ARCH, write_arch, print_arch),
-       FEAT_OPA(HEADER_NRCPUS, write_nrcpus, print_nrcpus),
-       FEAT_OPA(HEADER_CPUDESC, write_cpudesc, print_cpudesc),
-       FEAT_OPA(HEADER_CPUID, write_cpuid, print_cpuid),
-       FEAT_OPA(HEADER_TOTAL_MEM, write_total_mem, print_total_mem),
-       FEAT_OPA(HEADER_EVENT_DESC, write_event_desc, print_event_desc),
-       FEAT_OPA(HEADER_CMDLINE, write_cmdline, print_cmdline),
-       FEAT_OPF(HEADER_CPU_TOPOLOGY, write_cpu_topology, print_cpu_topology),
-       FEAT_OPF(HEADER_NUMA_TOPOLOGY, write_numa_topology, print_numa_topology),
+       FEAT_OPA(HEADER_TRACE_INFO,     trace_info),
+       FEAT_OPA(HEADER_BUILD_ID,       build_id),
+       FEAT_OPA(HEADER_HOSTNAME,       hostname),
+       FEAT_OPA(HEADER_OSRELEASE,      osrelease),
+       FEAT_OPA(HEADER_VERSION,        version),
+       FEAT_OPA(HEADER_ARCH,           arch),
+       FEAT_OPA(HEADER_NRCPUS,         nrcpus),
+       FEAT_OPA(HEADER_CPUDESC,        cpudesc),
+       FEAT_OPA(HEADER_CPUID,          cpuid),
+       FEAT_OPA(HEADER_TOTAL_MEM,      total_mem),
+       FEAT_OPA(HEADER_EVENT_DESC,     event_desc),
+       FEAT_OPA(HEADER_CMDLINE,        cmdline),
+       FEAT_OPF(HEADER_CPU_TOPOLOGY,   cpu_topology),
+       FEAT_OPF(HEADER_NUMA_TOPOLOGY,  numa_topology),
 };
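
For clarity, one FEAT_OPA() entry from the table expands, via the # stringification and ## pasting in the macro above, into the same initializer the old form wrote out by hand:

	/* FEAT_OPA(HEADER_BUILD_ID, build_id) becomes: */
	[HEADER_BUILD_ID] = {
		.name  = "HEADER_BUILD_ID",
		.write = write_build_id,
		.print = print_build_id,	/* #defined to NULL above */
	},
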
 
 struct header_print_data {
@@ -1103,9 +1354,9 @@ static int perf_file_section__fprintf_info(struct perf_file_section *section,
                                "%d, continuing...\n", section->offset, feat);
                return 0;
        }
-       if (feat < HEADER_TRACE_INFO || feat >= HEADER_LAST_FEATURE) {
+       if (feat >= HEADER_LAST_FEATURE) {
                pr_warning("unknown feature %d\n", feat);
-               return -1;
+               return 0;
        }
        if (!feat_ops[feat].print)
                return 0;
@@ -1132,252 +1383,6 @@ int perf_header__fprintf_info(struct perf_session *session, FILE *fp, bool full)
        return 0;
 }
 
-#define dsos__for_each_with_build_id(pos, head)        \
-       list_for_each_entry(pos, head, node)    \
-               if (!pos->has_build_id)         \
-                       continue;               \
-               else
-
-static int __dsos__write_buildid_table(struct list_head *head, pid_t pid,
-                               u16 misc, int fd)
-{
-       struct dso *pos;
-
-       dsos__for_each_with_build_id(pos, head) {
-               int err;
-               struct build_id_event b;
-               size_t len;
-
-               if (!pos->hit)
-                       continue;
-               len = pos->long_name_len + 1;
-               len = ALIGN(len, NAME_ALIGN);
-               memset(&b, 0, sizeof(b));
-               memcpy(&b.build_id, pos->build_id, sizeof(pos->build_id));
-               b.pid = pid;
-               b.header.misc = misc;
-               b.header.size = sizeof(b) + len;
-               err = do_write(fd, &b, sizeof(b));
-               if (err < 0)
-                       return err;
-               err = write_padded(fd, pos->long_name,
-                                  pos->long_name_len + 1, len);
-               if (err < 0)
-                       return err;
-       }
-
-       return 0;
-}
-
-static int machine__write_buildid_table(struct machine *machine, int fd)
-{
-       int err;
-       u16 kmisc = PERF_RECORD_MISC_KERNEL,
-           umisc = PERF_RECORD_MISC_USER;
-
-       if (!machine__is_host(machine)) {
-               kmisc = PERF_RECORD_MISC_GUEST_KERNEL;
-               umisc = PERF_RECORD_MISC_GUEST_USER;
-       }
-
-       err = __dsos__write_buildid_table(&machine->kernel_dsos, machine->pid,
-                                         kmisc, fd);
-       if (err == 0)
-               err = __dsos__write_buildid_table(&machine->user_dsos,
-                                                 machine->pid, umisc, fd);
-       return err;
-}
-
-static int dsos__write_buildid_table(struct perf_header *header, int fd)
-{
-       struct perf_session *session = container_of(header,
-                       struct perf_session, header);
-       struct rb_node *nd;
-       int err = machine__write_buildid_table(&session->host_machine, fd);
-
-       if (err)
-               return err;
-
-       for (nd = rb_first(&session->machines); nd; nd = rb_next(nd)) {
-               struct machine *pos = rb_entry(nd, struct machine, rb_node);
-               err = machine__write_buildid_table(pos, fd);
-               if (err)
-                       break;
-       }
-       return err;
-}
-
-int build_id_cache__add_s(const char *sbuild_id, const char *debugdir,
-                         const char *name, bool is_kallsyms)
-{
-       const size_t size = PATH_MAX;
-       char *realname, *filename = zalloc(size),
-            *linkname = zalloc(size), *targetname;
-       int len, err = -1;
-
-       if (is_kallsyms) {
-               if (symbol_conf.kptr_restrict) {
-                       pr_debug("Not caching a kptr_restrict'ed /proc/kallsyms\n");
-                       return 0;
-               }
-               realname = (char *)name;
-       } else
-               realname = realpath(name, NULL);
-
-       if (realname == NULL || filename == NULL || linkname == NULL)
-               goto out_free;
-
-       len = snprintf(filename, size, "%s%s%s",
-                      debugdir, is_kallsyms ? "/" : "", realname);
-       if (mkdir_p(filename, 0755))
-               goto out_free;
-
-       snprintf(filename + len, sizeof(filename) - len, "/%s", sbuild_id);
-
-       if (access(filename, F_OK)) {
-               if (is_kallsyms) {
-                        if (copyfile("/proc/kallsyms", filename))
-                               goto out_free;
-               } else if (link(realname, filename) && copyfile(name, filename))
-                       goto out_free;
-       }
-
-       len = snprintf(linkname, size, "%s/.build-id/%.2s",
-                      debugdir, sbuild_id);
-
-       if (access(linkname, X_OK) && mkdir_p(linkname, 0755))
-               goto out_free;
-
-       snprintf(linkname + len, size - len, "/%s", sbuild_id + 2);
-       targetname = filename + strlen(debugdir) - 5;
-       memcpy(targetname, "../..", 5);
-
-       if (symlink(targetname, linkname) == 0)
-               err = 0;
-out_free:
-       if (!is_kallsyms)
-               free(realname);
-       free(filename);
-       free(linkname);
-       return err;
-}
-
-static int build_id_cache__add_b(const u8 *build_id, size_t build_id_size,
-                                const char *name, const char *debugdir,
-                                bool is_kallsyms)
-{
-       char sbuild_id[BUILD_ID_SIZE * 2 + 1];
-
-       build_id__sprintf(build_id, build_id_size, sbuild_id);
-
-       return build_id_cache__add_s(sbuild_id, debugdir, name, is_kallsyms);
-}
-
-int build_id_cache__remove_s(const char *sbuild_id, const char *debugdir)
-{
-       const size_t size = PATH_MAX;
-       char *filename = zalloc(size),
-            *linkname = zalloc(size);
-       int err = -1;
-
-       if (filename == NULL || linkname == NULL)
-               goto out_free;
-
-       snprintf(linkname, size, "%s/.build-id/%.2s/%s",
-                debugdir, sbuild_id, sbuild_id + 2);
-
-       if (access(linkname, F_OK))
-               goto out_free;
-
-       if (readlink(linkname, filename, size - 1) < 0)
-               goto out_free;
-
-       if (unlink(linkname))
-               goto out_free;
-
-       /*
-        * Since the link is relative, we must make it absolute:
-        */
-       snprintf(linkname, size, "%s/.build-id/%.2s/%s",
-                debugdir, sbuild_id, filename);
-
-       if (unlink(linkname))
-               goto out_free;
-
-       err = 0;
-out_free:
-       free(filename);
-       free(linkname);
-       return err;
-}
-
-static int dso__cache_build_id(struct dso *dso, const char *debugdir)
-{
-       bool is_kallsyms = dso->kernel && dso->long_name[0] != '/';
-
-       return build_id_cache__add_b(dso->build_id, sizeof(dso->build_id),
-                                    dso->long_name, debugdir, is_kallsyms);
-}
-
-static int __dsos__cache_build_ids(struct list_head *head, const char *debugdir)
-{
-       struct dso *pos;
-       int err = 0;
-
-       dsos__for_each_with_build_id(pos, head)
-               if (dso__cache_build_id(pos, debugdir))
-                       err = -1;
-
-       return err;
-}
-
-static int machine__cache_build_ids(struct machine *machine, const char *debugdir)
-{
-       int ret = __dsos__cache_build_ids(&machine->kernel_dsos, debugdir);
-       ret |= __dsos__cache_build_ids(&machine->user_dsos, debugdir);
-       return ret;
-}
-
-static int perf_session__cache_build_ids(struct perf_session *session)
-{
-       struct rb_node *nd;
-       int ret;
-       char debugdir[PATH_MAX];
-
-       snprintf(debugdir, sizeof(debugdir), "%s", buildid_dir);
-
-       if (mkdir(debugdir, 0755) != 0 && errno != EEXIST)
-               return -1;
-
-       ret = machine__cache_build_ids(&session->host_machine, debugdir);
-
-       for (nd = rb_first(&session->machines); nd; nd = rb_next(nd)) {
-               struct machine *pos = rb_entry(nd, struct machine, rb_node);
-               ret |= machine__cache_build_ids(pos, debugdir);
-       }
-       return ret ? -1 : 0;
-}
-
-static bool machine__read_build_ids(struct machine *machine, bool with_hits)
-{
-       bool ret = __dsos__read_build_ids(&machine->kernel_dsos, with_hits);
-       ret |= __dsos__read_build_ids(&machine->user_dsos, with_hits);
-       return ret;
-}
-
-static bool perf_session__read_build_ids(struct perf_session *session, bool with_hits)
-{
-       struct rb_node *nd;
-       bool ret = machine__read_build_ids(&session->host_machine, with_hits);
-
-       for (nd = rb_first(&session->machines); nd; nd = rb_next(nd)) {
-               struct machine *pos = rb_entry(nd, struct machine, rb_node);
-               ret |= machine__read_build_ids(pos, with_hits);
-       }
-
-       return ret;
-}
-
 static int do_write_feat(int fd, struct perf_header *h, int type,
                         struct perf_file_section **p,
                         struct perf_evlist *evlist)
@@ -1386,6 +1391,8 @@ static int do_write_feat(int fd, struct perf_header *h, int type,
        int ret = 0;
 
        if (perf_header__has_feat(h, type)) {
+               if (!feat_ops[type].write)
+                       return -1;
 
                (*p)->offset = lseek(fd, 0, SEEK_CUR);
 
@@ -1408,18 +1415,12 @@ static int perf_header__adds_write(struct perf_header *header,
                                   struct perf_evlist *evlist, int fd)
 {
        int nr_sections;
-       struct perf_session *session;
        struct perf_file_section *feat_sec, *p;
        int sec_size;
        u64 sec_start;
+       int feat;
        int err;
 
-       session = container_of(header, struct perf_session, header);
-
-       if (perf_header__has_feat(header, HEADER_BUILD_ID &&
-           !perf_session__read_build_ids(session, true)))
-               perf_header__clear_feat(header, HEADER_BUILD_ID);
-
        nr_sections = bitmap_weight(header->adds_features, HEADER_FEAT_BITS);
        if (!nr_sections)
                return 0;
@@ -1433,64 +1434,11 @@ static int perf_header__adds_write(struct perf_header *header,
        sec_start = header->data_offset + header->data_size;
        lseek(fd, sec_start + sec_size, SEEK_SET);
 
-       err = do_write_feat(fd, header, HEADER_TRACE_INFO, &p, evlist);
-       if (err)
-               goto out_free;
-
-       err = do_write_feat(fd, header, HEADER_BUILD_ID, &p, evlist);
-       if (err) {
-               perf_header__clear_feat(header, HEADER_BUILD_ID);
-               goto out_free;
+       for_each_set_bit(feat, header->adds_features, HEADER_FEAT_BITS) {
+               if (do_write_feat(fd, header, feat, &p, evlist))
+                       perf_header__clear_feat(header, feat);
        }
 
-       err = do_write_feat(fd, header, HEADER_HOSTNAME, &p, evlist);
-       if (err)
-               perf_header__clear_feat(header, HEADER_HOSTNAME);
-
-       err = do_write_feat(fd, header, HEADER_OSRELEASE, &p, evlist);
-       if (err)
-               perf_header__clear_feat(header, HEADER_OSRELEASE);
-
-       err = do_write_feat(fd, header, HEADER_VERSION, &p, evlist);
-       if (err)
-               perf_header__clear_feat(header, HEADER_VERSION);
-
-       err = do_write_feat(fd, header, HEADER_ARCH, &p, evlist);
-       if (err)
-               perf_header__clear_feat(header, HEADER_ARCH);
-
-       err = do_write_feat(fd, header, HEADER_NRCPUS, &p, evlist);
-       if (err)
-               perf_header__clear_feat(header, HEADER_NRCPUS);
-
-       err = do_write_feat(fd, header, HEADER_CPUDESC, &p, evlist);
-       if (err)
-               perf_header__clear_feat(header, HEADER_CPUDESC);
-
-       err = do_write_feat(fd, header, HEADER_CPUID, &p, evlist);
-       if (err)
-               perf_header__clear_feat(header, HEADER_CPUID);
-
-       err = do_write_feat(fd, header, HEADER_TOTAL_MEM, &p, evlist);
-       if (err)
-               perf_header__clear_feat(header, HEADER_TOTAL_MEM);
-
-       err = do_write_feat(fd, header, HEADER_CMDLINE, &p, evlist);
-       if (err)
-               perf_header__clear_feat(header, HEADER_CMDLINE);
-
-       err = do_write_feat(fd, header, HEADER_EVENT_DESC, &p, evlist);
-       if (err)
-               perf_header__clear_feat(header, HEADER_EVENT_DESC);
-
-       err = do_write_feat(fd, header, HEADER_CPU_TOPOLOGY, &p, evlist);
-       if (err)
-               perf_header__clear_feat(header, HEADER_CPU_TOPOLOGY);
-
-       err = do_write_feat(fd, header, HEADER_NUMA_TOPOLOGY, &p, evlist);
-       if (err)
-               perf_header__clear_feat(header, HEADER_NUMA_TOPOLOGY);
-
        lseek(fd, sec_start, SEEK_SET);
        /*
         * may write more than needed due to dropped feature, but
@@ -1499,7 +1447,6 @@ static int perf_header__adds_write(struct perf_header *header,
        err = do_write(fd, feat_sec, sec_size);
        if (err < 0)
                pr_debug("failed to write feature section\n");
-out_free:
        free(feat_sec);
        return err;
 }
@@ -1637,20 +1584,20 @@ static int perf_header__getbuffer64(struct perf_header *header,
 int perf_header__process_sections(struct perf_header *header, int fd,
                                  void *data,
                                  int (*process)(struct perf_file_section *section,
-                                 struct perf_header *ph,
-                                 int feat, int fd, void *data))
+                                                struct perf_header *ph,
+                                                int feat, int fd, void *data))
 {
-       struct perf_file_section *feat_sec;
+       struct perf_file_section *feat_sec, *sec;
        int nr_sections;
        int sec_size;
-       int idx = 0;
-       int err = -1, feat = 1;
+       int feat;
+       int err;
 
        nr_sections = bitmap_weight(header->adds_features, HEADER_FEAT_BITS);
        if (!nr_sections)
                return 0;
 
-       feat_sec = calloc(sizeof(*feat_sec), nr_sections);
+       feat_sec = sec = calloc(sizeof(*feat_sec), nr_sections);
        if (!feat_sec)
                return -1;
 
@@ -1658,20 +1605,16 @@ int perf_header__process_sections(struct perf_header *header, int fd,
 
        lseek(fd, header->data_offset + header->data_size, SEEK_SET);
 
-       if (perf_header__getbuffer64(header, fd, feat_sec, sec_size))
+       err = perf_header__getbuffer64(header, fd, feat_sec, sec_size);
+       if (err < 0)
                goto out_free;
 
-       err = 0;
-       while (idx < nr_sections && feat < HEADER_LAST_FEATURE) {
-               if (perf_header__has_feat(header, feat)) {
-                       struct perf_file_section *sec = &feat_sec[idx++];
-
-                       err = process(sec, header, feat, fd, data);
-                       if (err < 0)
-                               break;
-               }
-               ++feat;
+       for_each_set_bit(feat, header->adds_features, HEADER_LAST_FEATURE) {
+               err = process(sec++, header, feat, fd, data);
+               if (err < 0)
+                       goto out_free;
        }
+       err = 0;
 out_free:
        free(feat_sec);
        return err;
@@ -1906,32 +1849,21 @@ static int perf_file_section__process(struct perf_file_section *section,
                return 0;
        }
 
+       if (feat >= HEADER_LAST_FEATURE) {
+               pr_debug("unknown feature %d, continuing...\n", feat);
+               return 0;
+       }
+
        switch (feat) {
        case HEADER_TRACE_INFO:
                trace_report(fd, false);
                break;
-
        case HEADER_BUILD_ID:
                if (perf_header__read_build_ids(ph, fd, section->offset, section->size))
                        pr_debug("Failed to read buildids, continuing...\n");
                break;
-
-       case HEADER_HOSTNAME:
-       case HEADER_OSRELEASE:
-       case HEADER_VERSION:
-       case HEADER_ARCH:
-       case HEADER_NRCPUS:
-       case HEADER_CPUDESC:
-       case HEADER_CPUID:
-       case HEADER_TOTAL_MEM:
-       case HEADER_CMDLINE:
-       case HEADER_EVENT_DESC:
-       case HEADER_CPU_TOPOLOGY:
-       case HEADER_NUMA_TOPOLOGY:
-               break;
-
        default:
-               pr_debug("unknown feature %d, continuing...\n", feat);
+               break;
        }
 
        return 0;
@@ -2041,6 +1973,8 @@ int perf_session__read_header(struct perf_session *session, int fd)
                lseek(fd, tmp, SEEK_SET);
        }
 
+       symbol_conf.nr_events = nr_attrs;
+
        if (f_header.event_types.size) {
                lseek(fd, f_header.event_types.offset, SEEK_SET);
                events = malloc(f_header.event_types.size);
@@ -2068,9 +2002,9 @@ out_delete_evlist:
        return -ENOMEM;
 }
 
-int perf_event__synthesize_attr(struct perf_event_attr *attr, u16 ids, u64 *id,
-                               perf_event__handler_t process,
-                               struct perf_session *session)
+int perf_event__synthesize_attr(struct perf_tool *tool,
+                               struct perf_event_attr *attr, u16 ids, u64 *id,
+                               perf_event__handler_t process)
 {
        union perf_event *ev;
        size_t size;
@@ -2092,22 +2026,23 @@ int perf_event__synthesize_attr(struct perf_event_attr *attr, u16 ids, u64 *id,
        ev->attr.header.type = PERF_RECORD_HEADER_ATTR;
        ev->attr.header.size = size;
 
-       err = process(ev, NULL, session);
+       err = process(tool, ev, NULL, NULL);
 
        free(ev);
 
        return err;
 }
 
-int perf_session__synthesize_attrs(struct perf_session *session,
+int perf_event__synthesize_attrs(struct perf_tool *tool,
+                                  struct perf_session *session,
                                   perf_event__handler_t process)
 {
        struct perf_evsel *attr;
        int err = 0;
 
        list_for_each_entry(attr, &session->evlist->entries, node) {
-               err = perf_event__synthesize_attr(&attr->attr, attr->ids,
-                                                 attr->id, process, session);
+               err = perf_event__synthesize_attr(tool, &attr->attr, attr->ids,
+                                                 attr->id, process);
                if (err) {
                        pr_debug("failed to create perf header attribute\n");
                        return err;
@@ -2118,23 +2053,23 @@ int perf_session__synthesize_attrs(struct perf_session *session,
 }
 
 int perf_event__process_attr(union perf_event *event,
-                            struct perf_session *session)
+                            struct perf_evlist **pevlist)
 {
        unsigned int i, ids, n_ids;
        struct perf_evsel *evsel;
+       struct perf_evlist *evlist = *pevlist;
 
-       if (session->evlist == NULL) {
-               session->evlist = perf_evlist__new(NULL, NULL);
-               if (session->evlist == NULL)
+       if (evlist == NULL) {
+               *pevlist = evlist = perf_evlist__new(NULL, NULL);
+               if (evlist == NULL)
                        return -ENOMEM;
        }
 
-       evsel = perf_evsel__new(&event->attr.attr,
-                               session->evlist->nr_entries);
+       evsel = perf_evsel__new(&event->attr.attr, evlist->nr_entries);
        if (evsel == NULL)
                return -ENOMEM;
 
-       perf_evlist__add(session->evlist, evsel);
+       perf_evlist__add(evlist, evsel);
 
        ids = event->header.size;
        ids -= (void *)&event->attr.id - (void *)event;
@@ -2148,18 +2083,16 @@ int perf_event__process_attr(union perf_event *event,
                return -ENOMEM;
 
        for (i = 0; i < n_ids; i++) {
-               perf_evlist__id_add(session->evlist, evsel, 0, i,
-                                   event->attr.id[i]);
+               perf_evlist__id_add(evlist, evsel, 0, i, event->attr.id[i]);
        }
 
-       perf_session__update_sample_type(session);
-
        return 0;
 }
 
-int perf_event__synthesize_event_type(u64 event_id, char *name,
+int perf_event__synthesize_event_type(struct perf_tool *tool,
+                                     u64 event_id, char *name,
                                      perf_event__handler_t process,
-                                     struct perf_session *session)
+                                     struct machine *machine)
 {
        union perf_event ev;
        size_t size = 0;
@@ -2177,13 +2110,14 @@ int perf_event__synthesize_event_type(u64 event_id, char *name,
        ev.event_type.header.size = sizeof(ev.event_type) -
                (sizeof(ev.event_type.event_type.name) - size);
 
-       err = process(&ev, NULL, session);
+       err = process(tool, &ev, NULL, machine);
 
        return err;
 }
 
-int perf_event__synthesize_event_types(perf_event__handler_t process,
-                                      struct perf_session *session)
+int perf_event__synthesize_event_types(struct perf_tool *tool,
+                                      perf_event__handler_t process,
+                                      struct machine *machine)
 {
        struct perf_trace_event_type *type;
        int i, err = 0;
@@ -2191,9 +2125,9 @@ int perf_event__synthesize_event_types(perf_event__handler_t process,
        for (i = 0; i < event_count; i++) {
                type = &events[i];
 
-               err = perf_event__synthesize_event_type(type->event_id,
+               err = perf_event__synthesize_event_type(tool, type->event_id,
                                                        type->name, process,
-                                                       session);
+                                                       machine);
                if (err) {
                        pr_debug("failed to create perf header event type\n");
                        return err;
@@ -2203,8 +2137,8 @@ int perf_event__synthesize_event_types(perf_event__handler_t process,
        return err;
 }
 
-int perf_event__process_event_type(union perf_event *event,
-                                  struct perf_session *session __unused)
+int perf_event__process_event_type(struct perf_tool *tool __unused,
+                                  union perf_event *event)
 {
        if (perf_header__push_event(event->event_type.event_type.event_id,
                                    event->event_type.event_type.name) < 0)
@@ -2213,9 +2147,9 @@ int perf_event__process_event_type(union perf_event *event,
        return 0;
 }
 
-int perf_event__synthesize_tracing_data(int fd, struct perf_evlist *evlist,
-                                        perf_event__handler_t process,
-                                  struct perf_session *session __unused)
+int perf_event__synthesize_tracing_data(struct perf_tool *tool, int fd,
+                                       struct perf_evlist *evlist,
+                                       perf_event__handler_t process)
 {
        union perf_event ev;
        struct tracing_data *tdata;
@@ -2246,7 +2180,7 @@ int perf_event__synthesize_tracing_data(int fd, struct perf_evlist *evlist,
        ev.tracing_data.header.size = sizeof(ev.tracing_data);
        ev.tracing_data.size = aligned_size;
 
-       process(&ev, NULL, session);
+       process(tool, &ev, NULL, NULL);
 
        /*
         * The put function will copy all the tracing data
@@ -2288,10 +2222,10 @@ int perf_event__process_tracing_data(union perf_event *event,
        return size_read + padding;
 }
 
-int perf_event__synthesize_build_id(struct dso *pos, u16 misc,
+int perf_event__synthesize_build_id(struct perf_tool *tool,
+                                   struct dso *pos, u16 misc,
                                    perf_event__handler_t process,
-                                   struct machine *machine,
-                                   struct perf_session *session)
+                                   struct machine *machine)
 {
        union perf_event ev;
        size_t len;
@@ -2311,12 +2245,13 @@ int perf_event__synthesize_build_id(struct dso *pos, u16 misc,
        ev.build_id.header.size = sizeof(ev.build_id) + len;
        memcpy(&ev.build_id.filename, pos->long_name, pos->long_name_len);
 
-       err = process(&ev, NULL, session);
+       err = process(tool, &ev, NULL, machine);
 
        return err;
 }
 
-int perf_event__process_build_id(union perf_event *event,
+int perf_event__process_build_id(struct perf_tool *tool __used,
+                                union perf_event *event,
                                 struct perf_session *session)
 {
        __event_process_build_id(&event->build_id,
index 3d5a742..ac4ec95 100644 (file)
@@ -10,7 +10,8 @@
 #include <linux/bitmap.h>
 
 enum {
-       HEADER_TRACE_INFO = 1,
+       HEADER_RESERVED         = 0,    /* always cleared */
+       HEADER_TRACE_INFO       = 1,
        HEADER_BUILD_ID,
 
        HEADER_HOSTNAME,
@@ -27,10 +28,9 @@ enum {
        HEADER_NUMA_TOPOLOGY,
 
        HEADER_LAST_FEATURE,
+       HEADER_FEAT_BITS        = 256,
 };
 
-#define HEADER_FEAT_BITS                       256
-
 struct perf_file_section {
        u64 offset;
        u64 size;
@@ -68,6 +68,7 @@ struct perf_header {
 };
 
 struct perf_evlist;
+struct perf_session;
 
 int perf_session__read_header(struct perf_session *session, int fd);
 int perf_session__write_header(struct perf_session *session,
@@ -96,32 +97,36 @@ int build_id_cache__add_s(const char *sbuild_id, const char *debugdir,
                          const char *name, bool is_kallsyms);
 int build_id_cache__remove_s(const char *sbuild_id, const char *debugdir);
 
-int perf_event__synthesize_attr(struct perf_event_attr *attr, u16 ids, u64 *id,
-                               perf_event__handler_t process,
-                               struct perf_session *session);
-int perf_session__synthesize_attrs(struct perf_session *session,
-                                  perf_event__handler_t process);
-int perf_event__process_attr(union perf_event *event, struct perf_session *session);
+int perf_event__synthesize_attr(struct perf_tool *tool,
+                               struct perf_event_attr *attr, u16 ids, u64 *id,
+                               perf_event__handler_t process);
+int perf_event__synthesize_attrs(struct perf_tool *tool,
+                                struct perf_session *session,
+                                perf_event__handler_t process);
+int perf_event__process_attr(union perf_event *event, struct perf_evlist **pevlist);
 
-int perf_event__synthesize_event_type(u64 event_id, char *name,
+int perf_event__synthesize_event_type(struct perf_tool *tool,
+                                     u64 event_id, char *name,
                                      perf_event__handler_t process,
-                                     struct perf_session *session);
-int perf_event__synthesize_event_types(perf_event__handler_t process,
-                                      struct perf_session *session);
-int perf_event__process_event_type(union perf_event *event,
-                                  struct perf_session *session);
-
-int perf_event__synthesize_tracing_data(int fd, struct perf_evlist *evlist,
-                                       perf_event__handler_t process,
-                                       struct perf_session *session);
+                                     struct machine *machine);
+int perf_event__synthesize_event_types(struct perf_tool *tool,
+                                      perf_event__handler_t process,
+                                      struct machine *machine);
+int perf_event__process_event_type(struct perf_tool *tool,
+                                  union perf_event *event);
+
+int perf_event__synthesize_tracing_data(struct perf_tool *tool,
+                                       int fd, struct perf_evlist *evlist,
+                                       perf_event__handler_t process);
 int perf_event__process_tracing_data(union perf_event *event,
                                     struct perf_session *session);
 
-int perf_event__synthesize_build_id(struct dso *pos, u16 misc,
+int perf_event__synthesize_build_id(struct perf_tool *tool,
+                                   struct dso *pos, u16 misc,
                                    perf_event__handler_t process,
-                                   struct machine *machine,
-                                   struct perf_session *session);
-int perf_event__process_build_id(union perf_event *event,
+                                   struct machine *machine);
+int perf_event__process_build_id(struct perf_tool *tool,
+                                union perf_event *event,
                                 struct perf_session *session);
 
 /*
index 89289c8..ff6f9d5 100644 (file)
@@ -117,7 +117,6 @@ int perf_evlist__tui_browse_hists(struct perf_evlist *evlist __used,
 
 static inline int hist_entry__tui_annotate(struct hist_entry *self __used,
                                           int evidx __used,
-                                          int nr_events __used,
                                           void(*timer)(void *arg) __used,
                                           void *arg __used,
                                           int delay_secs __used)
@@ -128,7 +127,7 @@ static inline int hist_entry__tui_annotate(struct hist_entry *self __used,
 #define K_RIGHT -2
 #else
 #include "ui/keysyms.h"
-int hist_entry__tui_annotate(struct hist_entry *he, int evidx, int nr_events,
+int hist_entry__tui_annotate(struct hist_entry *he, int evidx,
                             void(*timer)(void *arg), void *arg, int delay_secs);
 
 int perf_evlist__tui_browse_hists(struct perf_evlist *evlist, const char *help,
index 305c848..62cdee7 100644 (file)
@@ -9,6 +9,17 @@
 #define BITS_PER_BYTE           8
 #define BITS_TO_LONGS(nr)       DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long))
 
+#define for_each_set_bit(bit, addr, size) \
+       for ((bit) = find_first_bit((addr), (size));            \
+            (bit) < (size);                                    \
+            (bit) = find_next_bit((addr), (size), (bit) + 1))
+
+/* same as for_each_set_bit() but use bit as value to start with */
+#define for_each_set_bit_cont(bit, addr, size) \
+       for ((bit) = find_next_bit((addr), (size), (bit));      \
+            (bit) < (size);                                    \
+            (bit) = find_next_bit((addr), (size), (bit) + 1))
+
 static inline void set_bit(int nr, unsigned long *addr)
 {
        addr[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
@@ -30,4 +41,111 @@ static inline unsigned long hweight_long(unsigned long w)
        return sizeof(w) == 4 ? hweight32(w) : hweight64(w);
 }
 
+#define BITOP_WORD(nr)         ((nr) / BITS_PER_LONG)
+
+/**
+ * __ffs - find first bit in word.
+ * @word: The word to search
+ *
+ * Undefined if no bit exists, so code should check against 0 first.
+ */
+static __always_inline unsigned long __ffs(unsigned long word)
+{
+       int num = 0;
+
+#if BITS_PER_LONG == 64
+       if ((word & 0xffffffff) == 0) {
+               num += 32;
+               word >>= 32;
+       }
+#endif
+       if ((word & 0xffff) == 0) {
+               num += 16;
+               word >>= 16;
+       }
+       if ((word & 0xff) == 0) {
+               num += 8;
+               word >>= 8;
+       }
+       if ((word & 0xf) == 0) {
+               num += 4;
+               word >>= 4;
+       }
+       if ((word & 0x3) == 0) {
+               num += 2;
+               word >>= 2;
+       }
+       if ((word & 0x1) == 0)
+               num += 1;
+       return num;
+}
+
+/*
+ * Find the first set bit in a memory region.
+ */
+static inline unsigned long
+find_first_bit(const unsigned long *addr, unsigned long size)
+{
+       const unsigned long *p = addr;
+       unsigned long result = 0;
+       unsigned long tmp;
+
+       while (size & ~(BITS_PER_LONG-1)) {
+               if ((tmp = *(p++)))
+                       goto found;
+               result += BITS_PER_LONG;
+               size -= BITS_PER_LONG;
+       }
+       if (!size)
+               return result;
+
+       tmp = (*p) & (~0UL >> (BITS_PER_LONG - size));
+       if (tmp == 0UL)         /* Are any bits set? */
+               return result + size;   /* Nope. */
+found:
+       return result + __ffs(tmp);
+}
+
+/*
+ * Find the next set bit in a memory region.
+ */
+static inline unsigned long
+find_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset)
+{
+       const unsigned long *p = addr + BITOP_WORD(offset);
+       unsigned long result = offset & ~(BITS_PER_LONG-1);
+       unsigned long tmp;
+
+       if (offset >= size)
+               return size;
+       size -= result;
+       offset %= BITS_PER_LONG;
+       if (offset) {
+               tmp = *(p++);
+               tmp &= (~0UL << offset);
+               if (size < BITS_PER_LONG)
+                       goto found_first;
+               if (tmp)
+                       goto found_middle;
+               size -= BITS_PER_LONG;
+               result += BITS_PER_LONG;
+       }
+       while (size & ~(BITS_PER_LONG-1)) {
+               if ((tmp = *(p++)))
+                       goto found_middle;
+               result += BITS_PER_LONG;
+               size -= BITS_PER_LONG;
+       }
+       if (!size)
+               return result;
+       tmp = *p;
+
+found_first:
+       tmp &= (~0UL >> (BITS_PER_LONG - size));
+       if (tmp == 0UL)         /* Are any bits set? */
+               return result + size;   /* Nope. */
+found_middle:
+       return result + __ffs(tmp);
+}
+
 #endif
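
A minimal standalone example of the new iterator; the bit positions are arbitrary and the printf (assumes <stdio.h>) is only there to show the iteration order:

	unsigned long bits[BITS_TO_LONGS(64)] = { 0 };
	int bit;

	set_bit(3, bits);
	set_bit(42, bits);

	for_each_set_bit(bit, bits, 64)
		printf("bit %d set\n", bit);	/* visits 3, then 42 */
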
index 78284b1..316aa0a 100644 (file)
@@ -562,6 +562,10 @@ int machine__init(struct machine *self, const char *root_dir, pid_t pid)
        INIT_LIST_HEAD(&self->user_dsos);
        INIT_LIST_HEAD(&self->kernel_dsos);
 
+       self->threads = RB_ROOT;
+       INIT_LIST_HEAD(&self->dead_threads);
+       self->last_match = NULL;
+
        self->kmaps.machine = self;
        self->pid           = pid;
        self->root_dir      = strdup(root_dir);
index 890d855..2b8017f 100644 (file)
@@ -18,9 +18,11 @@ enum map_type {
 extern const char *map_type__name[MAP__NR_TYPES];
 
 struct dso;
+struct ip_callchain;
 struct ref_reloc_sym;
 struct map_groups;
 struct machine;
+struct perf_evsel;
 
 struct map {
        union {
@@ -61,7 +63,11 @@ struct map_groups {
 struct machine {
        struct rb_node    rb_node;
        pid_t             pid;
+       u16               id_hdr_size;
        char              *root_dir;
+       struct rb_root    threads;
+       struct list_head  dead_threads;
+       struct thread     *last_match;
        struct list_head  user_dsos;
        struct list_head  kernel_dsos;
        struct map_groups kmaps;
@@ -148,6 +154,13 @@ int machine__init(struct machine *self, const char *root_dir, pid_t pid);
 void machine__exit(struct machine *self);
 void machine__delete(struct machine *self);
 
+int machine__resolve_callchain(struct machine *machine,
+                              struct perf_evsel *evsel, struct thread *thread,
+                              struct ip_callchain *chain,
+                              struct symbol **parent);
+int maps__set_kallsyms_ref_reloc_sym(struct map **maps, const char *symbol_name,
+                                    u64 addr);
+
 /*
  * Default guest kernel is defined by parameter --guestkallsyms
  * and --guestmodules
@@ -190,6 +203,12 @@ struct symbol *map_groups__find_symbol_by_name(struct map_groups *mg,
                                               struct map **mapp,
                                               symbol_filter_t filter);
 
+
+struct thread *machine__findnew_thread(struct machine *machine, pid_t pid);
+void machine__remove_thread(struct machine *machine, struct thread *th);
+
+size_t machine__fprintf(struct machine *machine, FILE *fp);
+
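+
A rough sketch of per-sample resolution now that the thread tree lives inside struct machine; machine, evsel, sample and parent (a struct symbol *) are assumed to come from the surrounding event-processing code:

	struct thread *thread = machine__findnew_thread(machine, sample->pid);

	if (thread == NULL)
		return -1;

	if (sample->callchain)
		machine__resolve_callchain(machine, evsel, thread,
					   sample->callchain, &parent);
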
 static inline
 struct symbol *machine__find_kernel_symbol(struct machine *self,
                                           enum map_type type, u64 addr,
index 928918b..531c283 100644 (file)
@@ -25,8 +25,6 @@ enum event_result {
        EVT_HANDLED_ALL
 };
 
-char debugfs_path[MAXPATHLEN];
-
 #define CHW(x) .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_##x
 #define CSW(x) .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_##x
 
@@ -40,6 +38,7 @@ static struct event_symbol event_symbols[] = {
   { CHW(BRANCH_INSTRUCTIONS),          "branch-instructions",          "branches"              },
   { CHW(BRANCH_MISSES),                        "branch-misses",                ""                      },
   { CHW(BUS_CYCLES),                   "bus-cycles",                   ""                      },
+  { CHW(REF_CPU_CYCLES),               "ref-cycles",                   ""                      },
 
   { CSW(CPU_CLOCK),                    "cpu-clock",                    ""                      },
   { CSW(TASK_CLOCK),                   "task-clock",                   ""                      },
@@ -70,6 +69,7 @@ static const char *hw_event_names[PERF_COUNT_HW_MAX] = {
        "bus-cycles",
        "stalled-cycles-frontend",
        "stalled-cycles-backend",
+       "ref-cycles",
 };
 
 static const char *sw_event_names[PERF_COUNT_SW_MAX] = {
@@ -140,7 +140,7 @@ static int tp_event_has_id(struct dirent *sys_dir, struct dirent *evt_dir)
        char evt_path[MAXPATHLEN];
        int fd;
 
-       snprintf(evt_path, MAXPATHLEN, "%s/%s/%s/id", debugfs_path,
+       snprintf(evt_path, MAXPATHLEN, "%s/%s/%s/id", tracing_events_path,
                        sys_dir->d_name, evt_dir->d_name);
        fd = open(evt_path, O_RDONLY);
        if (fd < 0)
@@ -171,16 +171,16 @@ struct tracepoint_path *tracepoint_id_to_path(u64 config)
        char evt_path[MAXPATHLEN];
        char dir_path[MAXPATHLEN];
 
-       if (debugfs_valid_mountpoint(debugfs_path))
+       if (debugfs_valid_mountpoint(tracing_events_path))
                return NULL;
 
-       sys_dir = opendir(debugfs_path);
+       sys_dir = opendir(tracing_events_path);
        if (!sys_dir)
                return NULL;
 
        for_each_subsystem(sys_dir, sys_dirent, sys_next) {
 
-               snprintf(dir_path, MAXPATHLEN, "%s/%s", debugfs_path,
+               snprintf(dir_path, MAXPATHLEN, "%s/%s", tracing_events_path,
                         sys_dirent.d_name);
                evt_dir = opendir(dir_path);
                if (!evt_dir)
@@ -447,7 +447,7 @@ parse_single_tracepoint_event(char *sys_name,
        u64 id;
        int fd;
 
-       snprintf(evt_path, MAXPATHLEN, "%s/%s/%s/id", debugfs_path,
+       snprintf(evt_path, MAXPATHLEN, "%s/%s/%s/id", tracing_events_path,
                 sys_name, evt_name);
 
        fd = open(evt_path, O_RDONLY);
@@ -485,7 +485,7 @@ parse_multiple_tracepoint_event(struct perf_evlist *evlist, char *sys_name,
        struct dirent *evt_ent;
        DIR *evt_dir;
 
-       snprintf(evt_path, MAXPATHLEN, "%s/%s", debugfs_path, sys_name);
+       snprintf(evt_path, MAXPATHLEN, "%s/%s", tracing_events_path, sys_name);
        evt_dir = opendir(evt_path);
 
        if (!evt_dir) {
@@ -528,7 +528,7 @@ parse_tracepoint_event(struct perf_evlist *evlist, const char **strp,
        char sys_name[MAX_EVENT_LENGTH];
        unsigned int sys_length, evt_length;
 
-       if (debugfs_valid_mountpoint(debugfs_path))
+       if (debugfs_valid_mountpoint(tracing_events_path))
                return 0;
 
        evt_name = strchr(*strp, ':');
@@ -920,10 +920,10 @@ void print_tracepoint_events(const char *subsys_glob, const char *event_glob)
        char evt_path[MAXPATHLEN];
        char dir_path[MAXPATHLEN];
 
-       if (debugfs_valid_mountpoint(debugfs_path))
+       if (debugfs_valid_mountpoint(tracing_events_path))
                return;
 
-       sys_dir = opendir(debugfs_path);
+       sys_dir = opendir(tracing_events_path);
        if (!sys_dir)
                return;
 
@@ -932,7 +932,7 @@ void print_tracepoint_events(const char *subsys_glob, const char *event_glob)
                    !strglobmatch(sys_dirent.d_name, subsys_glob))
                        continue;
 
-               snprintf(dir_path, MAXPATHLEN, "%s/%s", debugfs_path,
+               snprintf(dir_path, MAXPATHLEN, "%s/%s", tracing_events_path,
                         sys_dirent.d_name);
                evt_dir = opendir(dir_path);
                if (!evt_dir)
@@ -964,16 +964,16 @@ int is_valid_tracepoint(const char *event_string)
        char evt_path[MAXPATHLEN];
        char dir_path[MAXPATHLEN];
 
-       if (debugfs_valid_mountpoint(debugfs_path))
+       if (debugfs_valid_mountpoint(tracing_events_path))
                return 0;
 
-       sys_dir = opendir(debugfs_path);
+       sys_dir = opendir(tracing_events_path);
        if (!sys_dir)
                return 0;
 
        for_each_subsystem(sys_dir, sys_dirent, sys_next) {
 
-               snprintf(dir_path, MAXPATHLEN, "%s/%s", debugfs_path,
+               snprintf(dir_path, MAXPATHLEN, "%s/%s", tracing_events_path,
                         sys_dirent.d_name);
                evt_dir = opendir(dir_path);
                if (!evt_dir)
index 2f8e375..7e0cbe7 100644 (file)
@@ -39,7 +39,6 @@ void print_tracepoint_events(const char *subsys_glob, const char *event_glob);
 int print_hwcache_events(const char *event_glob);
 extern int is_valid_tracepoint(const char *event_string);
 
-extern char debugfs_path[];
 extern int valid_debugfs_mount(const char *debugfs);
 
 #endif /* __PERF_PARSE_EVENTS_H */
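
The parse-events changes above drop the debugfs_path global in favour of tracing_events_path, so tracepoint ids are looked up under <debugfs>/tracing/events/<sys>/<event>/id. A small standalone sketch of that lookup, taking the events directory as a parameter instead of reading a global:

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/param.h>

/* Read the tracepoint id for sys:event below a given events directory. */
static long long ex_read_tracepoint_id(const char *events_path,
                                       const char *sys, const char *event)
{
        char path[MAXPATHLEN], buf[32];
        ssize_t n;
        int fd;

        snprintf(path, sizeof(path), "%s/%s/%s/id", events_path, sys, event);

        fd = open(path, O_RDONLY);
        if (fd < 0)
                return -1;

        n = read(fd, buf, sizeof(buf) - 1);
        close(fd);
        if (n <= 0)
                return -1;

        buf[n] = '\0';
        return atoll(buf);
}

int main(void)
{
        /* Typical mount point; adjust if debugfs lives elsewhere. */
        long long id = ex_read_tracepoint_id("/sys/kernel/debug/tracing/events",
                                             "sched", "sched_switch");

        printf("sched:sched_switch id = %lld\n", id);
        return 0;
}
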
index 1132c8f..17e94d0 100644 (file)
@@ -5,7 +5,6 @@
 #include "util.h"
 #include "probe-event.h"
 
-#define MAX_PATH_LEN            256
 #define MAX_PROBE_BUFFER       1024
 #define MAX_PROBES              128
 
index 74350ff..e30749e 100644 (file)
 
 #include "../../perf.h"
 #include "../util.h"
+#include "../thread.h"
+#include "../event.h"
 #include "../trace-event.h"
+#include "../evsel.h"
 
 #include <EXTERN.h>
 #include <perl.h>
@@ -245,11 +248,11 @@ static inline struct event *find_cache_event(int type)
        return event;
 }
 
-static void perl_process_event(union perf_event *pevent __unused,
-                              struct perf_sample *sample,
-                              struct perf_evsel *evsel,
-                              struct perf_session *session __unused,
-                              struct thread *thread)
+static void perl_process_tracepoint(union perf_event *pevent __unused,
+                                   struct perf_sample *sample,
+                                   struct perf_evsel *evsel,
+                                   struct machine *machine __unused,
+                                   struct thread *thread)
 {
        struct format_field *field;
        static char handler[256];
@@ -265,6 +268,9 @@ static void perl_process_event(union perf_event *pevent __unused,
 
        dSP;
 
+       if (evsel->attr.type != PERF_TYPE_TRACEPOINT)
+               return;
+
        type = trace_parse_common_type(data);
 
        event = find_cache_event(type);
@@ -332,6 +338,42 @@ static void perl_process_event(union perf_event *pevent __unused,
        LEAVE;
 }
 
+static void perl_process_event_generic(union perf_event *pevent __unused,
+                                      struct perf_sample *sample,
+                                      struct perf_evsel *evsel __unused,
+                                      struct machine *machine __unused,
+                                      struct thread *thread __unused)
+{
+       dSP;
+
+       if (!get_cv("process_event", 0))
+               return;
+
+       ENTER;
+       SAVETMPS;
+       PUSHMARK(SP);
+       XPUSHs(sv_2mortal(newSVpvn((const char *)pevent, pevent->header.size)));
+       XPUSHs(sv_2mortal(newSVpvn((const char *)&evsel->attr, sizeof(evsel->attr))));
+       XPUSHs(sv_2mortal(newSVpvn((const char *)sample, sizeof(*sample))));
+       XPUSHs(sv_2mortal(newSVpvn((const char *)sample->raw_data, sample->raw_size)));
+       PUTBACK;
+       call_pv("process_event", G_SCALAR);
+       SPAGAIN;
+       PUTBACK;
+       FREETMPS;
+       LEAVE;
+}
+
+static void perl_process_event(union perf_event *pevent,
+                              struct perf_sample *sample,
+                              struct perf_evsel *evsel,
+                              struct machine *machine,
+                              struct thread *thread)
+{
+       perl_process_tracepoint(pevent, sample, evsel, machine, thread);
+       perl_process_event_generic(pevent, sample, evsel, machine, thread);
+}
+
 static void run_start_sub(void)
 {
        dSP; /* access to Perl stack */
@@ -553,7 +595,28 @@ static int perl_generate_script(const char *outfile)
        fprintf(ofp, "sub print_header\n{\n"
                "\tmy ($event_name, $cpu, $secs, $nsecs, $pid, $comm) = @_;\n\n"
                "\tprintf(\"%%-20s %%5u %%05u.%%09u %%8u %%-20s \",\n\t       "
-               "$event_name, $cpu, $secs, $nsecs, $pid, $comm);\n}");
+               "$event_name, $cpu, $secs, $nsecs, $pid, $comm);\n}\n");
+
+       fprintf(ofp,
+               "\n# Packed byte string args of process_event():\n"
+               "#\n"
+               "# $event:\tunion perf_event\tutil/event.h\n"
+               "# $attr:\tstruct perf_event_attr\tlinux/perf_event.h\n"
+               "# $sample:\tstruct perf_sample\tutil/event.h\n"
+               "# $raw_data:\tperf_sample->raw_data\tutil/event.h\n"
+               "\n"
+               "sub process_event\n"
+               "{\n"
+               "\tmy ($event, $attr, $sample, $raw_data) = @_;\n"
+               "\n"
+               "\tmy @event\t= unpack(\"LSS\", $event);\n"
+               "\tmy @attr\t= unpack(\"LLQQQQQLLQQ\", $attr);\n"
+               "\tmy @sample\t= unpack(\"QLLQQQQQLL\", $sample);\n"
+               "\tmy @raw_data\t= unpack(\"C*\", $raw_data);\n"
+               "\n"
+               "\tuse Data::Dumper;\n"
+               "\tprint Dumper \\@event, \\@attr, \\@sample, \\@raw_data;\n"
+               "}\n");
 
        fclose(ofp);
 
index 6ccf70e..0b2a487 100644 (file)
@@ -29,6 +29,8 @@
 
 #include "../../perf.h"
 #include "../util.h"
+#include "../event.h"
+#include "../thread.h"
 #include "../trace-event.h"
 
 PyMODINIT_FUNC initperf_trace_context(void);
@@ -207,7 +209,7 @@ static inline struct event *find_cache_event(int type)
 static void python_process_event(union perf_event *pevent __unused,
                                 struct perf_sample *sample,
                                 struct perf_evsel *evsel __unused,
-                                struct perf_session *session __unused,
+                                struct machine *machine __unused,
                                 struct thread *thread)
 {
        PyObject *handler, *retval, *context, *t, *obj, *dict = NULL;
index 0f4555c..b5ca255 100644 (file)
@@ -10,6 +10,7 @@
 #include "evlist.h"
 #include "evsel.h"
 #include "session.h"
+#include "tool.h"
 #include "sort.h"
 #include "util.h"
 #include "cpumap.h"
@@ -78,39 +79,13 @@ out_close:
        return -1;
 }
 
-static void perf_session__id_header_size(struct perf_session *session)
-{
-       struct perf_sample *data;
-       u64 sample_type = session->sample_type;
-       u16 size = 0;
-
-       if (!session->sample_id_all)
-               goto out;
-
-       if (sample_type & PERF_SAMPLE_TID)
-               size += sizeof(data->tid) * 2;
-
-       if (sample_type & PERF_SAMPLE_TIME)
-               size += sizeof(data->time);
-
-       if (sample_type & PERF_SAMPLE_ID)
-               size += sizeof(data->id);
-
-       if (sample_type & PERF_SAMPLE_STREAM_ID)
-               size += sizeof(data->stream_id);
-
-       if (sample_type & PERF_SAMPLE_CPU)
-               size += sizeof(data->cpu) * 2;
-out:
-       session->id_hdr_size = size;
-}
-
 void perf_session__update_sample_type(struct perf_session *self)
 {
        self->sample_type = perf_evlist__sample_type(self->evlist);
        self->sample_size = __perf_evsel__sample_size(self->sample_type);
        self->sample_id_all = perf_evlist__sample_id_all(self->evlist);
-       perf_session__id_header_size(self);
+       self->id_hdr_size = perf_evlist__id_hdr_size(self->evlist);
+       self->host_machine.id_hdr_size = self->id_hdr_size;
 }
 
 int perf_session__create_kernel_maps(struct perf_session *self)
@@ -130,18 +105,26 @@ static void perf_session__destroy_kernel_maps(struct perf_session *self)
 
 struct perf_session *perf_session__new(const char *filename, int mode,
                                       bool force, bool repipe,
-                                      struct perf_event_ops *ops)
+                                      struct perf_tool *tool)
 {
-       size_t len = filename ? strlen(filename) + 1 : 0;
-       struct perf_session *self = zalloc(sizeof(*self) + len);
+       struct perf_session *self;
+       struct stat st;
+       size_t len;
+
+       if (!filename || !strlen(filename)) {
+               if (!fstat(STDIN_FILENO, &st) && S_ISFIFO(st.st_mode))
+                       filename = "-";
+               else
+                       filename = "perf.data";
+       }
+
+       len = strlen(filename);
+       self = zalloc(sizeof(*self) + len);
 
        if (self == NULL)
                goto out;
 
        memcpy(self->filename, filename, len);
-       self->threads = RB_ROOT;
-       INIT_LIST_HEAD(&self->dead_threads);
-       self->last_match = NULL;
        /*
         * On 64bit we can mmap the data file in one go. No need for tiny mmap
         * slices. On 32bit we use 32MB.
@@ -171,10 +154,10 @@ struct perf_session *perf_session__new(const char *filename, int mode,
                        goto out_delete;
        }
 
-       if (ops && ops->ordering_requires_timestamps &&
-           ops->ordered_samples && !self->sample_id_all) {
+       if (tool && tool->ordering_requires_timestamps &&
+           tool->ordered_samples && !self->sample_id_all) {
                dump_printf("WARNING: No sample_id_all support, falling back to unordered processing\n");
-               ops->ordered_samples = false;
+               tool->ordered_samples = false;
        }
 
 out:
@@ -184,17 +167,22 @@ out_delete:
        return NULL;
 }
 
-static void perf_session__delete_dead_threads(struct perf_session *self)
+static void machine__delete_dead_threads(struct machine *machine)
 {
        struct thread *n, *t;
 
-       list_for_each_entry_safe(t, n, &self->dead_threads, node) {
+       list_for_each_entry_safe(t, n, &machine->dead_threads, node) {
                list_del(&t->node);
                thread__delete(t);
        }
 }
 
-static void perf_session__delete_threads(struct perf_session *self)
+static void perf_session__delete_dead_threads(struct perf_session *session)
+{
+       machine__delete_dead_threads(&session->host_machine);
+}
+
+static void machine__delete_threads(struct machine *self)
 {
        struct rb_node *nd = rb_first(&self->threads);
 
@@ -207,6 +195,11 @@ static void perf_session__delete_threads(struct perf_session *self)
        }
 }
 
+static void perf_session__delete_threads(struct perf_session *session)
+{
+       machine__delete_threads(&session->host_machine);
+}
+
 void perf_session__delete(struct perf_session *self)
 {
        perf_session__destroy_kernel_maps(self);
@@ -217,7 +210,7 @@ void perf_session__delete(struct perf_session *self)
        free(self);
 }
 
-void perf_session__remove_thread(struct perf_session *self, struct thread *th)
+void machine__remove_thread(struct machine *self, struct thread *th)
 {
        self->last_match = NULL;
        rb_erase(&th->rb_node, &self->threads);
@@ -236,16 +229,16 @@ static bool symbol__match_parent_regex(struct symbol *sym)
        return 0;
 }
 
-int perf_session__resolve_callchain(struct perf_session *self,
-                                   struct thread *thread,
-                                   struct ip_callchain *chain,
-                                   struct symbol **parent)
+int machine__resolve_callchain(struct machine *self, struct perf_evsel *evsel,
+                              struct thread *thread,
+                              struct ip_callchain *chain,
+                              struct symbol **parent)
 {
        u8 cpumode = PERF_RECORD_MISC_USER;
        unsigned int i;
        int err;
 
-       callchain_cursor_reset(&self->callchain_cursor);
+       callchain_cursor_reset(&evsel->hists.callchain_cursor);
 
        for (i = 0; i < chain->nr; i++) {
                u64 ip;
@@ -272,7 +265,7 @@ int perf_session__resolve_callchain(struct perf_session *self,
 
                al.filtered = false;
                thread__find_addr_location(thread, self, cpumode,
-                               MAP__FUNCTION, thread->pid, ip, &al, NULL);
+                                          MAP__FUNCTION, ip, &al, NULL);
                if (al.sym != NULL) {
                        if (sort__has_parent && !*parent &&
                            symbol__match_parent_regex(al.sym))
@@ -281,7 +274,7 @@ int perf_session__resolve_callchain(struct perf_session *self,
                                break;
                }
 
-               err = callchain_cursor_append(&self->callchain_cursor,
+               err = callchain_cursor_append(&evsel->hists.callchain_cursor,
                                              ip, al.map, al.sym);
                if (err)
                        return err;
@@ -290,75 +283,91 @@ int perf_session__resolve_callchain(struct perf_session *self,
        return 0;
 }
 
-static int process_event_synth_stub(union perf_event *event __used,
-                                   struct perf_session *session __used)
+static int process_event_synth_tracing_data_stub(union perf_event *event __used,
+                                                struct perf_session *session __used)
+{
+       dump_printf(": unhandled!\n");
+       return 0;
+}
+
+static int process_event_synth_attr_stub(union perf_event *event __used,
+                                        struct perf_evlist **pevlist __used)
 {
        dump_printf(": unhandled!\n");
        return 0;
 }
 
-static int process_event_sample_stub(union perf_event *event __used,
+static int process_event_sample_stub(struct perf_tool *tool __used,
+                                    union perf_event *event __used,
                                     struct perf_sample *sample __used,
                                     struct perf_evsel *evsel __used,
-                                    struct perf_session *session __used)
+                                    struct machine *machine __used)
 {
        dump_printf(": unhandled!\n");
        return 0;
 }
 
-static int process_event_stub(union perf_event *event __used,
+static int process_event_stub(struct perf_tool *tool __used,
+                             union perf_event *event __used,
                              struct perf_sample *sample __used,
-                             struct perf_session *session __used)
+                             struct machine *machine __used)
 {
        dump_printf(": unhandled!\n");
        return 0;
 }
 
-static int process_finished_round_stub(union perf_event *event __used,
-                                      struct perf_session *session __used,
-                                      struct perf_event_ops *ops __used)
+static int process_finished_round_stub(struct perf_tool *tool __used,
+                                      union perf_event *event __used,
+                                      struct perf_session *perf_session __used)
 {
        dump_printf(": unhandled!\n");
        return 0;
 }
 
-static int process_finished_round(union perf_event *event,
-                                 struct perf_session *session,
-                                 struct perf_event_ops *ops);
+static int process_event_type_stub(struct perf_tool *tool __used,
+                                  union perf_event *event __used)
+{
+       dump_printf(": unhandled!\n");
+       return 0;
+}
 
-static void perf_event_ops__fill_defaults(struct perf_event_ops *handler)
+static int process_finished_round(struct perf_tool *tool,
+                                 union perf_event *event,
+                                 struct perf_session *session);
+
+static void perf_tool__fill_defaults(struct perf_tool *tool)
 {
-       if (handler->sample == NULL)
-               handler->sample = process_event_sample_stub;
-       if (handler->mmap == NULL)
-               handler->mmap = process_event_stub;
-       if (handler->comm == NULL)
-               handler->comm = process_event_stub;
-       if (handler->fork == NULL)
-               handler->fork = process_event_stub;
-       if (handler->exit == NULL)
-               handler->exit = process_event_stub;
-       if (handler->lost == NULL)
-               handler->lost = perf_event__process_lost;
-       if (handler->read == NULL)
-               handler->read = process_event_stub;
-       if (handler->throttle == NULL)
-               handler->throttle = process_event_stub;
-       if (handler->unthrottle == NULL)
-               handler->unthrottle = process_event_stub;
-       if (handler->attr == NULL)
-               handler->attr = process_event_synth_stub;
-       if (handler->event_type == NULL)
-               handler->event_type = process_event_synth_stub;
-       if (handler->tracing_data == NULL)
-               handler->tracing_data = process_event_synth_stub;
-       if (handler->build_id == NULL)
-               handler->build_id = process_event_synth_stub;
-       if (handler->finished_round == NULL) {
-               if (handler->ordered_samples)
-                       handler->finished_round = process_finished_round;
+       if (tool->sample == NULL)
+               tool->sample = process_event_sample_stub;
+       if (tool->mmap == NULL)
+               tool->mmap = process_event_stub;
+       if (tool->comm == NULL)
+               tool->comm = process_event_stub;
+       if (tool->fork == NULL)
+               tool->fork = process_event_stub;
+       if (tool->exit == NULL)
+               tool->exit = process_event_stub;
+       if (tool->lost == NULL)
+               tool->lost = perf_event__process_lost;
+       if (tool->read == NULL)
+               tool->read = process_event_sample_stub;
+       if (tool->throttle == NULL)
+               tool->throttle = process_event_stub;
+       if (tool->unthrottle == NULL)
+               tool->unthrottle = process_event_stub;
+       if (tool->attr == NULL)
+               tool->attr = process_event_synth_attr_stub;
+       if (tool->event_type == NULL)
+               tool->event_type = process_event_type_stub;
+       if (tool->tracing_data == NULL)
+               tool->tracing_data = process_event_synth_tracing_data_stub;
+       if (tool->build_id == NULL)
+               tool->build_id = process_finished_round_stub;
+       if (tool->finished_round == NULL) {
+               if (tool->ordered_samples)
+                       tool->finished_round = process_finished_round;
                else
-                       handler->finished_round = process_finished_round_stub;
+                       tool->finished_round = process_finished_round_stub;
        }
 }
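
perf_tool__fill_defaults() above follows the usual optional-callback-table idiom: every hook the tool left NULL is pointed at a stub, so the dispatch path never needs NULL checks. A reduced sketch of the idiom with hypothetical names:

#include <stdio.h>

struct ex_tool {
        int (*sample)(struct ex_tool *tool, int cpu);
        int (*lost)(struct ex_tool *tool, int cpu);
};

static int ex_stub(struct ex_tool *tool, int cpu)
{
        (void)tool; (void)cpu;
        printf(": unhandled!\n");
        return 0;
}

static void ex_tool__fill_defaults(struct ex_tool *tool)
{
        if (tool->sample == NULL)
                tool->sample = ex_stub;
        if (tool->lost == NULL)
                tool->lost = ex_stub;
}

int main(void)
{
        struct ex_tool tool = { .sample = NULL, .lost = NULL };

        ex_tool__fill_defaults(&tool);
        /* Callers can now invoke hooks unconditionally. */
        return tool.sample(&tool, 0);
}
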
 
@@ -490,11 +499,11 @@ static void perf_session_free_sample_buffers(struct perf_session *session)
 static int perf_session_deliver_event(struct perf_session *session,
                                      union perf_event *event,
                                      struct perf_sample *sample,
-                                     struct perf_event_ops *ops,
+                                     struct perf_tool *tool,
                                      u64 file_offset);
 
 static void flush_sample_queue(struct perf_session *s,
-                              struct perf_event_ops *ops)
+                              struct perf_tool *tool)
 {
        struct ordered_samples *os = &s->ordered_samples;
        struct list_head *head = &os->samples;
@@ -505,7 +514,7 @@ static void flush_sample_queue(struct perf_session *s,
        unsigned idx = 0, progress_next = os->nr_samples / 16;
        int ret;
 
-       if (!ops->ordered_samples || !limit)
+       if (!tool->ordered_samples || !limit)
                return;
 
        list_for_each_entry_safe(iter, tmp, head, list) {
@@ -516,7 +525,7 @@ static void flush_sample_queue(struct perf_session *s,
                if (ret)
                        pr_err("Can't parse sample, err = %d\n", ret);
                else
-                       perf_session_deliver_event(s, iter->event, &sample, ops,
+                       perf_session_deliver_event(s, iter->event, &sample, tool,
                                                   iter->file_offset);
 
                os->last_flush = iter->timestamp;
@@ -578,11 +587,11 @@ static void flush_sample_queue(struct perf_session *s,
  *      Flush every events below timestamp 7
  *      etc...
  */
-static int process_finished_round(union perf_event *event __used,
-                                 struct perf_session *session,
-                                 struct perf_event_ops *ops)
+static int process_finished_round(struct perf_tool *tool,
+                                 union perf_event *event __used,
+                                 struct perf_session *session)
 {
-       flush_sample_queue(session, ops);
+       flush_sample_queue(session, tool);
        session->ordered_samples.next_flush = session->ordered_samples.max_timestamp;
 
        return 0;
@@ -737,13 +746,26 @@ static void dump_sample(struct perf_session *session, union perf_event *event,
                callchain__printf(sample);
 }
 
+static struct machine *
+       perf_session__find_machine_for_cpumode(struct perf_session *session,
+                                              union perf_event *event)
+{
+       const u8 cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
+
+       if (cpumode == PERF_RECORD_MISC_GUEST_KERNEL && perf_guest)
+               return perf_session__find_machine(session, event->ip.pid);
+
+       return perf_session__find_host_machine(session);
+}
+
 static int perf_session_deliver_event(struct perf_session *session,
                                      union perf_event *event,
                                      struct perf_sample *sample,
-                                     struct perf_event_ops *ops,
+                                     struct perf_tool *tool,
                                      u64 file_offset)
 {
        struct perf_evsel *evsel;
+       struct machine *machine;
 
        dump_event(session, event, file_offset, sample);
 
@@ -765,6 +787,8 @@ static int perf_session_deliver_event(struct perf_session *session,
                hists__inc_nr_events(&evsel->hists, event->header.type);
        }
 
+       machine = perf_session__find_machine_for_cpumode(session, event);
+
        switch (event->header.type) {
        case PERF_RECORD_SAMPLE:
                dump_sample(session, event, sample);
@@ -772,23 +796,25 @@ static int perf_session_deliver_event(struct perf_session *session,
                        ++session->hists.stats.nr_unknown_id;
                        return -1;
                }
-               return ops->sample(event, sample, evsel, session);
+               return tool->sample(tool, event, sample, evsel, machine);
        case PERF_RECORD_MMAP:
-               return ops->mmap(event, sample, session);
+               return tool->mmap(tool, event, sample, machine);
        case PERF_RECORD_COMM:
-               return ops->comm(event, sample, session);
+               return tool->comm(tool, event, sample, machine);
        case PERF_RECORD_FORK:
-               return ops->fork(event, sample, session);
+               return tool->fork(tool, event, sample, machine);
        case PERF_RECORD_EXIT:
-               return ops->exit(event, sample, session);
+               return tool->exit(tool, event, sample, machine);
        case PERF_RECORD_LOST:
-               return ops->lost(event, sample, session);
+               if (tool->lost == perf_event__process_lost)
+                       session->hists.stats.total_lost += event->lost.lost;
+               return tool->lost(tool, event, sample, machine);
        case PERF_RECORD_READ:
-               return ops->read(event, sample, session);
+               return tool->read(tool, event, sample, evsel, machine);
        case PERF_RECORD_THROTTLE:
-               return ops->throttle(event, sample, session);
+               return tool->throttle(tool, event, sample, machine);
        case PERF_RECORD_UNTHROTTLE:
-               return ops->unthrottle(event, sample, session);
+               return tool->unthrottle(tool, event, sample, machine);
        default:
                ++session->hists.stats.nr_unknown_events;
                return -1;
@@ -812,24 +838,29 @@ static int perf_session__preprocess_sample(struct perf_session *session,
 }
 
 static int perf_session__process_user_event(struct perf_session *session, union perf_event *event,
-                                           struct perf_event_ops *ops, u64 file_offset)
+                                           struct perf_tool *tool, u64 file_offset)
 {
+       int err;
+
        dump_event(session, event, file_offset, NULL);
 
        /* These events are processed right away */
        switch (event->header.type) {
        case PERF_RECORD_HEADER_ATTR:
-               return ops->attr(event, session);
+               err = tool->attr(event, &session->evlist);
+               if (err == 0)
+                       perf_session__update_sample_type(session);
+               return err;
        case PERF_RECORD_HEADER_EVENT_TYPE:
-               return ops->event_type(event, session);
+               return tool->event_type(tool, event);
        case PERF_RECORD_HEADER_TRACING_DATA:
                /* setup for reading amidst mmap */
                lseek(session->fd, file_offset, SEEK_SET);
-               return ops->tracing_data(event, session);
+               return tool->tracing_data(event, session);
        case PERF_RECORD_HEADER_BUILD_ID:
-               return ops->build_id(event, session);
+               return tool->build_id(tool, event, session);
        case PERF_RECORD_FINISHED_ROUND:
-               return ops->finished_round(event, session, ops);
+               return tool->finished_round(tool, event, session);
        default:
                return -EINVAL;
        }
@@ -837,7 +868,7 @@ static int perf_session__process_user_event(struct perf_session *session, union
 
 static int perf_session__process_event(struct perf_session *session,
                                       union perf_event *event,
-                                      struct perf_event_ops *ops,
+                                      struct perf_tool *tool,
                                       u64 file_offset)
 {
        struct perf_sample sample;
@@ -853,7 +884,7 @@ static int perf_session__process_event(struct perf_session *session,
        hists__inc_nr_events(&session->hists, event->header.type);
 
        if (event->header.type >= PERF_RECORD_USER_TYPE_START)
-               return perf_session__process_user_event(session, event, ops, file_offset);
+               return perf_session__process_user_event(session, event, tool, file_offset);
 
        /*
         * For all kernel events we get the sample data
@@ -866,14 +897,14 @@ static int perf_session__process_event(struct perf_session *session,
        if (perf_session__preprocess_sample(session, event, &sample))
                return 0;
 
-       if (ops->ordered_samples) {
+       if (tool->ordered_samples) {
                ret = perf_session_queue_event(session, event, &sample,
                                               file_offset);
                if (ret != -ETIME)
                        return ret;
        }
 
-       return perf_session_deliver_event(session, event, &sample, ops,
+       return perf_session_deliver_event(session, event, &sample, tool,
                                          file_offset);
 }
 
@@ -884,6 +915,11 @@ void perf_event_header__bswap(struct perf_event_header *self)
        self->size = bswap_16(self->size);
 }
 
+struct thread *perf_session__findnew(struct perf_session *session, pid_t pid)
+{
+       return machine__findnew_thread(&session->host_machine, pid);
+}
+
 static struct thread *perf_session__register_idle_thread(struct perf_session *self)
 {
        struct thread *thread = perf_session__findnew(self, 0);
@@ -897,9 +933,9 @@ static struct thread *perf_session__register_idle_thread(struct perf_session *se
 }
 
 static void perf_session__warn_about_errors(const struct perf_session *session,
-                                           const struct perf_event_ops *ops)
+                                           const struct perf_tool *tool)
 {
-       if (ops->lost == perf_event__process_lost &&
+       if (tool->lost == perf_event__process_lost &&
            session->hists.stats.nr_events[PERF_RECORD_LOST] != 0) {
                ui__warning("Processed %d events and lost %d chunks!\n\n"
                            "Check IO/CPU overload!\n\n",
@@ -934,7 +970,7 @@ static void perf_session__warn_about_errors(const struct perf_session *session,
 volatile int session_done;
 
 static int __perf_session__process_pipe_events(struct perf_session *self,
-                                              struct perf_event_ops *ops)
+                                              struct perf_tool *tool)
 {
        union perf_event event;
        uint32_t size;
@@ -943,7 +979,7 @@ static int __perf_session__process_pipe_events(struct perf_session *self,
        int err;
        void *p;
 
-       perf_event_ops__fill_defaults(ops);
+       perf_tool__fill_defaults(tool);
 
        head = 0;
 more:
@@ -979,8 +1015,7 @@ more:
                }
        }
 
-       if (size == 0 ||
-           (skip = perf_session__process_event(self, &event, ops, head)) < 0) {
+       if ((skip = perf_session__process_event(self, &event, tool, head)) < 0) {
                dump_printf("%#" PRIx64 " [%#x]: skipping unknown header type: %d\n",
                            head, event.header.size, event.header.type);
                /*
@@ -1003,7 +1038,7 @@ more:
 done:
        err = 0;
 out_err:
-       perf_session__warn_about_errors(self, ops);
+       perf_session__warn_about_errors(self, tool);
        perf_session_free_sample_buffers(self);
        return err;
 }
@@ -1034,7 +1069,7 @@ fetch_mmaped_event(struct perf_session *session,
 
 int __perf_session__process_events(struct perf_session *session,
                                   u64 data_offset, u64 data_size,
-                                  u64 file_size, struct perf_event_ops *ops)
+                                  u64 file_size, struct perf_tool *tool)
 {
        u64 head, page_offset, file_offset, file_pos, progress_next;
        int err, mmap_prot, mmap_flags, map_idx = 0;
@@ -1043,7 +1078,7 @@ int __perf_session__process_events(struct perf_session *session,
        union perf_event *event;
        uint32_t size;
 
-       perf_event_ops__fill_defaults(ops);
+       perf_tool__fill_defaults(tool);
 
        page_size = sysconf(_SC_PAGESIZE);
 
@@ -1098,7 +1133,7 @@ more:
        size = event->header.size;
 
        if (size == 0 ||
-           perf_session__process_event(session, event, ops, file_pos) < 0) {
+           perf_session__process_event(session, event, tool, file_pos) < 0) {
                dump_printf("%#" PRIx64 " [%#x]: skipping unknown header type: %d\n",
                            file_offset + head, event->header.size,
                            event->header.type);
@@ -1127,15 +1162,15 @@ more:
        err = 0;
        /* do the final flush for ordered samples */
        session->ordered_samples.next_flush = ULLONG_MAX;
-       flush_sample_queue(session, ops);
+       flush_sample_queue(session, tool);
 out_err:
-       perf_session__warn_about_errors(session, ops);
+       perf_session__warn_about_errors(session, tool);
        perf_session_free_sample_buffers(session);
        return err;
 }
 
 int perf_session__process_events(struct perf_session *self,
-                                struct perf_event_ops *ops)
+                                struct perf_tool *tool)
 {
        int err;
 
@@ -1146,9 +1181,9 @@ int perf_session__process_events(struct perf_session *self,
                err = __perf_session__process_events(self,
                                                     self->header.data_offset,
                                                     self->header.data_size,
-                                                    self->size, ops);
+                                                    self->size, tool);
        else
-               err = __perf_session__process_pipe_events(self, ops);
+               err = __perf_session__process_pipe_events(self, tool);
 
        return err;
 }
@@ -1163,9 +1198,8 @@ bool perf_session__has_traces(struct perf_session *self, const char *msg)
        return true;
 }
 
-int perf_session__set_kallsyms_ref_reloc_sym(struct map **maps,
-                                            const char *symbol_name,
-                                            u64 addr)
+int maps__set_kallsyms_ref_reloc_sym(struct map **maps,
+                                    const char *symbol_name, u64 addr)
 {
        char *bracket;
        enum map_type i;
@@ -1224,6 +1258,27 @@ size_t perf_session__fprintf_nr_events(struct perf_session *session, FILE *fp)
        return ret;
 }
 
+size_t perf_session__fprintf(struct perf_session *session, FILE *fp)
+{
+       /*
+        * FIXME: Here we have to actually print all the machines in this
+        * session, not just the host...
+        */
+       return machine__fprintf(&session->host_machine, fp);
+}
+
+void perf_session__remove_thread(struct perf_session *session,
+                                struct thread *th)
+{
+       /*
+        * FIXME: This one makes no sense, we need to remove the thread from
+        * the machine it belongs to, perf_session can have many machines, so
+        * doing it always on ->host_machine is wrong.  Fix when auditing all
+        * the 'perf kvm' code.
+        */
+       machine__remove_thread(&session->host_machine, th);
+}
+
 struct perf_evsel *perf_session__find_first_evtype(struct perf_session *session,
                                              unsigned int type)
 {
@@ -1236,17 +1291,16 @@ struct perf_evsel *perf_session__find_first_evtype(struct perf_session *session,
        return NULL;
 }
 
-void perf_session__print_ip(union perf_event *event,
-                           struct perf_sample *sample,
-                           struct perf_session *session,
-                           int print_sym, int print_dso)
+void perf_event__print_ip(union perf_event *event, struct perf_sample *sample,
+                         struct machine *machine, struct perf_evsel *evsel,
+                         int print_sym, int print_dso)
 {
        struct addr_location al;
        const char *symname, *dsoname;
-       struct callchain_cursor *cursor = &session->callchain_cursor;
+       struct callchain_cursor *cursor = &evsel->hists.callchain_cursor;
        struct callchain_cursor_node *node;
 
-       if (perf_event__preprocess_sample(event, session, &al, sample,
+       if (perf_event__preprocess_sample(event, machine, &al, sample,
                                          NULL) < 0) {
                error("problem processing %d event, skipping it.\n",
                        event->header.type);
@@ -1255,7 +1309,7 @@ void perf_session__print_ip(union perf_event *event,
 
        if (symbol_conf.use_callchain && sample->callchain) {
 
-               if (perf_session__resolve_callchain(session, al.thread,
+               if (machine__resolve_callchain(machine, evsel, al.thread,
                                                sample->callchain, NULL) != 0) {
                        if (verbose)
                                error("Failed to resolve callchain. Skipping\n");
index 6e393c9..37bc383 100644 (file)
@@ -30,9 +30,6 @@ struct perf_session {
        struct perf_header      header;
        unsigned long           size;
        unsigned long           mmap_window;
-       struct rb_root          threads;
-       struct list_head        dead_threads;
-       struct thread           *last_match;
        struct machine          host_machine;
        struct rb_root          machines;
        struct perf_evlist      *evlist;
@@ -53,65 +50,31 @@ struct perf_session {
        int                     cwdlen;
        char                    *cwd;
        struct ordered_samples  ordered_samples;
-       struct callchain_cursor callchain_cursor;
-       char                    filename[0];
+       char                    filename[1];
 };
 
-struct perf_evsel;
-struct perf_event_ops;
-
-typedef int (*event_sample)(union perf_event *event, struct perf_sample *sample,
-                           struct perf_evsel *evsel, struct perf_session *session);
-typedef int (*event_op)(union perf_event *self, struct perf_sample *sample,
-                       struct perf_session *session);
-typedef int (*event_synth_op)(union perf_event *self,
-                             struct perf_session *session);
-typedef int (*event_op2)(union perf_event *self, struct perf_session *session,
-                        struct perf_event_ops *ops);
-
-struct perf_event_ops {
-       event_sample    sample;
-       event_op        mmap,
-                       comm,
-                       fork,
-                       exit,
-                       lost,
-                       read,
-                       throttle,
-                       unthrottle;
-       event_synth_op  attr,
-                       event_type,
-                       tracing_data,
-                       build_id;
-       event_op2       finished_round;
-       bool            ordered_samples;
-       bool            ordering_requires_timestamps;
-};
+struct perf_tool;
 
 struct perf_session *perf_session__new(const char *filename, int mode,
                                       bool force, bool repipe,
-                                      struct perf_event_ops *ops);
+                                      struct perf_tool *tool);
 void perf_session__delete(struct perf_session *self);
 
 void perf_event_header__bswap(struct perf_event_header *self);
 
 int __perf_session__process_events(struct perf_session *self,
                                   u64 data_offset, u64 data_size, u64 size,
-                                  struct perf_event_ops *ops);
+                                  struct perf_tool *tool);
 int perf_session__process_events(struct perf_session *self,
-                                struct perf_event_ops *event_ops);
+                                struct perf_tool *tool);
 
-int perf_session__resolve_callchain(struct perf_session *self,
+int perf_session__resolve_callchain(struct perf_session *self, struct perf_evsel *evsel,
                                    struct thread *thread,
                                    struct ip_callchain *chain,
                                    struct symbol **parent);
 
 bool perf_session__has_traces(struct perf_session *self, const char *msg);
 
-int perf_session__set_kallsyms_ref_reloc_sym(struct map **maps,
-                                            const char *symbol_name,
-                                            u64 addr);
-
 void mem_bswap_64(void *src, int byte_size);
 void perf_event__attr_swap(struct perf_event_attr *attr);
 
@@ -144,12 +107,16 @@ struct machine *perf_session__findnew_machine(struct perf_session *self, pid_t p
 
 static inline
 void perf_session__process_machines(struct perf_session *self,
+                                   struct perf_tool *tool,
                                    machine__process_t process)
 {
-       process(&self->host_machine, self);
-       return machines__process(&self->machines, process, self);
+       process(&self->host_machine, tool);
+       return machines__process(&self->machines, process, tool);
 }
 
+struct thread *perf_session__findnew(struct perf_session *self, pid_t pid);
+size_t perf_session__fprintf(struct perf_session *self, FILE *fp);
+
 size_t perf_session__fprintf_dsos(struct perf_session *self, FILE *fp);
 
 size_t perf_session__fprintf_dsos_buildid(struct perf_session *self,
@@ -167,13 +134,20 @@ static inline int perf_session__parse_sample(struct perf_session *session,
                                        session->header.needs_swap);
 }
 
+static inline int perf_session__synthesize_sample(struct perf_session *session,
+                                                 union perf_event *event,
+                                                 const struct perf_sample *sample)
+{
+       return perf_event__synthesize_sample(event, session->sample_type,
+                                            sample, session->header.needs_swap);
+}
+
 struct perf_evsel *perf_session__find_first_evtype(struct perf_session *session,
                                            unsigned int type);
 
-void perf_session__print_ip(union perf_event *event,
-                                struct perf_sample *sample,
-                                struct perf_session *session,
-                                int print_sym, int print_dso);
+void perf_event__print_ip(union perf_event *event, struct perf_sample *sample,
+                         struct machine *machine, struct perf_evsel *evsel,
+                         int print_sym, int print_dso);
 
 int perf_session__cpu_bitmap(struct perf_session *session,
                             const char *cpu_list, unsigned long *cpu_bitmap);
index 95d3700..36d4c56 100644 (file)
@@ -27,7 +27,8 @@ build_tmp = getenv('PYTHON_EXTBUILD_TMP')
 perf = Extension('perf',
                  sources = ['util/python.c', 'util/ctype.c', 'util/evlist.c',
                             'util/evsel.c', 'util/cpumap.c', 'util/thread_map.c',
-                            'util/util.c', 'util/xyarray.c', 'util/cgroup.c'],
+                            'util/util.c', 'util/xyarray.c', 'util/cgroup.c',
+                            'util/debugfs.c'],
                  include_dirs = ['util/include'],
                  extra_compile_args = cflags,
                  )
index 632b50c..215d50f 100644 (file)
@@ -1757,7 +1757,7 @@ static int map_groups__set_modules_path_dir(struct map_groups *mg,
                struct stat st;
 
                /*sshfs might return bad dent->d_type, so we have to stat*/
-               sprintf(path, "%s/%s", dir_name, dent->d_name);
+               snprintf(path, sizeof(path), "%s/%s", dir_name, dent->d_name);
                if (stat(path, &st))
                        continue;
 
@@ -1766,8 +1766,6 @@ static int map_groups__set_modules_path_dir(struct map_groups *mg,
                            !strcmp(dent->d_name, ".."))
                                continue;
 
-                       snprintf(path, sizeof(path), "%s/%s",
-                                dir_name, dent->d_name);
                        ret = map_groups__set_modules_path_dir(mg, path);
                        if (ret < 0)
                                goto out;
@@ -1788,9 +1786,6 @@ static int map_groups__set_modules_path_dir(struct map_groups *mg,
                        if (map == NULL)
                                continue;
 
-                       snprintf(path, sizeof(path), "%s/%s",
-                                dir_name, dent->d_name);
-
                        long_name = strdup(path);
                        if (long_name == NULL) {
                                ret = -1;
@@ -2609,10 +2604,10 @@ int symbol__init(void)
        symbol_conf.initialized = true;
        return 0;
 
-out_free_dso_list:
-       strlist__delete(symbol_conf.dso_list);
 out_free_comm_list:
        strlist__delete(symbol_conf.comm_list);
+out_free_dso_list:
+       strlist__delete(symbol_conf.dso_list);
        return -1;
 }
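
The last hunk reorders the error labels in symbol__init() so that cleanup runs in the reverse order of allocation: the comm list is created after the dso list, so a failure past that point must free the comm list first and then fall through to free the dso list. The usual shape of that idiom, reduced to a standalone sketch:

#include <stdlib.h>

static int ex_init(void)
{
        char *dso_list, *comm_list, *sym_list;

        dso_list = malloc(32);
        if (dso_list == NULL)
                goto out;

        comm_list = malloc(32);
        if (comm_list == NULL)
                goto out_free_dso_list;

        sym_list = malloc(32);
        if (sym_list == NULL)
                goto out_free_comm_list;        /* newest allocation freed first */

        /* A real init would keep these; the sketch just cleans up. */
        free(sym_list);
        free(comm_list);
        free(dso_list);
        return 0;

out_free_comm_list:
        free(comm_list);
out_free_dso_list:
        free(dso_list);
out:
        return -1;
}

int main(void)
{
        return ex_init() ? 1 : 0;
}
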
 
index 29f8d74..123c2e1 100644 (file)
@@ -68,6 +68,7 @@ struct strlist;
 
 struct symbol_conf {
        unsigned short  priv_size;
+       unsigned short  nr_events;
        bool            try_vmlinux_path,
                        use_modules,
                        sort_by_name,
index d5d3b22..fb4b7ea 100644 (file)
@@ -61,7 +61,7 @@ static size_t thread__fprintf(struct thread *self, FILE *fp)
               map_groups__fprintf(&self->mg, verbose, fp);
 }
 
-struct thread *perf_session__findnew(struct perf_session *self, pid_t pid)
+struct thread *machine__findnew_thread(struct machine *self, pid_t pid)
 {
        struct rb_node **p = &self->threads.rb_node;
        struct rb_node *parent = NULL;
@@ -125,12 +125,12 @@ int thread__fork(struct thread *self, struct thread *parent)
        return 0;
 }
 
-size_t perf_session__fprintf(struct perf_session *self, FILE *fp)
+size_t machine__fprintf(struct machine *machine, FILE *fp)
 {
        size_t ret = 0;
        struct rb_node *nd;
 
-       for (nd = rb_first(&self->threads); nd; nd = rb_next(nd)) {
+       for (nd = rb_first(&machine->threads); nd; nd = rb_next(nd)) {
                struct thread *pos = rb_entry(nd, struct thread, rb_node);
 
                ret += thread__fprintf(pos, fp);
index e5f2401..70c2c13 100644 (file)
@@ -18,16 +18,14 @@ struct thread {
        int                     comm_len;
 };
 
-struct perf_session;
+struct machine;
 
 void thread__delete(struct thread *self);
 
 int thread__set_comm(struct thread *self, const char *comm);
 int thread__comm_len(struct thread *self);
-struct thread *perf_session__findnew(struct perf_session *self, pid_t pid);
 void thread__insert_map(struct thread *self, struct map *map);
 int thread__fork(struct thread *self, struct thread *parent);
-size_t perf_session__fprintf(struct perf_session *self, FILE *fp);
 
 static inline struct map *thread__find_map(struct thread *self,
                                           enum map_type type, u64 addr)
@@ -35,14 +33,12 @@ static inline struct map *thread__find_map(struct thread *self,
        return self ? map_groups__find(&self->mg, type, addr) : NULL;
 }
 
-void thread__find_addr_map(struct thread *self,
-                          struct perf_session *session, u8 cpumode,
-                          enum map_type type, pid_t pid, u64 addr,
+void thread__find_addr_map(struct thread *thread, struct machine *machine,
+                          u8 cpumode, enum map_type type, u64 addr,
                           struct addr_location *al);
 
-void thread__find_addr_location(struct thread *self,
-                               struct perf_session *session, u8 cpumode,
-                               enum map_type type, pid_t pid, u64 addr,
+void thread__find_addr_location(struct thread *thread, struct machine *machine,
+                               u8 cpumode, enum map_type type, u64 addr,
                                struct addr_location *al,
                                symbol_filter_t filter);
 #endif /* __PERF_THREAD_H */
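
The pid argument disappears from these lookups because the thread already identifies the task within its machine. A sketch of the slimmer call (assumes the perf-internal headers; the wrapper name is illustrative):

static struct symbol *ex_resolve(struct thread *thread, struct machine *machine,
                                 u64 ip)
{
        struct addr_location al;

        thread__find_addr_location(thread, machine, PERF_RECORD_MISC_USER,
                                   MAP__FUNCTION, ip, &al, NULL);
        return al.sym;
}
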
diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h
new file mode 100644 (file)
index 0000000..b0e1aad
--- /dev/null
@@ -0,0 +1,50 @@
+#ifndef __PERF_TOOL_H
+#define __PERF_TOOL_H
+
+#include <stdbool.h>
+
+struct perf_session;
+union perf_event;
+struct perf_evlist;
+struct perf_evsel;
+struct perf_sample;
+struct perf_tool;
+struct machine;
+
+typedef int (*event_sample)(struct perf_tool *tool, union perf_event *event,
+                           struct perf_sample *sample,
+                           struct perf_evsel *evsel, struct machine *machine);
+
+typedef int (*event_op)(struct perf_tool *tool, union perf_event *event,
+                       struct perf_sample *sample, struct machine *machine);
+
+typedef int (*event_attr_op)(union perf_event *event,
+                            struct perf_evlist **pevlist);
+typedef int (*event_simple_op)(struct perf_tool *tool, union perf_event *event);
+
+typedef int (*event_synth_op)(union perf_event *event,
+                             struct perf_session *session);
+
+typedef int (*event_op2)(struct perf_tool *tool, union perf_event *event,
+                        struct perf_session *session);
+
+struct perf_tool {
+       event_sample    sample,
+                       read;
+       event_op        mmap,
+                       comm,
+                       fork,
+                       exit,
+                       lost,
+                       throttle,
+                       unthrottle;
+       event_attr_op   attr;
+       event_synth_op  tracing_data;
+       event_simple_op event_type;
+       event_op2       finished_round,
+                       build_id;
+       bool            ordered_samples;
+       bool            ordering_requires_timestamps;
+};
+
+#endif /* __PERF_TOOL_H */
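
A tool built on this struct fills in only the callbacks it needs, lets perf_tool__fill_defaults() stub the rest, and hands the tool to the session layer. A wiring sketch using the perf_session__new()/perf_session__process_events() signatures introduced in this series (assumes the perf-internal headers; example__process_sample is the callback sketched earlier; O_RDONLY selects read mode):

static struct perf_tool example_tool = {
        .sample          = example__process_sample,
        .lost            = perf_event__process_lost,
        .ordered_samples = true,
};

static int example__run(const char *input_name)
{
        struct perf_session *session;
        int err;

        session = perf_session__new(input_name, O_RDONLY, false, false,
                                    &example_tool);
        if (session == NULL)
                return -ENOMEM;

        err = perf_session__process_events(session, &example_tool);

        perf_session__delete(session);
        return err;
}
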
index 3996509..a248f3c 100644 (file)
@@ -1,15 +1,17 @@
 #ifndef __PERF_TOP_H
 #define __PERF_TOP_H 1
 
+#include "tool.h"
 #include "types.h"
-#include "../perf.h"
 #include <stddef.h>
+#include <stdbool.h>
 
 struct perf_evlist;
 struct perf_evsel;
 struct perf_session;
 
 struct perf_top {
+       struct perf_tool   tool;
        struct perf_evlist *evlist;
        /*
         * Symbols will be added here in perf_event__process_sample and will
@@ -23,10 +25,26 @@ struct perf_top {
        int                freq;
        pid_t              target_pid, target_tid;
        bool               hide_kernel_symbols, hide_user_symbols, zero;
+       bool               system_wide;
+       bool               use_tui, use_stdio;
+       bool               sort_has_symbols;
+       bool               dont_use_callchains;
+       bool               kptr_restrict_warned;
+       bool               vmlinux_warned;
+       bool               inherit;
+       bool               group;
+       bool               sample_id_all_avail;
+       bool               dump_symtab;
        const char         *cpu_list;
        struct hist_entry  *sym_filter_entry;
        struct perf_evsel  *sym_evsel;
        struct perf_session *session;
+       struct winsize     winsize;
+       unsigned int       mmap_pages;
+       int                default_interval;
+       int                realtime_prio;
+       int                sym_pcnt_filter;
+       const char         *sym_filter;
 };
 
 size_t perf_top__header_snprintf(struct perf_top *top, char *bf, size_t size);
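
Embedding struct perf_tool inside struct perf_top lets the event callbacks recover the enclosing perf_top from the tool pointer they are handed. A sketch of that container_of() pattern (assumes the perf-internal headers; the 'samples' counter is a hypothetical field used only for illustration):

static int example_top__process_sample(struct perf_tool *tool,
                                       union perf_event *event __used,
                                       struct perf_sample *sample __used,
                                       struct perf_evsel *evsel __used,
                                       struct machine *machine __used)
{
        struct perf_top *top = container_of(tool, struct perf_top, tool);

        top->samples++;         /* hypothetical per-tool counter field */
        return 0;
}
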
index d2655f0..ac6830d 100644 (file)
@@ -18,7 +18,8 @@
  *
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  */
-#define _GNU_SOURCE
+#include <ctype.h>
+#include "util.h"
 #include <dirent.h>
 #include <mntent.h>
 #include <stdio.h>
@@ -31,7 +32,6 @@
 #include <pthread.h>
 #include <fcntl.h>
 #include <unistd.h>
-#include <ctype.h>
 #include <errno.h>
 #include <stdbool.h>
 #include <linux/list.h>
 
 #define VERSION "0.5"
 
-#define _STR(x) #x
-#define STR(x) _STR(x)
-#define MAX_PATH 256
-
 #define TRACE_CTRL     "tracing_on"
 #define TRACE          "trace"
 #define AVAILABLE      "available_tracers"
@@ -73,26 +69,6 @@ struct events {
 };
 
 
-
-static void die(const char *fmt, ...)
-{
-       va_list ap;
-       int ret = errno;
-
-       if (errno)
-               perror("perf");
-       else
-               ret = -1;
-
-       va_start(ap, fmt);
-       fprintf(stderr, "  ");
-       vfprintf(stderr, fmt, ap);
-       va_end(ap);
-
-       fprintf(stderr, "\n");
-       exit(ret);
-}
-
 void *malloc_or_die(unsigned int size)
 {
        void *data;
index c9dcbec..a3fdf55 100644 (file)
@@ -39,7 +39,7 @@ static int stop_script_unsupported(void)
 static void process_event_unsupported(union perf_event *event __unused,
                                      struct perf_sample *sample __unused,
                                      struct perf_evsel *evsel __unused,
-                                     struct perf_session *session __unused,
+                                     struct machine *machine __unused,
                                      struct thread *thread __unused)
 {
 }
index a841008..58ae14c 100644 (file)
@@ -3,7 +3,11 @@
 
 #include <stdbool.h>
 #include "parse-events.h"
-#include "session.h"
+
+struct machine;
+struct perf_sample;
+union perf_event;
+struct thread;
 
 #define __unused __attribute__((unused))
 
@@ -292,7 +296,7 @@ struct scripting_ops {
        void (*process_event) (union perf_event *event,
                               struct perf_sample *sample,
                               struct perf_evsel *evsel,
-                              struct perf_session *session,
+                              struct machine *machine,
                               struct thread *thread);
        int (*generate_script) (const char *outfile);
 };
index 0575905..295a9c9 100644 (file)
@@ -224,7 +224,7 @@ static bool annotate_browser__toggle_source(struct annotate_browser *browser)
 }
 
 static int annotate_browser__run(struct annotate_browser *self, int evidx,
-                                int nr_events, void(*timer)(void *arg),
+                                void(*timer)(void *arg),
                                 void *arg, int delay_secs)
 {
        struct rb_node *nd = NULL;
@@ -328,8 +328,7 @@ static int annotate_browser__run(struct annotate_browser *self, int evidx,
                                notes = symbol__annotation(target);
                                pthread_mutex_lock(&notes->lock);
 
-                               if (notes->src == NULL &&
-                                   symbol__alloc_hist(target, nr_events) < 0) {
+                               if (notes->src == NULL && symbol__alloc_hist(target) < 0) {
                                        pthread_mutex_unlock(&notes->lock);
                                        ui__warning("Not enough memory for annotating '%s' symbol!\n",
                                                    target->name);
@@ -337,7 +336,7 @@ static int annotate_browser__run(struct annotate_browser *self, int evidx,
                                }
 
                                pthread_mutex_unlock(&notes->lock);
-                               symbol__tui_annotate(target, ms->map, evidx, nr_events,
+                               symbol__tui_annotate(target, ms->map, evidx,
                                                     timer, arg, delay_secs);
                        }
                        continue;
@@ -358,15 +357,15 @@ out:
        return key;
 }
 
-int hist_entry__tui_annotate(struct hist_entry *he, int evidx, int nr_events,
+int hist_entry__tui_annotate(struct hist_entry *he, int evidx,
                             void(*timer)(void *arg), void *arg, int delay_secs)
 {
-       return symbol__tui_annotate(he->ms.sym, he->ms.map, evidx, nr_events,
+       return symbol__tui_annotate(he->ms.sym, he->ms.map, evidx,
                                    timer, arg, delay_secs);
 }
 
 int symbol__tui_annotate(struct symbol *sym, struct map *map, int evidx,
-                        int nr_events, void(*timer)(void *arg), void *arg,
+                        void(*timer)(void *arg), void *arg,
                         int delay_secs)
 {
        struct objdump_line *pos, *n;
@@ -419,8 +418,7 @@ int symbol__tui_annotate(struct symbol *sym, struct map *map, int evidx,
        browser.b.nr_entries = browser.nr_entries;
        browser.b.entries = &notes->src->source,
        browser.b.width += 18; /* Percentage */
-       ret = annotate_browser__run(&browser, evidx, nr_events,
-                                   timer, arg, delay_secs);
+       ret = annotate_browser__run(&browser, evidx, timer, arg, delay_secs);
        list_for_each_entry_safe(pos, n, &notes->src->source, node) {
                list_del(&pos->node);
                objdump_line__free(pos);
index d0c94b4..1212a38 100644 (file)
@@ -1020,7 +1020,7 @@ do_annotate:
                         * Don't let this be freed, say, by hists__decay_entry.
                         */
                        he->used = true;
-                       err = hist_entry__tui_annotate(he, evsel->idx, nr_events,
+                       err = hist_entry__tui_annotate(he, evsel->idx,
                                                       timer, arg, delay_secs);
                        he->used = false;
                        ui_browser__update_nr_entries(&browser->b, browser->hists->nr_entries);
index 295e366..13aa64e 100644 (file)
@@ -14,6 +14,9 @@ void ui_progress__update(u64 curr, u64 total, const char *title)
        if (use_browser <= 0)
                return;
 
+       if (total == 0)
+               return;
+
        ui__refresh_dimensions(true);
        pthread_mutex_lock(&ui__lock);
        y = SLtt_Screen_Rows / 2 - 2;
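
The early return added above is the usual guard before scaling progress against the total; the drawing code is not shown in this hunk, but a progress bar generally computes something like curr * width / total, which a zero total would break. A minimal hypothetical sketch of that arithmetic (not the actual ui code):

#include <stdint.h>
#include <stdio.h>

/* Hypothetical helper: map (curr, total) onto a bar of 'width' cells.
 * The total == 0 check mirrors the early return above and avoids a
 * division by zero when a caller has nothing to report against. */
static unsigned int progress_cells(uint64_t curr, uint64_t total,
				   unsigned int width)
{
	if (total == 0)
		return 0;
	if (curr > total)
		curr = total;		/* clamp overshooting callers */
	return (unsigned int)((curr * width) / total);
}

int main(void)
{
	printf("%u\n", progress_cells(0, 0, 50));	/* 0, no crash */
	printf("%u\n", progress_cells(3, 12, 50));	/* 12 of 50 cells */
	return 0;
}
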
index e16bf9a..d76d1c0 100644 (file)
@@ -1,5 +1,8 @@
 /*
- * GIT - The information manager from hell
+ * usage.c
+ *
+ * Various reporting routines.
+ * Originally copied from GIT source.
  *
  * Copyright (C) Linus Torvalds, 2005
  */
index 0128906..37be34d 100644 (file)
@@ -245,4 +245,15 @@ int readn(int fd, void *buf, size_t size);
 #define _STR(x) #x
 #define STR(x) _STR(x)
 
+/*
+ *  Determine whether some value is a power of two, where zero is
+ * *not* considered a power of two.
+ */
+
+static inline __attribute__((const))
+bool is_power_of_2(unsigned long n)
+{
+       return (n != 0 && ((n & (n - 1)) == 0));
+}
+
 #endif
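
The helper added to util.h above relies on a standard bit trick: n & (n - 1) clears the lowest set bit, so the expression is zero exactly when at most one bit is set, and the n != 0 test then rules zero out. A small standalone check of the helper as added (the test values are just examples):

#include <stdbool.h>
#include <stdio.h>

static inline __attribute__((const))
bool is_power_of_2(unsigned long n)
{
	/* n & (n - 1) drops the lowest set bit: zero iff at most one bit set */
	return (n != 0 && ((n & (n - 1)) == 0));
}

int main(void)
{
	unsigned long tests[] = { 0, 1, 2, 3, 128, 384 };
	size_t i;

	for (i = 0; i < sizeof(tests) / sizeof(tests[0]); i++)
		printf("%lu -> %s\n", tests[i],
		       is_power_of_2(tests[i]) ? "power of 2" : "not");
	return 0;
}
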
index bdd3347..697c8b4 100644 (file)
@@ -32,6 +32,7 @@ void perf_read_values_destroy(struct perf_read_values *values)
 
        for (i = 0; i < values->threads; i++)
                free(values->value[i]);
+       free(values->value);
        free(values->pid);
        free(values->tid);
        free(values->counterrawid);
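
The one-line fix above plugs a leak in perf_read_values_destroy(): the loop frees each per-thread row, but the array of row pointers itself was never released. A self-contained sketch of the pattern with hypothetical names (the real struct stores more than shown here):

#include <stdlib.h>

/* Hypothetical mirror of the leak: a table built as an array of
 * per-thread row pointers must be freed at both levels. */
static void rows_destroy(double **value, int threads)
{
	int i;

	for (i = 0; i < threads; i++)
		free(value[i]);		/* each row */
	free(value);			/* the row-pointer array (the leak) */
}

int main(void)
{
	int threads = 4, counters = 8, i;
	double **value = calloc(threads, sizeof(*value));

	if (value == NULL)
		return 1;
	for (i = 0; i < threads; i++)
		value[i] = calloc(counters, sizeof(**value));

	rows_destroy(value, threads);
	return 0;
}
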