Merge branch 'for-4.6' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

author Linus Torvalds <torvalds@linux-foundation.org>

Sat, 19 Mar 2016 03:25:49 +0000 (20:25 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Sat, 19 Mar 2016 03:25:49 +0000 (20:25 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Sat, 19 Mar 2016 03:25:49 +0000 (20:25 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sat, 19 Mar 2016 03:25:49 +0000 (20:25 -0700)
diff --combined Documentation/cgroup-v2.txt

index 8f1329a,6039d41..bdc6773
--- 1/Documentation/cgroup-v2.txt
--- 2/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@@ -132,6 -132,12 +132,12 @@@ strongly discouraged for production use
   the hierarchies and controller associations before starting using the
   controllers after system boot.
   
+ During transition to v2, system management software might still
+ automount the v1 cgroup filesystem and so hijack all controllers
+ during boot, before manual intervention is possible. To make testing
+ and experimenting easier, the kernel parameter cgroup_no_v1= allows
+ disabling controllers in v1 and making them always available in v2.
+ 
   
   2-2. Organizing Processes
   
@@@ -843,19 -849,6 +849,19 @@@ PAGE_SIZE multiple when read back
                 Amount of memory used to cache filesystem data,
                 including tmpfs and shared memory.
   
+ +        kernel_stack
+ +
+ +              Amount of memory allocated to kernel stacks.
+ +
+ +        slab
+ +
+ +              Amount of memory used for storing in-kernel data
+ +              structures.
+ +
+ +        sock
+ +
+ +              Amount of memory used in network transmission buffers.
+ +
           file_mapped
   
                 Amount of cached filesystem data mapped with mmap()
@@@ -880,16 -873,6 +886,16 @@@
                 on the internal memory management lists used by the
                 page reclaim algorithm
   
+ +        slab_reclaimable
+ +
+ +              Part of "slab" that might be reclaimed, such as
+ +              dentries and inodes.
+ +
+ +        slab_unreclaimable
+ +
+ +              Part of "slab" that cannot be reclaimed on memory
+ +              pressure.
+ +
           pgfault
   
                 Total number of page faults incurred
@@@ -915,7 -898,7 +921,7 @@@
         limit, anonymous meomry of the cgroup will not be swapped out.
   
   
- 5-2-2. General Usage
+ 5-2-2. Usage Guidelines
   
   "memory.high" is the main mechanism to control memory usage.
   Over-committing on high limit (sum of high limits > available memory)
@@@ -1387,12 -1370,6 +1393,12 @@@ system than killing the group.  Otherwi
   limit this type of spillover and ultimately contain buggy or even
   malicious applications.
   
+ +Setting the original memory.limit_in_bytes below the current usage was
+ +subject to a race condition, where concurrent charges could cause the
+ +limit setting to fail. memory.max on the other hand will first set the
+ +limit to prevent new charges, and then reclaim and OOM kill until the
+ +new limit is met - or the task writing to memory.max is killed.
+ +
   The combined memory+swap accounting and limiting is replaced by real
   control over swap space.
   
diff --combined Documentation/kernel-parameters.txt

index 0ee46a8,0d962a1..eef242e
--- 1/Documentation/kernel-parameters.txt
--- 2/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@@ -193,12 -193,6 +193,12 @@@ bytes respectively. Such letter suffixe
                         (e.g. thinkpad_acpi, sony_acpi, etc.) instead
                         of the ACPI video.ko driver.
   
+ +      acpi_force_32bit_fadt_addr
+ +                      force FADT to use 32 bit addresses rather than the
+ +                      64 bit X_* addresses. Some firmware has broken 64
+ +                      bit addresses; this forces ACPI to ignore these and
+ +                      use the older legacy 32 bit addresses.
+ +
         acpica_no_return_repair [HW, ACPI]
                         Disable AML predefined validation mechanism
                         This mechanism can repair the evaluation result to make
@@@ -614,6 -608,11 +614,11 @@@
                         cut the overhead, others just disable the usage. So
                         only cgroup_disable=memory is actually worthy}
   
+       cgroup_no_v1=   [KNL] Disable one, multiple, or all cgroup controllers in v1
+                       Format: { controller[,controller...] | "all" }
+                       Like cgroup_disable, but only applies to cgroup v1;
+                       the blacklisted controllers remain available in cgroup2.
+ 
         cgroup.memory=  [KNL] Pass options to the cgroup memory controller.
                         Format: <string>
                         nosocket -- Disable socket memory accounting.
@@@ -672,7 -671,7 +677,7 @@@
   
         clearcpuid=BITNUM [X86]
                         Disable CPUID feature X for the kernel. See
- -                      arch/x86/include/asm/cpufeature.h for the valid bit
+ +                      arch/x86/include/asm/cpufeatures.h for the valid bit
                         numbers. Note the Linux specific bits are not necessarily
                         stable over kernel options, but the vendor specific
                         ones should be.
@@@ -1064,12 -1063,6 +1069,12 @@@
                         A valid base address must be provided, and the serial
                         port must already be setup and configured.
   
+ +              armada3700_uart,<addr>
+ +                      Start an early, polled-mode console on the
+ +                      Armada 3700 serial port at the specified
+ +                      address. The serial port must already be setup
+ +                      and configured. Options are not yet supported.
+ +
         earlyprintk=    [X86,SH,BLACKFIN,ARM,M68k]
                         earlyprintk=vga
                         earlyprintk=efi
@@@ -1466,41 -1459,6 +1471,41 @@@
                         In such case C2/C3 won't be used again.
                         idle=nomwait: Disable mwait for CPU C-states
   
+ +      ieee754=        [MIPS] Select IEEE Std 754 conformance mode
+ +                      Format: { strict | legacy | 2008 | relaxed }
+ +                      Default: strict
+ +
+ +                      Choose which programs will be accepted for execution
+ +                      based on the IEEE 754 NaN encoding(s) supported by
+ +                      the FPU and the NaN encoding requested with the value
+ +                      of an ELF file header flag individually set by each
+ +                      binary.  Hardware implementations are permitted to
+ +                      support either or both of the legacy and the 2008 NaN
+ +                      encoding mode.
+ +
+ +                      Available settings are as follows:
+ +                      strict  accept binaries that request a NaN encoding
+ +                              supported by the FPU
+ +                      legacy  only accept legacy-NaN binaries, if supported
+ +                              by the FPU
+ +                      2008    only accept 2008-NaN binaries, if supported
+ +                              by the FPU
+ +                      relaxed accept any binaries regardless of whether
+ +                              supported by the FPU
+ +
+ +                      The FPU emulator is always able to support both NaN
+ +                      encodings, so if no FPU hardware is present or it has
+ +                      been disabled with 'nofpu', then the settings of
+ +                      'legacy' and '2008' strap the emulator accordingly,
+ +                      'relaxed' straps the emulator for both legacy-NaN and
+ +                      2008-NaN, whereas 'strict' enables legacy-NaN only on
+ +                      legacy processors and both NaN encodings on MIPS32 or
+ +                      MIPS64 CPUs.
+ +
+ +                      The setting for ABS.fmt/NEG.fmt instruction execution
+ +                      mode generally follows that for the NaN encoding,
+ +                      except where unsupported by hardware.
+ +
         ignore_loglevel [KNL]
                         Ignore loglevel setting - this will print /all/
                         kernel messages to the console. Useful for debugging.
@@@ -1508,11 -1466,6 +1513,11 @@@
                         could change it dynamically, usually by
                         /sys/module/printk/parameters/ignore_loglevel.
   
+ +      ignore_rlimit_data
+ +                      Ignore RLIMIT_DATA setting for data mappings,
+ +                      print warning at first misuse.  Can be changed via
+ +                      /sys/module/kernel/parameters/ignore_rlimit_data.
+ +
         ihash_entries=  [KNL]
                         Set number of hash buckets for inode cache.
   
@@@ -1699,15 -1652,6 +1704,15 @@@
         ip=             [IP_PNP]
                         See Documentation/filesystems/nfs/nfsroot.txt.
   
+ +      irqaffinity=    [SMP] Set the default irq affinity mask
+ +                      Format:
+ +                      <cpu number>,...,<cpu number>
+ +                      or
+ +                      <cpu number>-<cpu number>
+ +                      (must be a positive range in ascending order)
+ +                      or a mixture
+ +                      <cpu number>,...,<cpu number>-<cpu number>
+ +
         irqfixup        [HW]
                         When an interrupt is not handled search all handlers
                         for it. Intended to get systems with badly broken
@@@ -1771,9 -1715,7 +1776,9 @@@
   
         keepinitrd      [HW,ARM]
   
- -      kernelcore=nn[KMG]      [KNL,X86,IA-64,PPC] This parameter
+ +      kernelcore=     [KNL,X86,IA-64,PPC]
+ +                      Format: nn[KMGTPE] | "mirror"
+ +                      This parameter
                         specifies the amount of memory usable by the kernel
                         for non-movable allocations.  The requested amount is
                         spread evenly throughout all nodes in the system. The
@@@ -1789,14 -1731,6 +1794,14 @@@
                         use the HighMem zone if it exists, and the Normal
                         zone if it does not.
   
+ +                      Instead of specifying the amount of memory (nn[KMGTPE]),
+ +                      you can specify the "mirror" option. If the "mirror"
+ +                      option is specified, mirrored (reliable) memory is used
+ +                      for non-movable allocations and remaining memory is used
+ +                      for Movable pages. nn[KMGTPE] and "mirror" are exclusive,
+ +                      so you can NOT specify nn[KMGTPE] and "mirror" at the same
+ +                      time.
+ +
         kgdbdbgp=       [KGDB,HW] kgdb over EHCI usb debug port.
                         Format: <Controller#>[,poll interval]
                         The controller # is the number of the ehci usb debug
@@@ -2597,8 -2531,6 +2602,8 @@@
   
         nointroute      [IA-64]
   
+ +      noinvpcid       [X86] Disable the INVPCID cpu feature.
+ +
         nojitter        [IA-64] Disables jitter checking for ITC timers.
   
         no-kvmclock     [X86,KVM] Disable paravirtualized KVM clock driver
@@@ -2754,11 -2686,6 +2759,11 @@@
                         we can turn it on.
                         on: enable the feature
   
+ +      page_poison=    [KNL] Boot-time parameter changing the state of
+ +                      poisoning on the buddy allocator.
+ +                      off: turn off poisoning
+ +                      on: turn on poisoning
+ +
         panic=          [KNL] Kernel behaviour on panic: delay <timeout>
                         timeout > 0: seconds before rebooting
                         timeout = 0: wait forever
@@@ -3529,16 -3456,6 +3534,16 @@@
   
         ro              [KNL] Mount root device read-only on boot
   
+ +      rodata=         [KNL]
+ +              on      Mark read-only kernel memory as read-only (default).
+ +              off     Leave read-only kernel memory writable for debugging.
+ +
+ +      rockchip.usb_uart
+ +                      Enable the uart passthrough on the designated usb port
+ +                      on Rockchip SoCs. When active, the signals of the
+ +                      debug-uart get routed to the D+ and D- pins of the usb
+ +                      port and the regular usb controller gets disabled.
+ +
         root=           [KNL] Root filesystem
                         See name_to_dev_t comment in init/do_mounts.c.
   
@@@ -3576,11 -3493,6 +3581,11 @@@
   
         sched_debug     [KNL] Enables verbose scheduler debug messages.
   
+ +      schedstats=     [KNL,X86] Enable or disable scheduler statistics.
+ +                      Allowed values are enable and disable. This feature
+ +                      incurs a small amount of overhead in the scheduler
+ +                      but is useful for debugging and performance tuning.
+ +
         skew_tick=      [KNL] Offset the periodic timer tick per cpu to mitigate
                         xtime_lock contention on larger systems, and/or RCU lock
                         contention on all systems with CONFIG_MAXSMP set.
@@@ -4288,17 -4200,6 +4293,17 @@@
                         The default value of this parameter is determined by
                         the config option CONFIG_WQ_POWER_EFFICIENT_DEFAULT.
   
+ +      workqueue.debug_force_rr_cpu
+ +                      Workqueue used to implicitly guarantee that work
+ +                      items queued without explicit CPU specified are put
+ +                      on the local CPU.  This guarantee is no longer true
+ +                      and while local CPU is still preferred work items
+ +                      may be put on foreign CPUs.  This debug option
+ +                      forces round-robin CPU selection to flush out
+ +                      usages which depend on the now broken guarantee.
+ +                      When enabled, memory and cache locality will be
+ +                      impacted.
+ +
         x2apic_phys     [X86-64,APIC] Use x2apic physical mode instead of
                         default x2apic cluster mode on platforms
                         supporting x2apic.
diff --combined init/Kconfig

index 2d70c8c,9fefb8e..e0d2616
--- 1/init/Kconfig
--- 2/init/Kconfig
+++ b/init/Kconfig
@@@ -1047,10 -1047,10 +1047,10 @@@ config CGROUP_PID
           is fairly trivial to reach PID exhaustion before you reach even a
           conservative kmemcg limit. As a result, it is possible to grind a
           system to halt without being limited by other cgroup policies. The
-         PIDs cgroup subsystem is designed to stop this from happening.
+         PIDs controller is designed to stop this from happening.
   
           It should be noted that organisational operations (such as attaching
-         to a cgroup hierarchy will *not* be blocked by the PIDs subsystem),
+         to a cgroup hierarchy will *not* be blocked by the PIDs controller),
           since the PIDs limit only affects a process's ability to fork, not to
           attach to a cgroup.
   
@@@ -1420,28 -1420,6 +1420,28 @@@ config KALLSYMS_AL
   
            Say N unless you really need all symbols.
   
+ +config KALLSYMS_ABSOLUTE_PERCPU
+ +      bool
+ +      default X86_64 && SMP
+ +
+ +config KALLSYMS_BASE_RELATIVE
+ +      bool
+ +      depends on KALLSYMS
+ +      default !IA64 && !(TILE && 64BIT)
+ +      help
+ +        Instead of emitting them as absolute values in the native word size,
+ +        emit the symbol references in the kallsyms table as 32-bit entries,
+ +        each containing a relative value in the range [base, base + U32_MAX]
+ +        or, when KALLSYMS_ABSOLUTE_PERCPU is in effect, each containing either
+ +        an absolute value in the range [0, S32_MAX] or a relative value in the
+ +        range [base, base + S32_MAX], where base is the lowest relative symbol
+ +        address encountered in the image.
+ +
+ +        On 64-bit builds, this reduces the size of the address table by 50%,
+ +        but more importantly, it results in entries whose values are build
+ +        time constants, and no relocation pass is required at runtime to fix
+ +        up the entries based on the runtime load address of the kernel.
+ +
   config PRINTK
         default y
         bool "Enable support for printk" if EXPERT
@@@ -1779,9 -1757,9 +1779,9 @@@ config SYSTEM_DATA_VERIFICATIO
         select SYSTEM_TRUSTED_KEYRING
         select KEYS
         select CRYPTO
+ +      select CRYPTO_RSA
         select ASYMMETRIC_KEY_TYPE
         select ASYMMETRIC_PUBLIC_KEY_SUBTYPE
- -      select PUBLIC_KEY_ALGO_RSA
         select ASN1
         select OID_REGISTRY
         select X509_CERTIFICATE_PARSER
diff --combined kernel/sched/core.c

index 4edecc1,0f5abc6..4ee3ce7
--- 1/kernel/sched/core.c
--- 2/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@@ -26,7 -26,6 +26,7 @@@
    *              Thomas Gleixner, Mike Kravetz
    */
   
+ +#include <linux/kasan.h>
   #include <linux/mm.h>
   #include <linux/module.h>
   #include <linux/nmi.h>
@@@ -67,10 -66,12 +67,10 @@@
   #include <linux/pagemap.h>
   #include <linux/hrtimer.h>
   #include <linux/tick.h>
- -#include <linux/debugfs.h>
   #include <linux/ctype.h>
   #include <linux/ftrace.h>
   #include <linux/slab.h>
   #include <linux/init_task.h>
- -#include <linux/binfmts.h>
   #include <linux/context_tracking.h>
   #include <linux/compiler.h>
   
@@@ -123,6 -124,138 +123,6 @@@ const_debug unsigned int sysctl_sched_f
   
   #undef SCHED_FEAT
   
- -#ifdef CONFIG_SCHED_DEBUG
- -#define SCHED_FEAT(name, enabled)     \
- -      #name ,
- -
- -static const char * const sched_feat_names[] = {
- -#include "features.h"
- -};
- -
- -#undef SCHED_FEAT
- -
- -static int sched_feat_show(struct seq_file *m, void *v)
- -{
- -      int i;
- -
- -      for (i = 0; i < __SCHED_FEAT_NR; i++) {
- -              if (!(sysctl_sched_features & (1UL << i)))
- -                      seq_puts(m, "NO_");
- -              seq_printf(m, "%s ", sched_feat_names[i]);
- -      }
- -      seq_puts(m, "\n");
- -
- -      return 0;
- -}
- -
- -#ifdef HAVE_JUMP_LABEL
- -
- -#define jump_label_key__true  STATIC_KEY_INIT_TRUE
- -#define jump_label_key__false STATIC_KEY_INIT_FALSE
- -
- -#define SCHED_FEAT(name, enabled)     \
- -      jump_label_key__##enabled ,
- -
- -struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
- -#include "features.h"
- -};
- -
- -#undef SCHED_FEAT
- -
- -static void sched_feat_disable(int i)
- -{
- -      static_key_disable(&sched_feat_keys[i]);
- -}
- -
- -static void sched_feat_enable(int i)
- -{
- -      static_key_enable(&sched_feat_keys[i]);
- -}
- -#else
- -static void sched_feat_disable(int i) { };
- -static void sched_feat_enable(int i) { };
- -#endif /* HAVE_JUMP_LABEL */
- -
- -static int sched_feat_set(char *cmp)
- -{
- -      int i;
- -      int neg = 0;
- -
- -      if (strncmp(cmp, "NO_", 3) == 0) {
- -              neg = 1;
- -              cmp += 3;
- -      }
- -
- -      for (i = 0; i < __SCHED_FEAT_NR; i++) {
- -              if (strcmp(cmp, sched_feat_names[i]) == 0) {
- -                      if (neg) {
- -                              sysctl_sched_features &= ~(1UL << i);
- -                              sched_feat_disable(i);
- -                      } else {
- -                              sysctl_sched_features |= (1UL << i);
- -                              sched_feat_enable(i);
- -                      }
- -                      break;
- -              }
- -      }
- -
- -      return i;
- -}
- -
- -static ssize_t
- -sched_feat_write(struct file *filp, const char __user *ubuf,
- -              size_t cnt, loff_t *ppos)
- -{
- -      char buf[64];
- -      char *cmp;
- -      int i;
- -      struct inode *inode;
- -
- -      if (cnt > 63)
- -              cnt = 63;
- -
- -      if (copy_from_user(&buf, ubuf, cnt))
- -              return -EFAULT;
- -
- -      buf[cnt] = 0;
- -      cmp = strstrip(buf);
- -
- -      /* Ensure the static_key remains in a consistent state */
- -      inode = file_inode(filp);
- -      mutex_lock(&inode->i_mutex);
- -      i = sched_feat_set(cmp);
- -      mutex_unlock(&inode->i_mutex);
- -      if (i == __SCHED_FEAT_NR)
- -              return -EINVAL;
- -
- -      *ppos += cnt;
- -
- -      return cnt;
- -}
- -
- -static int sched_feat_open(struct inode *inode, struct file *filp)
- -{
- -      return single_open(filp, sched_feat_show, NULL);
- -}
- -
- -static const struct file_operations sched_feat_fops = {
- -      .open           = sched_feat_open,
- -      .write          = sched_feat_write,
- -      .read           = seq_read,
- -      .llseek         = seq_lseek,
- -      .release        = single_release,
- -};
- -
- -static __init int sched_init_debug(void)
- -{
- -      debugfs_create_file("sched_features", 0644, NULL, NULL,
- -                      &sched_feat_fops);
- -
- -      return 0;
- -}
- -late_initcall(sched_init_debug);
- -#endif /* CONFIG_SCHED_DEBUG */
- -
   /*
    * Number of tasks to iterate in a single balance run.
    * Limited because this is done with IRQs disabled.
@@@ -320,6 -453,20 +320,6 @@@ static inline void init_hrtick(void
   }
   #endif        /* CONFIG_SCHED_HRTICK */
   
- -/*
- - * cmpxchg based fetch_or, macro so it works for different integer types
- - */
- -#define fetch_or(ptr, val)                                            \
- -({    typeof(*(ptr)) __old, __val = *(ptr);                           \
- -      for (;;) {                                                      \
- -              __old = cmpxchg((ptr), __val, __val | (val));           \
- -              if (__old == __val)                                     \
- -                      break;                                          \
- -              __val = __old;                                          \
- -      }                                                               \
- -      __old;                                                          \
- -})
- -
   #if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
   /*
    * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
@@@ -568,36 -715,31 +568,36 @@@ static inline bool got_nohz_idle_kick(v
   #endif /* CONFIG_NO_HZ_COMMON */
   
   #ifdef CONFIG_NO_HZ_FULL
- -bool sched_can_stop_tick(void)
+ +bool sched_can_stop_tick(struct rq *rq)
   {
+ +      int fifo_nr_running;
+ +
+ +      /* Deadline tasks, even if single, need the tick */
+ +      if (rq->dl.dl_nr_running)
+ +              return false;
+ +
         /*
- -       * FIFO realtime policy runs the highest priority task. Other runnable
- -       * tasks are of a lower priority. The scheduler tick does nothing.
+ +       * FIFO realtime policy runs the highest priority task (after DEADLINE).
+ +       * Other runnable tasks are of a lower priority. The scheduler tick
+ +       * isn't needed.
          */
- -      if (current->policy == SCHED_FIFO)
+ +      fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
+ +      if (fifo_nr_running)
                 return true;
   
         /*
          * Round-robin realtime tasks time slice with other tasks at the same
- -       * realtime priority. Is this task the only one at this priority?
+ +       * realtime priority.
          */
- -      if (current->policy == SCHED_RR) {
- -              struct sched_rt_entity *rt_se = &current->rt;
- -
- -              return list_is_singular(&rt_se->run_list);
+ +      if (rq->rt.rr_nr_running) {
+ +              if (rq->rt.rr_nr_running == 1)
+ +                      return true;
+ +              else
+ +                      return false;
         }
   
- -      /*
- -       * More than one running task need preemption.
- -       * nr_running update is assumed to be visible
- -       * after IPI is sent from wakers.
- -       */
- -      if (this_rq()->nr_running > 1)
+ +      /* Normal multitasking needs periodic preemption checks */
+ +      if (rq->cfs.nr_running > 1)
                 return false;
   
         return true;
@@@ -1951,8 -2093,7 +1951,8 @@@ try_to_wake_up(struct task_struct *p, u
   
         ttwu_queue(p, cpu);
   stat:
- -      ttwu_stat(p, cpu, wake_flags);
+ +      if (schedstat_enabled())
+ +              ttwu_stat(p, cpu, wake_flags);
   out:
         raw_spin_unlock_irqrestore(&p->pi_lock, flags);
   
@@@ -2000,8 -2141,7 +2000,8 @@@ static void try_to_wake_up_local(struc
                 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
   
         ttwu_do_wakeup(rq, p, 0);
- -      ttwu_stat(p, smp_processor_id(), 0);
+ +      if (schedstat_enabled())
+ +              ttwu_stat(p, smp_processor_id(), 0);
   out:
         raw_spin_unlock(&p->pi_lock);
   }
@@@ -2043,6 -2183,7 +2043,6 @@@ void __dl_clear_params(struct task_stru
         dl_se->dl_bw = 0;
   
         dl_se->dl_throttled = 0;
- -      dl_se->dl_new = 1;
         dl_se->dl_yielded = 0;
   }
   
@@@ -2069,7 -2210,6 +2069,7 @@@ static void __sched_fork(unsigned long 
   #endif
   
   #ifdef CONFIG_SCHEDSTATS
+ +      /* Even if schedstat is disabled, there should not be garbage */
         memset(&p->se.statistics, 0, sizeof(p->se.statistics));
   #endif
   
@@@ -2078,10 -2218,6 +2078,10 @@@
         __dl_clear_params(p);
   
         INIT_LIST_HEAD(&p->rt.run_list);
+ +      p->rt.timeout           = 0;
+ +      p->rt.time_slice        = sched_rr_timeslice;
+ +      p->rt.on_rq             = 0;
+ +      p->rt.on_list           = 0;
   
   #ifdef CONFIG_PREEMPT_NOTIFIERS
         INIT_HLIST_HEAD(&p->preempt_notifiers);
@@@ -2145,69 -2281,6 +2145,69 @@@ int sysctl_numa_balancing(struct ctl_ta
   #endif
   #endif
   
+ +DEFINE_STATIC_KEY_FALSE(sched_schedstats);
+ +
+ +#ifdef CONFIG_SCHEDSTATS
+ +static void set_schedstats(bool enabled)
+ +{
+ +      if (enabled)
+ +              static_branch_enable(&sched_schedstats);
+ +      else
+ +              static_branch_disable(&sched_schedstats);
+ +}
+ +
+ +void force_schedstat_enabled(void)
+ +{
+ +      if (!schedstat_enabled()) {
+ +              pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n");
+ +              static_branch_enable(&sched_schedstats);
+ +      }
+ +}
+ +
+ +static int __init setup_schedstats(char *str)
+ +{
+ +      int ret = 0;
+ +      if (!str)
+ +              goto out;
+ +
+ +      if (!strcmp(str, "enable")) {
+ +              set_schedstats(true);
+ +              ret = 1;
+ +      } else if (!strcmp(str, "disable")) {
+ +              set_schedstats(false);
+ +              ret = 1;
+ +      }
+ +out:
+ +      if (!ret)
+ +              pr_warn("Unable to parse schedstats=\n");
+ +
+ +      return ret;
+ +}
+ +__setup("schedstats=", setup_schedstats);
+ +
+ +#ifdef CONFIG_PROC_SYSCTL
+ +int sysctl_schedstats(struct ctl_table *table, int write,
+ +                       void __user *buffer, size_t *lenp, loff_t *ppos)
+ +{
+ +      struct ctl_table t;
+ +      int err;
+ +      int state = static_branch_likely(&sched_schedstats);
+ +
+ +      if (write && !capable(CAP_SYS_ADMIN))
+ +              return -EPERM;
+ +
+ +      t = *table;
+ +      t.data = &state;
+ +      err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+ +      if (err < 0)
+ +              return err;
+ +      if (write)
+ +              set_schedstats(state);
+ +      return err;
+ +}
+ +#endif
+ +#endif
+ +
   /*
    * fork()/clone()-time setup:
    */
@@@ -2937,6 -3010,16 +2937,6 @@@ u64 scheduler_tick_max_deferment(void
   }
   #endif
   
- -notrace unsigned long get_parent_ip(unsigned long addr)
- -{
- -      if (in_lock_functions(addr)) {
- -              addr = CALLER_ADDR2;
- -              if (in_lock_functions(addr))
- -                      addr = CALLER_ADDR3;
- -      }
- -      return addr;
- -}
- -
   #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
                                 defined(CONFIG_PREEMPT_TRACER))
   
@@@ -2958,7 -3041,7 +2958,7 @@@ void preempt_count_add(int val
                                 PREEMPT_MASK - 10);
   #endif
         if (preempt_count() == val) {
- -              unsigned long ip = get_parent_ip(CALLER_ADDR1);
+ +              unsigned long ip = get_lock_parent_ip();
   #ifdef CONFIG_DEBUG_PREEMPT
                 current->preempt_disable_ip = ip;
   #endif
@@@ -2985,7 -3068,7 +2985,7 @@@ void preempt_count_sub(int val
   #endif
   
         if (preempt_count() == val)
- -              trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
+ +              trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
         __preempt_count_sub(val);
   }
   EXPORT_SYMBOL(preempt_count_sub);
@@@ -3174,7 -3257,7 +3174,7 @@@ static void __sched notrace __schedule(
                         if (prev->flags & PF_WQ_WORKER) {
                                 struct task_struct *to_wakeup;
   
- -                              to_wakeup = wq_worker_sleeping(prev, cpu);
+ +                              to_wakeup = wq_worker_sleeping(prev);
                                 if (to_wakeup)
                                         try_to_wake_up_local(to_wakeup);
                         }
@@@ -3197,6 -3280,7 +3197,6 @@@
   
                 trace_sched_switch(preempt, prev, next);
                 rq = context_switch(rq, prev, next); /* unlocks the rq */
- -              cpu = cpu_of(rq);
         } else {
                 lockdep_unpin_lock(&rq->lock);
                 raw_spin_unlock_irq(&rq->lock);
@@@ -3382,7 -3466,7 +3382,7 @@@ EXPORT_SYMBOL(default_wake_function)
    */
   void rt_mutex_setprio(struct task_struct *p, int prio)
   {
- -      int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE;
+ +      int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
         struct rq *rq;
         const struct sched_class *prev_class;
   
@@@ -3410,15 -3494,11 +3410,15 @@@
   
         trace_sched_pi_setprio(p, prio);
         oldprio = p->prio;
+ +
+ +      if (oldprio == prio)
+ +              queue_flag &= ~DEQUEUE_MOVE;
+ +
         prev_class = p->sched_class;
         queued = task_on_rq_queued(p);
         running = task_current(rq, p);
         if (queued)
- -              dequeue_task(rq, p, DEQUEUE_SAVE);
+ +              dequeue_task(rq, p, queue_flag);
         if (running)
                 put_prev_task(rq, p);
   
@@@ -3436,7 -3516,7 +3436,7 @@@
                 if (!dl_prio(p->normal_prio) ||
                     (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
                         p->dl.dl_boosted = 1;
- -                      enqueue_flag |= ENQUEUE_REPLENISH;
+ +                      queue_flag |= ENQUEUE_REPLENISH;
                 } else
                         p->dl.dl_boosted = 0;
                 p->sched_class = &dl_sched_class;
@@@ -3444,7 -3524,7 +3444,7 @@@
                 if (dl_prio(oldprio))
                         p->dl.dl_boosted = 0;
                 if (oldprio < prio)
- -                      enqueue_flag |= ENQUEUE_HEAD;
+ +                      queue_flag |= ENQUEUE_HEAD;
                 p->sched_class = &rt_sched_class;
         } else {
                 if (dl_prio(oldprio))
@@@ -3459,7 -3539,7 +3459,7 @@@
         if (running)
                 p->sched_class->set_curr_task(rq);
         if (queued)
- -              enqueue_task(rq, p, enqueue_flag);
+ +              enqueue_task(rq, p, queue_flag);
   
         check_class_changed(rq, p, prev_class, oldprio);
   out_unlock:
@@@ -3815,7 -3895,6 +3815,7 @@@ static int __sched_setscheduler(struct 
         const struct sched_class *prev_class;
         struct rq *rq;
         int reset_on_fork;
+ +      int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
   
         /* may grab non-irq protected spin_locks */
         BUG_ON(in_interrupt());
@@@ -3998,14 -4077,17 +3998,14 @@@ change
                  * itself.
                  */
                 new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
- -              if (new_effective_prio == oldprio) {
- -                      __setscheduler_params(p, attr);
- -                      task_rq_unlock(rq, p, &flags);
- -                      return 0;
- -              }
+ +              if (new_effective_prio == oldprio)
+ +                      queue_flags &= ~DEQUEUE_MOVE;
         }
   
         queued = task_on_rq_queued(p);
         running = task_current(rq, p);
         if (queued)
- -              dequeue_task(rq, p, DEQUEUE_SAVE);
+ +              dequeue_task(rq, p, queue_flags);
         if (running)
                 put_prev_task(rq, p);
   
@@@ -4015,14 -4097,15 +4015,14 @@@
         if (running)
                 p->sched_class->set_curr_task(rq);
         if (queued) {
- -              int enqueue_flags = ENQUEUE_RESTORE;
                 /*
                  * We enqueue to tail when the priority of a task is
                  * increased (user space view).
                  */
- -              if (oldprio <= p->prio)
- -                      enqueue_flags |= ENQUEUE_HEAD;
+ +              if (oldprio < p->prio)
+ +                      queue_flags |= ENQUEUE_HEAD;
   
- -              enqueue_task(rq, p, enqueue_flags);
+ +              enqueue_task(rq, p, queue_flags);
         }
   
         check_class_changed(rq, p, prev_class, oldprio);
@@@ -5013,8 -5096,6 +5013,8 @@@ void init_idle(struct task_struct *idle
         idle->state = TASK_RUNNING;
         idle->se.exec_start = sched_clock();
   
+ +      kasan_unpoison_task_stack(idle);
+ +
   #ifdef CONFIG_SMP
         /*
          * Its possible that init_idle() gets called multiple times on a task,
@@@ -5324,6 -5405,183 +5324,6 @@@ static void migrate_tasks(struct rq *de
   }
   #endif /* CONFIG_HOTPLUG_CPU */
   
- -#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
- -
- -static struct ctl_table sd_ctl_dir[] = {
- -      {
- -              .procname       = "sched_domain",
- -              .mode           = 0555,
- -      },
- -      {}
- -};
- -
- -static struct ctl_table sd_ctl_root[] = {
- -      {
- -              .procname       = "kernel",
- -              .mode           = 0555,
- -              .child          = sd_ctl_dir,
- -      },
- -      {}
- -};
- -
- -static struct ctl_table *sd_alloc_ctl_entry(int n)
- -{
- -      struct ctl_table *entry =
- -              kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
- -
- -      return entry;
- -}
- -
- -static void sd_free_ctl_entry(struct ctl_table **tablep)
- -{
- -      struct ctl_table *entry;
- -
- -      /*
- -       * In the intermediate directories, both the child directory and
- -       * procname are dynamically allocated and could fail but the mode
- -       * will always be set. In the lowest directory the names are
- -       * static strings and all have proc handlers.
- -       */
- -      for (entry = *tablep; entry->mode; entry++) {
- -              if (entry->child)
- -                      sd_free_ctl_entry(&entry->child);
- -              if (entry->proc_handler == NULL)
- -                      kfree(entry->procname);
- -      }
- -
- -      kfree(*tablep);
- -      *tablep = NULL;
- -}
- -
- -static int min_load_idx = 0;
- -static int max_load_idx = CPU_LOAD_IDX_MAX-1;
- -
- -static void
- -set_table_entry(struct ctl_table *entry,
- -              const char *procname, void *data, int maxlen,
- -              umode_t mode, proc_handler *proc_handler,
- -              bool load_idx)
- -{
- -      entry->procname = procname;
- -      entry->data = data;
- -      entry->maxlen = maxlen;
- -      entry->mode = mode;
- -      entry->proc_handler = proc_handler;
- -
- -      if (load_idx) {
- -              entry->extra1 = &min_load_idx;
- -              entry->extra2 = &max_load_idx;
- -      }
- -}
- -
- -static struct ctl_table *
- -sd_alloc_ctl_domain_table(struct sched_domain *sd)
- -{
- -      struct ctl_table *table = sd_alloc_ctl_entry(14);
- -
- -      if (table == NULL)
- -              return NULL;
- -
- -      set_table_entry(&table[0], "min_interval", &sd->min_interval,
- -              sizeof(long), 0644, proc_doulongvec_minmax, false);
- -      set_table_entry(&table[1], "max_interval", &sd->max_interval,
- -              sizeof(long), 0644, proc_doulongvec_minmax, false);
- -      set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
- -              sizeof(int), 0644, proc_dointvec_minmax, true);
- -      set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
- -              sizeof(int), 0644, proc_dointvec_minmax, true);
- -      set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
- -              sizeof(int), 0644, proc_dointvec_minmax, true);
- -      set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
- -              sizeof(int), 0644, proc_dointvec_minmax, true);
- -      set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
- -              sizeof(int), 0644, proc_dointvec_minmax, true);
- -      set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
- -              sizeof(int), 0644, proc_dointvec_minmax, false);
- -      set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
- -              sizeof(int), 0644, proc_dointvec_minmax, false);
- -      set_table_entry(&table[9], "cache_nice_tries",
- -              &sd->cache_nice_tries,
- -              sizeof(int), 0644, proc_dointvec_minmax, false);
- -      set_table_entry(&table[10], "flags", &sd->flags,
- -              sizeof(int), 0644, proc_dointvec_minmax, false);
- -      set_table_entry(&table[11], "max_newidle_lb_cost",
- -              &sd->max_newidle_lb_cost,
- -              sizeof(long), 0644, proc_doulongvec_minmax, false);
- -      set_table_entry(&table[12], "name", sd->name,
- -              CORENAME_MAX_SIZE, 0444, proc_dostring, false);
- -      /* &table[13] is terminator */
- -
- -      return table;
- -}
- -
- -static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
- -{
- -      struct ctl_table *entry, *table;
- -      struct sched_domain *sd;
- -      int domain_num = 0, i;
- -      char buf[32];
- -
- -      for_each_domain(cpu, sd)
- -              domain_num++;
- -      entry = table = sd_alloc_ctl_entry(domain_num + 1);
- -      if (table == NULL)
- -              return NULL;
- -
- -      i = 0;
- -      for_each_domain(cpu, sd) {
- -              snprintf(buf, 32, "domain%d", i);
- -              entry->procname = kstrdup(buf, GFP_KERNEL);
- -              entry->mode = 0555;
- -              entry->child = sd_alloc_ctl_domain_table(sd);
- -              entry++;
- -              i++;
- -      }
- -      return table;
- -}
- -
- -static struct ctl_table_header *sd_sysctl_header;
- -static void register_sched_domain_sysctl(void)
- -{
- -      int i, cpu_num = num_possible_cpus();
- -      struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
- -      char buf[32];
- -
- -      WARN_ON(sd_ctl_dir[0].child);
- -      sd_ctl_dir[0].child = entry;
- -
- -      if (entry == NULL)
- -              return;
- -
- -      for_each_possible_cpu(i) {
- -              snprintf(buf, 32, "cpu%d", i);
- -              entry->procname = kstrdup(buf, GFP_KERNEL);
- -              entry->mode = 0555;
- -              entry->child = sd_alloc_ctl_cpu_table(i);
- -              entry++;
- -      }
- -
- -      WARN_ON(sd_sysctl_header);
- -      sd_sysctl_header = register_sysctl_table(sd_ctl_root);
- -}
- -
- -/* may be called multiple times per register */
- -static void unregister_sched_domain_sysctl(void)
- -{
- -      unregister_sysctl_table(sd_sysctl_header);
- -      sd_sysctl_header = NULL;
- -      if (sd_ctl_dir[0].child)
- -              sd_free_ctl_entry(&sd_ctl_dir[0].child);
- -}
- -#else
- -static void register_sched_domain_sysctl(void)
- -{
- -}
- -static void unregister_sched_domain_sysctl(void)
- -{
- -}
- -#endif /* CONFIG_SCHED_DEBUG && CONFIG_SYSCTL */
- -
   static void set_rq_online(struct rq *rq)
   {
         if (!rq->online) {
@@@ -5434,6 -5692,16 +5434,6 @@@ static int sched_cpu_active(struct noti
                 set_cpu_rq_start_time();
                 return NOTIFY_OK;
   
- -      case CPU_ONLINE:
- -              /*
- -               * At this point a starting CPU has marked itself as online via
- -               * set_cpu_online(). But it might not yet have marked itself
- -               * as active, which is essential from here on.
- -               */
- -              set_cpu_active(cpu, true);
- -              stop_machine_unpark(cpu);
- -              return NOTIFY_OK;
- -
         case CPU_DOWN_FAILED:
                 set_cpu_active(cpu, true);
                 return NOTIFY_OK;
@@@ -5905,16 -6173,11 +5905,16 @@@ cpu_attach_domain(struct sched_domain *
   /* Setup the mask of cpus configured for isolated domains */
   static int __init isolated_cpu_setup(char *str)
   {
+ +      int ret;
+ +
         alloc_bootmem_cpumask_var(&cpu_isolated_map);
- -      cpulist_parse(str, cpu_isolated_map);
+ +      ret = cpulist_parse(str, cpu_isolated_map);
+ +      if (ret) {
+ +              pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids);
+ +              return 0;
+ +      }
         return 1;
   }
- -
   __setup("isolcpus=", isolated_cpu_setup);
   
   struct s_data {
@@@ -6577,7 -6840,7 +6577,7 @@@ static void sched_init_numa(void
   
                         sched_domains_numa_masks[i][j] = mask;
   
- -                      for (k = 0; k < nr_node_ids; k++) {
+ +                      for_each_node(k) {
                                 if (node_distance(j, k) > sched_domains_numa_distance[i])
                                         continue;
   
@@@ -7597,9 -7860,11 +7597,9 @@@ void sched_destroy_group(struct task_gr
   void sched_offline_group(struct task_group *tg)
   {
         unsigned long flags;
- -      int i;
   
         /* end participation in shares distribution */
- -      for_each_possible_cpu(i)
- -              unregister_fair_sched_group(tg, i);
+ +      unregister_fair_sched_group(tg);
   
         spin_lock_irqsave(&task_group_lock, flags);
         list_del_rcu(&tg->list);
@@@ -7625,7 -7890,7 +7625,7 @@@ void sched_move_task(struct task_struc
         queued = task_on_rq_queued(tsk);
   
         if (queued)
- -              dequeue_task(rq, tsk, DEQUEUE_SAVE);
+ +              dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
         if (unlikely(running))
                 put_prev_task(rq, tsk);
   
@@@ -7649,7 -7914,7 +7649,7 @@@
         if (unlikely(running))
                 tsk->sched_class->set_curr_task(rq);
         if (queued)
- -              enqueue_task(rq, tsk, ENQUEUE_RESTORE);
+ +              enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE);
   
         task_rq_unlock(rq, tsk, &flags);
   }
@@@ -8441,7 -8706,7 +8441,7 @@@ struct cgroup_subsys cpu_cgrp_subsys = 
         .can_attach     = cpu_cgroup_can_attach,
         .attach         = cpu_cgroup_attach,
         .legacy_cftypes = cpu_files,
-       .early_init     = 1,
+       .early_init     = true,
   };
   
   #endif        /* CONFIG_CGROUP_SCHED */
author	Linus Torvalds <torvalds@linux-foundation.org>
	Sat, 19 Mar 2016 03:25:49 +0000 (20:25 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Sat, 19 Mar 2016 03:25:49 +0000 (20:25 -0700)
		1	2
Documentation/cgroup-v2.txt	patch \|	diff1 \|	diff2 \|	blob \| history
Documentation/kernel-parameters.txt	patch \|	diff1 \|	diff2 \|	blob \| history
init/Kconfig	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched/core.c	patch \|	diff1 \|	diff2 \|	blob \| history