Merge git://git.infradead.org/users/eparis/audit

[cascardo/linux.git] / include / linux / sched.h
diff --git a/include/linux/sched.h b/include/linux/sched.h

index 0f72548..25f54c7 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -3,6 +3,8 @@
  
  #include <uapi/linux/sched.h>
  
+#include <linux/sched/prio.h>
+
  
  struct sched_param {
         int sched_priority;
@@ -16,6 +18,7 @@ struct sched_param {
  #include <linux/types.h>
  #include <linux/timex.h>
  #include <linux/jiffies.h>
+#include <linux/plist.h>
  #include <linux/rbtree.h>
  #include <linux/thread_info.h>
  #include <linux/cpumask.h>
@@ -26,7 +29,7 @@ struct sched_param {
  
  #include <asm/page.h>
  #include <asm/ptrace.h>
-#include <asm/cputime.h>
+#include <linux/cputime.h>
  
  #include <linux/smp.h>
  #include <linux/sem.h>
@@ -56,6 +59,70 @@ struct sched_param {
  
  #include <asm/processor.h>
  
+#define SCHED_ATTR_SIZE_VER0   48      /* sizeof first published struct */
+
+/*
+ * Extended scheduling parameters data structure.
+ *
+ * This is needed because the original struct sched_param can not be
+ * altered without introducing ABI issues with legacy applications
+ * (e.g., in sched_getparam()).
+ *
+ * However, the possibility of specifying more than just a priority for
+ * the tasks may be useful for a wide variety of application fields, e.g.,
+ * multimedia, streaming, automation and control, and many others.
+ *
+ * This variant (sched_attr) is meant to describe a so-called
+ * sporadic time-constrained task. In such a model a task is specified by:
+ *  - the activation period or minimum instance inter-arrival time;
+ *  - the maximum (or average, depending on the actual scheduling
+ *    discipline) computation time of all instances, a.k.a. runtime;
+ *  - the deadline (relative to the actual activation time) of each
+ *    instance.
+ * Very briefly, a periodic (sporadic) task asks for the execution of
+ * some specific computation --which is typically called an instance--
+ * (at most) every period. Moreover, each instance typically lasts no more
+ * than the runtime and must be completed by time instant t equal to
+ * the instance activation time + the deadline.
+ *
+ * This is reflected by the actual fields of the sched_attr structure:
+ *
+ *  @size              size of the structure, for fwd/bwd compat.
+ *
+ *  @sched_policy      task's scheduling policy
+ *  @sched_flags       for customizing the scheduler behaviour
+ *  @sched_nice                task's nice value      (SCHED_NORMAL/BATCH)
+ *  @sched_priority    task's static priority (SCHED_FIFO/RR)
+ *  @sched_deadline    representative of the task's deadline
+ *  @sched_runtime     representative of the task's runtime
+ *  @sched_period      representative of the task's period
+ *
+ * Given this task model, there is a multiplicity of scheduling algorithms
+ * and policies that can be used to ensure all the tasks will meet their
+ * timing constraints.
+ *
+ * As of now, the SCHED_DEADLINE policy (sched_dl scheduling class) is the
+ * only user of this new interface. More information about the algorithm is
+ * available in the scheduling class file or in Documentation/.
+ */
+struct sched_attr {
+       u32 size;
+
+       u32 sched_policy;
+       u64 sched_flags;
+
+       /* SCHED_NORMAL, SCHED_BATCH */
+       s32 sched_nice;
+
+       /* SCHED_FIFO, SCHED_RR */
+       u32 sched_priority;
+
+       /* SCHED_DEADLINE */
+       u64 sched_runtime;
+       u64 sched_deadline;
+       u64 sched_period;
+};
+
  struct exec_domain;
  struct futex_pi_state;
  struct robust_list_head;
@@ -63,6 +130,11 @@ struct bio_list;
  struct fs_struct;
  struct perf_event_context;
  struct blk_plug;
+struct filename;
+
+#define VMACACHE_BITS 2
+#define VMACACHE_SIZE (1U << VMACACHE_BITS)
+#define VMACACHE_MASK (VMACACHE_SIZE - 1)
  
  /*
   * List of flags we want to share for kernel threads,
@@ -138,8 +210,9 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
  #define __TASK_STOPPED         4
  #define __TASK_TRACED          8
  /* in tsk->exit_state */
-#define EXIT_ZOMBIE            16
-#define EXIT_DEAD              32
+#define EXIT_DEAD              16
+#define EXIT_ZOMBIE            32
+#define EXIT_TRACE             (EXIT_ZOMBIE | EXIT_DEAD)
  /* in tsk->state again */
  #define TASK_DEAD              64
  #define TASK_WAKEKILL          128
@@ -164,11 +237,10 @@ extern char ___assert_task_state[1 - 2*!!(
  /* get_task_state() */
  #define TASK_REPORT            (TASK_RUNNING | TASK_INTERRUPTIBLE | \
                                  TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \
-                                __TASK_TRACED)
+                                __TASK_TRACED | EXIT_ZOMBIE | EXIT_DEAD)
  
  #define task_is_traced(task)   ((task->state & __TASK_TRACED) != 0)
  #define task_is_stopped(task)  ((task->state & __TASK_STOPPED) != 0)
-#define task_is_dead(task)     ((task)->exit_state != 0)
  #define task_is_stopped_or_traced(task)        \
                         ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
  #define task_contributes_to_load(task) \
@@ -227,10 +299,14 @@ extern int runqueue_is_locked(int cpu);
  #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
  extern void nohz_balance_enter_idle(int cpu);
  extern void set_cpu_sd_state_idle(void);
-extern int get_nohz_timer_target(void);
+extern int get_nohz_timer_target(int pinned);
  #else
  static inline void nohz_balance_enter_idle(int cpu) { }
  static inline void set_cpu_sd_state_idle(void) { }
+static inline int get_nohz_timer_target(int pinned)
+{
+       return smp_processor_id();
+}
  #endif
  
  /*
@@ -327,22 +403,33 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
  static inline void arch_pick_mmap_layout(struct mm_struct *mm) {}
  #endif
  
-
-extern void set_dumpable(struct mm_struct *mm, int value);
-extern int get_dumpable(struct mm_struct *mm);
-
  #define SUID_DUMP_DISABLE      0       /* No setuid dumping */
  #define SUID_DUMP_USER         1       /* Dump as user of process */
  #define SUID_DUMP_ROOT         2       /* Dump as root */
  
  /* mm flags */
-/* dumpable bits */
-#define MMF_DUMPABLE      0  /* core dump is permitted */
-#define MMF_DUMP_SECURELY 1  /* core file is readable only by root */
  
+/* for SUID_DUMP_* above */
  #define MMF_DUMPABLE_BITS 2
  #define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1)
  
+extern void set_dumpable(struct mm_struct *mm, int value);
+/*
+ * Returns the dumpable setting held in the low MMF_DUMPABLE_BITS of
+ * @mm_flags, which encode the SUID_DUMP_* values. Callers checking for
+ * privilege transitions must compare the result against SUID_DUMP_USER
+ * rather than treating it as a boolean value.
+ */
+static inline int __get_dumpable(unsigned long mm_flags)
+{
+       return mm_flags & MMF_DUMPABLE_MASK;
+}
+
+static inline int get_dumpable(struct mm_struct *mm)
+{
+       return __get_dumpable(mm->flags);
+}
+
  /* coredump filter bits */
  #define MMF_DUMP_ANON_PRIVATE  2
  #define MMF_DUMP_ANON_SHARED   3
@@ -485,6 +572,7 @@ struct signal_struct {
         atomic_t                sigcnt;
         atomic_t                live;
         int                     nr_threads;
+       struct list_head        thread_head;
  
         wait_queue_head_t       wait_chldexit;  /* for wait4() */
  
@@ -1000,6 +1088,7 @@ struct sched_entity {
  #endif
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
+       int                     depth;
         struct sched_entity     *parent;
         /* rq on which this entity is (to be) queued: */
         struct cfs_rq           *cfs_rq;
@@ -1029,6 +1118,51 @@ struct sched_rt_entity {
  #endif
  };
  
+struct sched_dl_entity {
+       struct rb_node  rb_node;
+
+       /*
+        * Original scheduling parameters. Copied here from sched_attr
+        * during sched_setattr(), they will remain the same until
+        * the next sched_setattr().
+        */
+       u64 dl_runtime;         /* maximum runtime for each instance    */
+       u64 dl_deadline;        /* relative deadline of each instance   */
+       u64 dl_period;          /* separation of two instances (period) */
+       u64 dl_bw;              /* dl_runtime / dl_deadline             */
+
+       /*
+        * Actual scheduling parameters. Initialized with the values above,
+        * they are continuously updated during task execution. Note that
+        * the remaining runtime could be < 0 in case we are in overrun.
+        */
+       s64 runtime;            /* remaining runtime for this instance  */
+       u64 deadline;           /* absolute deadline for this instance  */
+       unsigned int flags;     /* specifying the scheduler behaviour   */
+
+       /*
+        * Some bool flags:
+        *
+        * @dl_throttled tells if we exhausted the runtime. If so, the
+        * task has to wait for a replenishment to be performed at the
+        * next firing of dl_timer.
+        *
+        * @dl_new tells if a new instance arrived. If so we must
+        * start executing it with full runtime and reset its absolute
+        * deadline;
+        *
+        * @dl_boosted tells if we are boosted due to DI. If so we are
+        * outside bandwidth enforcement mechanism (but only until we
+        * exit the critical section).
+        */
+       int dl_throttled, dl_new, dl_boosted;
+
+       /*
+        * Bandwidth enforcement timer. Each -deadline task has its
+        * own bandwidth to be enforced, thus we need one timer per task.
+        */
+       struct hrtimer dl_timer;
+};
  
  struct rcu_node;
  
@@ -1065,6 +1199,7 @@ struct task_struct {
  #ifdef CONFIG_CGROUP_SCHED
         struct task_group *sched_task_group;
  #endif
+       struct sched_dl_entity dl;
  
  #ifdef CONFIG_PREEMPT_NOTIFIERS
         /* list of struct preempt_notifier: */
@@ -1098,12 +1233,16 @@ struct task_struct {
         struct list_head tasks;
  #ifdef CONFIG_SMP
         struct plist_node pushable_tasks;
+       struct rb_node pushable_dl_tasks;
  #endif
  
         struct mm_struct *mm, *active_mm;
  #ifdef CONFIG_COMPAT_BRK
         unsigned brk_randomized:1;
  #endif
+       /* per-thread vma caching */
+       u32 vmacache_seqnum;
+       struct vm_area_struct *vmacache[VMACACHE_SIZE];
  #if defined(SPLIT_RSS_COUNTING)
         struct task_rss_stat    rss_stat;
  #endif
@@ -1116,7 +1255,6 @@ struct task_struct {
         /* Used for emulating ABI behavior of previous Linux versions */
         unsigned int personality;
  
-       unsigned did_exec:1;
         unsigned in_execve:1;   /* Tell the LSMs that the process is doing an
                                  * execve */
         unsigned in_iowait:1;
@@ -1160,6 +1298,7 @@ struct task_struct {
         /* PID/PID hash table linkage. */
         struct pid_link pids[PIDTYPE_MAX];
         struct list_head thread_group;
+       struct list_head thread_node;
  
         struct completion *vfork_done;          /* for vfork() */
         int __user *set_child_tid;              /* CLONE_CHILD_SETTID */
@@ -1249,9 +1388,12 @@ struct task_struct {
  
  #ifdef CONFIG_RT_MUTEXES
         /* PI waiters blocked on a rt_mutex held by this task */
-       struct plist_head pi_waiters;
+       struct rb_root pi_waiters;
+       struct rb_node *pi_waiters_leftmost;
         /* Deadlock detection and priority inheritance handling */
         struct rt_mutex_waiter *pi_blocked_on;
+       /* Top pi_waiters task */
+       struct task_struct *pi_top_task;
  #endif
  
  #ifdef CONFIG_DEBUG_MUTEXES
@@ -1333,6 +1475,9 @@ struct task_struct {
         struct mutex perf_event_mutex;
         struct list_head perf_event_list;
  #endif
+#ifdef CONFIG_DEBUG_PREEMPT
+       unsigned long preempt_disable_ip;
+#endif
  #ifdef CONFIG_NUMA
         struct mempolicy *mempolicy;    /* Protected by alloc_lock */
         short il_next;
@@ -1343,9 +1488,10 @@ struct task_struct {
         unsigned int numa_scan_period;
         unsigned int numa_scan_period_max;
         int numa_preferred_nid;
-       int numa_migrate_deferred;
         unsigned long numa_migrate_retry;
         u64 node_stamp;                 /* migration stamp  */
+       u64 last_task_numa_placement;
+       u64 last_sum_exec_runtime;
         struct callback_head numa_work;
  
         struct list_head numa_entry;
@@ -1356,15 +1502,22 @@ struct task_struct {
          * Scheduling placement decisions are made based on the these counts.
          * The values remain static for the duration of a PTE scan
          */
-       unsigned long *numa_faults;
+       unsigned long *numa_faults_memory;
         unsigned long total_numa_faults;
  
         /*
          * numa_faults_buffer records faults per node during the current
-        * scan window. When the scan completes, the counts in numa_faults
-        * decay and these values are copied.
+        * scan window. When the scan completes, the counts in
+        * numa_faults_memory decay and these values are copied.
          */
-       unsigned long *numa_faults_buffer;
+       unsigned long *numa_faults_buffer_memory;
+
+       /*
+        * Track the nodes the process was running on when a NUMA hinting
+        * fault was incurred.
+        */
+       unsigned long *numa_faults_cpu;
+       unsigned long *numa_faults_buffer_cpu;
  
         /*
          * numa_faults_locality tracks if faults recorded during the last
@@ -1469,8 +1622,8 @@ extern void task_numa_fault(int last_node, int node, int pages, int flags);
  extern pid_t task_numa_group_id(struct task_struct *p);
  extern void set_numabalancing_state(bool enabled);
  extern void task_numa_free(struct task_struct *p);
-
-extern unsigned int sysctl_numa_balancing_migrate_deferred;
+extern bool should_numa_migrate_memory(struct task_struct *p, struct page *page,
+                                       int src_nid, int dst_cpu);
  #else
  static inline void task_numa_fault(int last_node, int node, int pages,
                                    int flags)
@@ -1486,6 +1639,11 @@ static inline void set_numabalancing_state(bool enabled)
  static inline void task_numa_free(struct task_struct *p)
  {
  }
+static inline bool should_numa_migrate_memory(struct task_struct *p,
+                               struct page *page, int src_nid, int dst_cpu)
+{
+       return true;
+}
  #endif
  
  static inline struct pid *task_pid(struct task_struct *task)
@@ -1712,7 +1870,6 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut,
  #define PF_SPREAD_SLAB 0x02000000      /* Spread some slab caches over cpuset */
  #define PF_NO_SETAFFINITY 0x04000000   /* Userland is not allowed to meddle with cpus_allowed */
  #define PF_MCE_EARLY    0x08000000      /* Early kill for mce process policy */
-#define PF_MEMPOLICY   0x10000000      /* Non-default NUMA mempolicy */
  #define PF_MUTEX_TESTER        0x20000000      /* Thread belongs to the rt mutex tester */
  #define PF_FREEZER_SKIP        0x40000000      /* Freezer should not count it as freezable */
  #define PF_SUSPEND_TASK 0x80000000      /* this thread called freeze_processes and should not be frozen */
@@ -1898,7 +2055,9 @@ static inline void sched_clock_idle_wakeup_event(u64 delta_ns)
   * but then during bootup it turns out that sched_clock()
   * is reliable after all:
   */
-extern int sched_clock_stable;
+extern int sched_clock_stable(void);
+extern void set_sched_clock_stable(void);
+extern void clear_sched_clock_stable(void);
  
  extern void sched_clock_tick(void);
  extern void sched_clock_idle_sleep_event(void);
@@ -1969,7 +2128,16 @@ static inline void sched_autogroup_exit(struct signal_struct *sig) { }
  extern bool yield_to(struct task_struct *p, bool preempt);
  extern void set_user_nice(struct task_struct *p, long nice);
  extern int task_prio(const struct task_struct *p);
-extern int task_nice(const struct task_struct *p);
+/**
+ * task_nice - return the nice value of a given task.
+ * @p: the task in question.
+ *
+ * Return: The nice value [ -20 ... 0 ... 19 ].
+ */
+static inline int task_nice(const struct task_struct *p)
+{
+       return PRIO_TO_NICE((p)->static_prio);
+}
  extern int can_nice(const struct task_struct *p, const int nice);
  extern int task_curr(const struct task_struct *p);
  extern int idle_cpu(int cpu);
@@ -1977,6 +2145,8 @@ extern int sched_setscheduler(struct task_struct *, int,
                               const struct sched_param *);
  extern int sched_setscheduler_nocheck(struct task_struct *, int,
                                       const struct sched_param *);
+extern int sched_setattr(struct task_struct *,
+                        const struct sched_attr *);
  extern struct task_struct *idle_task(int cpu);
  /**
   * is_idle_task - is the specified task an idle task?
@@ -2056,7 +2226,7 @@ extern void wake_up_new_task(struct task_struct *tsk);
  #else
   static inline void kick_process(struct task_struct *tsk) { }
  #endif
-extern void sched_fork(unsigned long clone_flags, struct task_struct *p);
+extern int sched_fork(unsigned long clone_flags, struct task_struct *p);
  extern void sched_dead(struct task_struct *p);
  
  extern void proc_caches_init(void);
@@ -2182,8 +2352,6 @@ extern struct mm_struct *get_task_mm(struct task_struct *task);
  extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode);
  /* Remove the current tasks stale references to the old mm_struct */
  extern void mm_release(struct task_struct *, struct mm_struct *);
-/* Allocate a new mm structure and copy contents from tsk->mm */
-extern struct mm_struct *dup_mm(struct task_struct *tsk);
  
  extern int copy_thread(unsigned long, unsigned long, unsigned long,
                         struct task_struct *);
@@ -2201,14 +2369,14 @@ extern void do_group_exit(int);
  extern int allow_signal(int);
  extern int disallow_signal(int);
  
-extern int do_execve(const char *,
+extern int do_execve(struct filename *,
                      const char __user * const __user *,
                      const char __user * const __user *);
  extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *);
  struct task_struct *fork_idle(int);
  extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
  
-extern void set_task_comm(struct task_struct *tsk, char *from);
+extern void set_task_comm(struct task_struct *tsk, const char *from);
  extern char *get_task_comm(char *to, struct task_struct *tsk);
  
  #ifdef CONFIG_SMP
@@ -2241,6 +2409,16 @@ extern bool current_is_single_threaded(void);
  #define while_each_thread(g, t) \
         while ((t = next_thread(t)) != g)
  
+#define __for_each_thread(signal, t)   \
+       list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node)
+
+#define for_each_thread(p, t)          \
+       __for_each_thread((p)->signal, t)
+
+/* Careful: this is a double loop, 'break' won't work as expected. */
+#define for_each_process_thread(p, t)  \
+       for_each_process(p) for_each_thread(p, t)
+
  static inline int get_nr_threads(struct task_struct *tsk)
  {
         return tsk->signal->nr_threads;
@@ -2645,6 +2823,21 @@ static inline bool __must_check current_clr_polling_and_test(void)
  }
  #endif
  
+static inline void current_clr_polling(void)
+{
+       __current_clr_polling();
+
+       /*
+        * Ensure we check TIF_NEED_RESCHED after we clear the polling bit.
+        * Once the bit is cleared, we'll get IPIs with every new
+        * TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also
+        * fold.
+        */
+       smp_mb(); /* paired with resched_task() */
+
+       preempt_fold_need_resched();
+}
+
  static __always_inline bool need_resched(void)
  {
         return unlikely(tif_need_resched());