Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:
 "The biggest change affects group scheduling: we now track the runnable
  average on a per-task entity basis, allowing a smoother, exponential
  decay average based load/weight estimation instead of the previous
  binary on-the-runqueue/off-the-runqueue load weight method.

  This will inevitably disturb workloads that were in some sort of
  borderline balancing state or unstable equilibrium, so an eye has to
  be kept on regressions.

  For that reason the new load average is only limited to group
  scheduling (shares distribution) at the moment (which was also hurting
  the most from the prior, crude weight calculation and whose scheduling
  quality wins most from this change) - but we plan to extend this to
  regular SMP balancing as well in the future, which will simplify and
  speed up things a bit.

  Other changes involve ongoing preparatory work to extend NOHZ to the
  scheduler as well, eventually allowing completely irq-free user-space
  execution."

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (33 commits)
  Revert "sched/autogroup: Fix crash on reboot when autogroup is disabled"
  cputime: Comment cputime's adjusting code
  cputime: Consolidate cputime adjustment code
  cputime: Rename thread_group_times to thread_group_cputime_adjusted
  cputime: Move thread_group_cputime() to sched code
  vtime: Warn if irqs aren't disabled on system time accounting APIs
  vtime: No need to disable irqs on vtime_account()
  vtime: Consolidate a bit the ctx switch code
  vtime: Explicitly account pending user time on process tick
  vtime: Remove the underscore prefix invasion
  sched/autogroup: Fix crash on reboot when autogroup is disabled
  cputime: Separate irqtime accounting from generic vtime
  cputime: Specialize irq vtime hooks
  kvm: Directly account vtime to system on guest switch
  vtime: Make vtime_account_system() irqsafe
  vtime: Gather vtime declarations to their own header file
  sched: Describe CFS load-balancer
  sched: Introduce temporary FAIR_GROUP_SCHED dependency for load-tracking
  sched: Make __update_entity_runnable_avg() fast
  sched: Update_cfs_shares at period edge
  ...
This commit is contained in:
Linus Torvalds
2012-12-11 18:21:38 -08:00
26 changed files with 1087 additions and 340 deletions

View File

@@ -435,14 +435,29 @@ struct cpu_itimer {
u32 incr_error;
};
/**
* struct cputime - snaphsot of system and user cputime
* @utime: time spent in user mode
* @stime: time spent in system mode
*
* Gathers a generic snapshot of user and system time.
*/
struct cputime {
cputime_t utime;
cputime_t stime;
};
/**
* struct task_cputime - collected CPU time counts
* @utime: time spent in user mode, in &cputime_t units
* @stime: time spent in kernel mode, in &cputime_t units
* @sum_exec_runtime: total time spent on the CPU, in nanoseconds
*
* This structure groups together three kinds of CPU time that are
* tracked for threads and thread groups. Most things considering
* This is an extension of struct cputime that includes the total runtime
* spent by the task from the scheduler point of view.
*
* As a result, this structure groups together three kinds of CPU time
* that are tracked for threads and thread groups. Most things considering
* CPU time want to group these counts together and treat all three
* of them in parallel.
*/
@@ -583,7 +598,7 @@ struct signal_struct {
cputime_t gtime;
cputime_t cgtime;
#ifndef CONFIG_VIRT_CPU_ACCOUNTING
cputime_t prev_utime, prev_stime;
struct cputime prev_cputime;
#endif
unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
@@ -1064,6 +1079,7 @@ struct sched_class {
#ifdef CONFIG_SMP
int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
void (*post_schedule) (struct rq *this_rq);
@@ -1098,6 +1114,18 @@ struct load_weight {
unsigned long weight, inv_weight;
};
struct sched_avg {
/*
* These sums represent an infinite geometric series and so are bound
* above by 1024/(1-y). Thus we only need a u32 to store them for for all
* choices of y < 1-2^(-32)*1024.
*/
u32 runnable_avg_sum, runnable_avg_period;
u64 last_runnable_update;
s64 decay_count;
unsigned long load_avg_contrib;
};
#ifdef CONFIG_SCHEDSTATS
struct sched_statistics {
u64 wait_start;
@@ -1158,6 +1186,15 @@ struct sched_entity {
/* rq "owned" by this entity/group: */
struct cfs_rq *my_q;
#endif
/*
* Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
* removed when useful for applications beyond shares distribution (e.g.
* load-balance).
*/
#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
/* Per-entity load-tracking */
struct sched_avg avg;
#endif
};
struct sched_rt_entity {
@@ -1321,7 +1358,7 @@ struct task_struct {
cputime_t utime, stime, utimescaled, stimescaled;
cputime_t gtime;
#ifndef CONFIG_VIRT_CPU_ACCOUNTING
cputime_t prev_utime, prev_stime;
struct cputime prev_cputime;
#endif
unsigned long nvcsw, nivcsw; /* context switch counts */
struct timespec start_time; /* monotonic time */
@@ -1732,8 +1769,8 @@ static inline void put_task_struct(struct task_struct *t)
__put_task_struct(t);
}
extern void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st);
extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st);
extern void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st);
extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st);
/*
* Per process flags