Linux Kernel 2.6 20321131
Kernel 2.4...3 Kernel 2.4...3 Kernel 2.4...3 Kernel 2.6...3...3 1....3 2....4 3. load estimator...4 4....4 5....4...4 1....4 2. runqueue kernel/sched.c...4 3. task_struct(include/linux/sched.h)...6...9 1....9 2....9 3. sleep_avg...10 4.... 11 5. schedule() kernel/sched.c...13 6....15 7....17 8....18 Kernel 2.6...20
Kernel 2.4 Kernel 2.4 Kernel 2.4 2.4 2.4 2.4 1. O(n) (1) 2.4 CPU goodness() (2) CPU runqueue 2. 3. Kernel 2.6 Linux Kernel 2.6.4 Kernel2.6 O(1) 1. CPU active array expired array Active array CPU Expired array active array active array expired array Active array expired array CPU active array Kernel 2.4
2. CPU 2.4 Kernel 2.6 64bit bitmap bitmap 1 BSFL 3. load estimator boost punish CPU bonus 4. 2.6 2.4 5. CPU pull CPU push 1. Kernel 2.6 0 ~ MAX_PRIO-1 0 ~ MAX_RT_PRIO-1 MAX_RT_PRIO ~ MAX_PRIO-1 2. runqueue kernel/sched.c struct runqueue 2.6 CPU (1) prio_array_t *active, *expired, arrays[2] runqueue CPU active expired prio_array_t struct prio_array { int nr_active; /* */ struct list_head queue[max_prio]; /* */ unsigned long bitmap[bitmap_size]; /* */
}; 1 prio_array queue[max_prio] i MAX_PRIO>i>=0 task_struct::runlist runnlist task_struct 1 queue[max_prio] struct prio_array unsigned long bitmap[bitmap_size] queue[max_prio] bitmap bit queue[i] queue[i] bitmap 1 0 1 idx sched_find_first_bit() queue[i]->next task_struct::runlist active array expired array active array active expired 2.4 2.6 2.6 struct runqueue sched.c sched.h (2) spinlock_t lock runqueue runqueue CPU runqueue (3) task_t *curr CPU current CPU runqueue curr rq->curr = next; rq->curr current rq->curr (4) unsigned long expired_timestamp
active array jiffies expired array EXPIRED_STARVING() expired array scheduler_tick() (5) unsigned long nr_running, nr_switches, nr_uninterruptible, timestamp_last_tick CPU nr_running CPU active array expired array nr_switches CPU nr_uninterruptible CPU timestamp_last_tick (6) struct list_head migration_queue CPU migration_req_t migration_req_t::list push 3. task_struct(include/linux/sched.h) Linux PCB Linux PCB task_struct task_struct 10 1. 2. 3. 4. 5. 6. 7. 8. 9. 10. task_struct (1) volatile long state include/linux/sched.h 6 #define TASK_RUNNING 0 #define TASK_INTERRUPTIBLE 1 #define TASK_UNINTERRUPTIBLE 2 #define TASK_STOPPED 4 #define TASK_ZOMBIE 8 #define TASK_DEAD 16 TASK_DEAD (2) struct thread_info *thread_info s32 preempt_count; unsigned long flags; preempt_count 0 0 flags TIF_NEED_RESCHED Kernel 2.4 need_resched 1
(3) int prio, static_prio prio Kernel2.4 goodness() Kernel2.6 prio static_prio nice nice -20 ~ 19 kernel/sched.c nice prio prioity nice #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) prioity nice priority = MAX _ RT _ PRIO + nice + 20 (4) struct list_head run_list prio_array queue[max_prio] list_head run_list include/linux/list.h struct list_head task_struct (5) prio_array_t *array CPU active array active array kernel/sched.c array = next->array; dequeue_task(next, array); recalc_task_prio(next, next->timestamp + delta); enqueue_task(next, array); 2 UP runqueue::active active array SMP next thread_info thread_info cpu next cpu runqueue active schedule (6) unsigned long sleep_avg nanosecond 0 ~ NS_MAX_SLEEP_AVG sleep_avg Kernel 2.6 sleep_avg (7) long interactive_credit -CREDIT_LIMIT ~ CREDIT_LIMIT+1 1 1 1 interactive_credit CREDIT_LIMIT interactive_credit
CREDIT_LIMIT+1 (8) unsigned long long timestamp sleep_avg activate_task() kernel/sched.c p->timestamp = now expired array schedule() kernel/sched.c expired array prev->timestamp = now active array schedule() kernel/sched.c active array next->timestamp = now CPU pull timestamp sched_clock() - (src_rq->timestamp_last_tick - p->timestamp) CPU (9) int activated actived=-1 sleep TASK_UNINTERRUPTIBLE try_to_wake_up() actived=0 actived=1 TASK_INTERRUPTIBLE credit activate_task() actived=2 TASK_INTERRUPTIBLE activate_task() (10) unsigned long policy 2.4 SCHED_FIF CPU SCHED_RR SCHED_OTHER (11) unsigned int time_slice, first_time_slice time_slice Kernel 2.4 counter first_time_slice 0
1. (1) static_prio prio nice static_prio sleep_avg effecitve_prio() kernel/sched.c sleep_avg -MAX_BONUS/2 ~ MAX_BONUS/2 bonus MAX_BONUS MAX_USER_P RIO* PRIO_BONUS_RATIO / 100 = 10 sleep_avg -5 ~ 5 bonus = (NS_TO_JIFFIES((p)- > sleep_avg)* MAX_BONUS / MAX_SLEEP_AVG) - MAX_BONUS / 2 prio = static _ prio bonus MAX_RT_PRIO MAX_PRIO-1 sleep_avg bonus sleep_avg bonus (2) effective_prio() recalc_task_prio () recalc_task_prio () actived interactive_credit sleep_avg sleep_avg effective_prio() a. effective_prio() sleep_avg interactive_credit b. recalc_task_prio () c. active array expired array effective_prio() d. IDLE 2. (1) time_slice task_timeslice() kernel/sched.c [MAX_RT_PRIO, MAX_PRIO-1] [MIN_TIMESLICE, MAX_TIMESLICE] timeslice = MIN_TIMESLICE + ((MAX_TIMESLICE - MIN_TIMESLICE) * (MAX_PRIO-1- (p)->static_prio) / (MAX_USER_PRIO-1)) 2 static_prio timeslice
2 static_prio time_slice (2) 0 Linux for (each task on the system) { recalculate priority; recalculate timeslice } O(n) task_struct Kernel 2.6 task_timeslice() a. sched_fork b. scheduler_tick c. first_timeslice Kernel2.6 3. sleep_avg sleep_avg Kernel 2.6 sleep_avg (1) sleep_avg PARENT_PENALTY / 100
sleep_avg CHILD_PENALTY / 100 PARENT_PENALTY=100 CHILD_PENALTY = 95 sleep_avg (2) acitvate_task() recalc_task_prio() sleep_avg sleep_avg 0 ~ NS_MAX_SLEEP_AVG MAX_SLEEP_AVG - AVG_TIMESLICE p->mm UNINTERRUPTIBLE p->activated!= -1 INTERACTIVE_SLEEP(p) p->mm UNINTERRUPTIBLE p->activated == -1!HIGH_CREDIT(p) sleep_avg INTERACTIVE_SLEEP(p) INTERACTIVE_SLEEP(p) INTERACTIVE_SLEEP(p) sleep_avg INTERACTIVE_SLEEP(p) sleep_avg+sleep_time sleep_avg (3) schedule() TASK_INTERRUPTIBLE actived>0 actived recalc_task_prio() (2) sleep_time actived=1 sleep_time (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128 (4) CPU CPU sleep_avg run_time run_time sleep_avg 0 (5) sleep_avg p->parent->sleep_avg = p->parent->sleep_avg / (EXIT_WEIGHT+1) * EXIT_WEIGHT + p->sleep_avg /(EXIT_WEIGHT + 1); sleep_avg 1/( EXIT_WEIGHT+1) sleep_avg 1/( EXIT_WEIGHT+1) 4. Kernel 2.6
(1) interactive_credit sleep_avg interactive_credit task_struct 0 interactive_credit 1 recalc_task_prio() a. (p->mm!=null) TASK_UNINTERRUPTIBLE p->activated!=-1 sleep_time>interactive_sleep(p) interactive_credit 1 b. NS_MAX_SLEEP_AVG interactive_credit 1 interactive_credit 1 schedule() CPU run_time sleep_avg sleep_avg 0 interactive_credit HIGH_CREDIT(p) LOW_CREDIT(p) interactive_credit 1 interactive_credit -(CREDIT_LIMIT+1) ~ (CREDIT_LIMIT+1) interactive_credit CREDIT_LIMIT+1 interactive_credit HIGH_CREDIT() CPU run_time /= (CURRENT_BONUS(prev)? : 1) sleep_avg sleep_avg sleep_avg p->sleep_avg += sleep_time sleep_avg sleep_avg (2) sleep_avg sleep_avg TASK_INTERRUPTIBLE sleep_avg sleep_avg (3) TASK_INTERACTIVE() active array active array (p)->prio <= (p)->static_prio - DELTA(p) DELTA(p) = (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA)
SCALE(v1,v1_max,v2_max) = (v1) * (v2_max) / (v1_max) TASK_INTERACTIVE() nice n p nice 12 nice>12 DELTA(p)>5 sleep_avg 5 TASK_INTERACTIVE(p) nice -20 sleep_avg TASK_INTERACTIVE(p) nice active array 5. schedule() kernel/sched.c schedule() Kernel (1) prev next prev CPU next CPU a. b. preempt_disable() c. current prev CPU rq prev prev prev run_time interactive_credit rq rq d. prev prev TASK_RUNNING prev rq schedule() e. rq 0 SMP next idle rq->expired_timestamp = 0 expired_timestamp a. rq active array 0 active array expired array array = rq->active; rq->active = rq->expired; rq->expired = array; rq->expired_timestamp = 0;
rq->best_expired_prio = MAX_PRIO; active array expired array b. sched_find_first_bit() idx queue[idx]->next next idx = sched_find_first_bit(array->bitmap); queue = array->queue + idx; next = list_entry(queue->next, task_t, run_list); 3 c. next TASK_INTERRUPTIBLE actived>0 prio a. prev!=next b. next->timestamp rq->curr=next rq->curr current c. Kernel 2.4 prev mm rq current current=next=rq->curr TIF_NEED_RESCHED (2) schedule() Kernel 2.4 schedule() schedule()
task_struct need_resched schedule() need_resched a. CPU b. CPU c. nice Kernel2.6 Kernel 2.4 need_resched bit 6. (1) sched_init() sched_init() init/main.c start_kernel() CPU runqueue SMP CPU wake_up_forked_process() wake_up_forked_process timer (2) sched_fork(task_t *p) fork kenel/fork.c do_fork()->copy_process->sched_fork() TASK_RUNNIG runqueue runlist array fork sched_tail preempt_count 1 first_timeslice 1 fork CPU 0 1 scheduler_tick() scheduler_tick (3) wake_up_forked_process(task_t * p) fork kernel/fork.c do_fork() fork sleep_avg sleep_avg interactive_credit 0 prio cpu active array idle activate_task(p, rq) active array (4) schedule_tail()
fork() entry.s fork (5) scheduler_tick() update_process_time() SCHED_RR SCHED_FIFO TIF_NEED_RESCHED a. TIF_NEED_RESCHED first_timeslice 0 rq-> expired_timestamp expired_timestamp b. TASK_INTERACTIVE() TASK_INTERACTIVE EXPIRED_STARVING(rq) expired array (STARVATION_LIMIT && ((rq)->expired_timestamp&&(jiffies- (rq)->expired_timestamp >= STARVATION_LIMIT * ((rq)->nr_running) + 1))) ((rq)->curr->static_prio > (rq)->best_expired_prio) EXPIRED_STARVING() EXPIRED_STARVING() STARVATION_LIMIT (rq)->expired_timestamp (rq)->expired_timestamp >= STARVATION_LIMIT * ((rq)->nr_running) + 1) expired array ((rq)->curr->static_prio > (rq)->best_expired_prio) expired array c. EXPIRED_STARVING() active array TASK_INTERACTIVE expired array 1 CPU active array RR a. sleep_avg sleep_avg CPU-bound cache b. CPU CPU (6) sleep wake up interruptible running
4 TASK_INTERRUPTBLE TASK_RUNNING wait_for_completion() try_to_wake_up() wait_for_completion() a. DECLARE_WAITQUEUE() b. add_wait_queue_tail() c. TASK_UNINTERRUPTIBLE d. schedule() e. remove_wait_queue() schedule() try_to_wake_up() activate_task() rq->curr curr TIF_NEED_RESCHED TASK_RUNNING (7) sched_exit(task_t *p) release_task() p->first_timeslice first_timeslice 1 p fork p p->sleep_avg < p->parent->sleep_avg sleep_avg 7. Kernel 2.6 Kernel 2.6 2.4 (1) preempt_disable() preempt_count 1
(2) preempt_count preempt_count arch/i386/kernel/entry.s ENTRY(resume_kernel) cmpl $0,TI_PRE_COUNT(%ebp) # non-zero preempt_count? jnz restore_all need_resched: movl TI_FLAGS(%ebp), %ecx # need_resched set? testb $_TIF_NEED_RESCHED, %cl jz restore_all testl $IF_MASK,EFLAGS(%esp) # interrupts off (exception path)? jz restore_all movl $PREEMPT_ACTIVE,TI_PRE_COUNT(%ebp) sti call schedule movl $0,TI_PRE_COUNT(%ebp) cli jmp need_resched preempt_count 0 0 0 TIF_NEED_RESCHED EFLAGS IF IF schedule() preempt_enable() a. preempt_count 1 b. TIF_NEED_RESCHED 0 preempt_schedule() preempt_count PREEMPT_ACTIVE schedule() schedule() 8. Kernel 2.6 pull push (1) pull CPU CPU CPU pull load_balance() load_balance() CPU idle=1 idle=0 idle=1
idle=0 CPU rebalance_tick() BUSY_REBALANCE_TICK load_balance() load_balance() CPU CPU CPU CPU CPU 25% CPU expired active a. b. cpus_allowed CPU c. CPU cache_decay_ticks cache a. CPU b. CPU CPU active array c. timestamp d. CPU TIF_NEED_RESCHED imbalance idle=1 load_balance() IDLE_REBALANCE_TICK load_balance() schedule() CPU rq load_balance() idle load_balance() idle=0 idle idle=1 (2) push migration_thread() CPU rq->migration_queue push CPU Kernel 2.6 CPU SCHED_FIFO migration_queue set_cpus_allowed() CPU migration_req_t CPU migration_queue migration_thread migration_thread() move_task_away() CPU activate_task() CPU CPU rq->curr resched_task() CPU TIF_NEED_RESCHED 1
Kernel 2.6 : [1] Linus Tovalds, Linux v2.6.4, from www.kernel.org [2] Robert Love, Linux Kernel Development [3], Linux 2.6