Hi, I dislike the behavior of the SCHED_ISO patch whereby ISO tasks are degraded to SCHED_NORMAL if they exceed the limit. IMHO it's better to throttle them at the iso_cpu limit.
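To make the intended behavior concrete, a task would opt in to SCHED_ISO roughly like this (an untested userspace sketch of mine, not part of the patch; the SCHED_ISO value of 4 and the priority-0 rule are taken from the patch below):

/* iso-test.c - hypothetical example, assumes a kernel with the patch below */
#include <stdio.h>
#include <string.h>
#include <sched.h>

#ifndef SCHED_ISO
#define SCHED_ISO 4	/* not in the glibc headers; value as in the patch */
#endif

int main(void)
{
	struct sched_param sp;

	memset(&sp, 0, sizeof(sp));
	sp.sched_priority = 0;	/* 0 is the only valid priority for SCHED_ISO */

	if (sched_setscheduler(0, SCHED_ISO, &sp) == -1) {
		perror("sched_setscheduler");
		return 1;
	}

	/* Burn CPU. With my patch this task stays SCHED_ISO forever and
	 * just gets throttled at the iso_cpu limit, instead of being
	 * demoted to SCHED_NORMAL. */
	for (;;)
		;
}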
I have modified Con's iso2 patch to do this. If iso_cpu > 50, ISO tasks only get stalled for 1 tick (1ms on x86). Fortunately there is a currently unused task prio (MAX_RT_PRIO-1) [1]; I used it for ISO_PRIO. All SCHED_ISO tasks use it and never change to other priorities. SCHED_ISO is a realtime class with the specialty that it can be preempted by SCHED_NORMAL tasks if iso_throttle is set. With this, the ISO queue stuff is not needed.

iso_throttle controls whether a SCHED_ISO task can be preempted; it is set according to the RT task load. With my patch, rt_task() also includes ISO tasks; I have added posix_rt_task() for SCHED_FIFO and SCHED_RR only. I changed the iso_period sysctl to iso_timeout, which is in centisecs. An iso_throttle_count sysctl is added which counts the ticks in which an ISO task is preempted by the timer. It currently uses a simple global variable; it should be per runqueue. And I'm not sure a sysctl is the appropriate place for it (/sys, /proc?).

The patch is against 2.6.11-rc1 and I have tested it only on UP x86. I'm a kernel hacker newbie, so please tell me if this is nonsense, good, can be improved, ...

utz

[1] Actually MAX_RT_PRIO-1 is used by sched_idle_next() and migration_call(). I changed it to MAX_RT_PRIO-2 for them. I think that's ok.

diff -Nrup linux-2.6.11-rc1/include/linux/sched.h linux-2.6.11-rc1-uiso2/include/linux/sched.h
--- linux-2.6.11-rc1/include/linux/sched.h	2005-01-21 19:46:54.677616421 +0100
+++ linux-2.6.11-rc1-uiso2/include/linux/sched.h	2005-01-21 20:30:29.616340716 +0100
@@ -130,6 +130,24 @@ extern unsigned long nr_iowait(void);
 #define SCHED_NORMAL	0
 #define SCHED_FIFO	1
 #define SCHED_RR	2
+/* policy 3 reserved for SCHED_BATCH */
+#define SCHED_ISO	4
+
+extern int iso_cpu, iso_timeout;
+extern int iso_throttle_count;
+extern void account_iso_ticks(struct task_struct *p);
+
+#define SCHED_RANGE(policy)	((policy) == SCHED_NORMAL || \
+				 (policy) == SCHED_FIFO || \
+				 (policy) == SCHED_RR || \
+				 (policy) == SCHED_ISO)
+
+#define SCHED_RT(policy)	((policy) == SCHED_FIFO || \
+				 (policy) == SCHED_RR || \
+				 (policy) == SCHED_ISO)
+
+#define SCHED_POSIX_RT(policy)	((policy) == SCHED_FIFO || \
+				 (policy) == SCHED_RR)
 
 struct sched_param {
 	int sched_priority;
@@ -342,9 +360,11 @@ struct signal_struct {
 
 /*
  * Priority of a process goes from 0..MAX_PRIO-1, valid RT
- * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL tasks are
- * in the range MAX_RT_PRIO..MAX_PRIO-1. Priority values
- * are inverted: lower p->prio value means higher priority.
+ * priority is 0..MAX_RT_PRIO-1. SCHED_FIFO and SCHED_RR use
+ * 0..MAX_RT_PRIO-2, SCHED_ISO uses MAX_RT_PRIO-1.
+ * SCHED_NORMAL tasks are in the range MAX_RT_PRIO..MAX_PRIO-1.
+ * Priority values are inverted: lower p->prio value means
+ * higher priority.
  *
  * The MAX_USER_RT_PRIO value allows the actual maximum
  * RT priority to be separate from the value exported to
@@ -358,7 +378,12 @@ struct signal_struct {
 
 #define MAX_PRIO	(MAX_RT_PRIO + 40)
 
+#define ISO_PRIO	(MAX_RT_PRIO - 1)
+
 #define rt_task(p)		(unlikely((p)->prio < MAX_RT_PRIO))
+#define posix_rt_task(p)	(unlikely((p)->policy == SCHED_FIFO || \
+				 (p)->policy == SCHED_RR))
+#define iso_task(p)		(unlikely((p)->policy == SCHED_ISO))
 
 /*
  * Some day this will be a full-fledged user tracking system..
diff -Nrup linux-2.6.11-rc1/include/linux/sysctl.h linux-2.6.11-rc1-uiso2/include/linux/sysctl.h
--- linux-2.6.11-rc1/include/linux/sysctl.h	2005-01-21 19:46:54.717612339 +0100
+++ linux-2.6.11-rc1-uiso2/include/linux/sysctl.h	2005-01-21 20:30:38.105484416 +0100
@@ -135,6 +135,9 @@ enum
 	KERN_HZ_TIMER=65,	/* int: hz timer on or off */
 	KERN_UNKNOWN_NMI_PANIC=66, /* int: unknown nmi panic flag */
 	KERN_BOOTLOADER_TYPE=67, /* int: boot loader type */
+	KERN_ISO_CPU=68,	/* int: cpu% allowed by SCHED_ISO class */
+	KERN_ISO_TIMEOUT=69,	/* int: centisecs after SCHED_ISO is throttled */
+	KERN_ISO_THROTTLE_COUNT=70, /* int: no. of throttled SCHED_ISO ticks */
 };
 
diff -Nrup linux-2.6.11-rc1/kernel/sched.c linux-2.6.11-rc1-uiso2/kernel/sched.c
--- linux-2.6.11-rc1/kernel/sched.c	2005-01-21 19:46:55.650517137 +0100
+++ linux-2.6.11-rc1-uiso2/kernel/sched.c	2005-01-21 23:35:11.531981295 +0100
@@ -149,9 +149,6 @@
 	(JIFFIES_TO_NS(MAX_SLEEP_AVG * \
 		(MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
 
-#define TASK_PREEMPTS_CURR(p, rq) \
-	((p)->prio < (rq)->curr->prio)
-
 /*
  * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
  * to time slice values: [800ms ... 100ms ... 5ms]
@@ -171,6 +168,11 @@ static unsigned int task_timeslice(task_
 	else
 		return SCALE_PRIO(DEF_TIMESLICE, p->static_prio);
 }
+
+int iso_cpu = 70;	/* The soft %cpu limit on SCHED_ISO tasks */
+int iso_timeout = 500;	/* Centisecs after SCHED_ISO is throttled */
+int iso_throttle_count = 0;	/* No. of throttled SCHED_ISO ticks */
+
 #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \
 			< (long long) (sd)->cache_hot_time)
 
@@ -206,6 +208,8 @@ struct runqueue {
 #ifdef CONFIG_SMP
 	unsigned long cpu_load;
 #endif
+	long iso_ticks;
+	int iso_throttle;
 	unsigned long long nr_switches;
 
 	/*
@@ -297,6 +301,19 @@ static DEFINE_PER_CPU(struct runqueue, r
 # define task_running(rq, p)		((rq)->curr == (p))
 #endif
 
+static inline int task_preempts_curr(task_t *p, runqueue_t *rq)
+{
+	if (unlikely(rq->iso_throttle)) {
+		if (iso_task(p))
+			return 0;
+		if (iso_task(rq->curr))
+			return 1;
+	}
+	if (p->prio < rq->curr->prio)
+		return 1;
+	return 0;
+}
+
 /*
  * task_rq_lock - lock the runqueue a given task resides on and disable
  * interrupts. Note the ordering: we can safely lookup the task_rq without
@@ -1101,7 +1118,7 @@ out_activate:
 	 */
 	activate_task(p, rq, cpu == this_cpu);
 	if (!sync || cpu != this_cpu) {
-		if (TASK_PREEMPTS_CURR(p, rq))
+		if (task_preempts_curr(p, rq))
 			resched_task(rq->curr);
 	}
 	success = 1;
@@ -1257,7 +1274,7 @@ void fastcall wake_up_new_task(task_t *
 		p->timestamp = (p->timestamp - this_rq->timestamp_last_tick)
 				+ rq->timestamp_last_tick;
 		__activate_task(p, rq);
-		if (TASK_PREEMPTS_CURR(p, rq))
+		if (task_preempts_curr(p, rq))
 			resched_task(rq->curr);
 
 		schedstat_inc(rq, wunt_moved);
@@ -1634,7 +1651,7 @@ void pull_task(runqueue_t *src_rq, prio_
 	 * Note that idle threads have a prio of MAX_PRIO, for this test
 	 * to be always true for them.
 	 */
-	if (TASK_PREEMPTS_CURR(p, this_rq))
+	if (task_preempts_curr(p, this_rq))
 		resched_task(this_rq->curr);
 }
 
@@ -2315,6 +2332,33 @@ static void check_rlimit(struct task_str
 }
 
 /*
+ * Account RT tasks for SCHED_ISO throttle. Called every timer tick.
+ * @p: the process that gets accounted
+ */
+void account_iso_ticks(task_t *p)
+{
+	runqueue_t *rq = this_rq();
+
+	if (rt_task(p)) {
+		if (!rq->iso_throttle) {
+			rq->iso_ticks += (100 - iso_cpu);
+		}
+	} else {
+		rq->iso_ticks -= iso_cpu;
+		if (rq->iso_ticks < 0)
+			rq->iso_ticks = 0;
+	}
+
+	if (rq->iso_ticks >
+			(iso_timeout * (100 - iso_cpu) * HZ / 100 + 100)) {
+		rq->iso_throttle = 1;
+	} else {
+		rq->iso_throttle = 0;
+	}
+
+}
+
+/*
  * Account user cpu time to a process.
  * @p: the process that the cpu time gets accounted to
  * @hardirq_offset: the offset to subtract from hardirq_count()
@@ -2427,7 +2471,7 @@ void scheduler_tick(void)
 	 * timeslice. This makes it possible for interactive tasks
 	 * to use up their timeslices at their highest priority levels.
 	 */
-	if (rt_task(p)) {
+	if (posix_rt_task(p)) {
 		/*
 		 * RR tasks need a special form of timeslice management.
 		 * FIFO tasks have no timeslices.
@@ -2442,6 +2486,22 @@
 		}
 		goto out_unlock;
 	}
+
+	if (iso_task(p)) {
+		if (rq->iso_throttle) {
+			iso_throttle_count++;
+			set_tsk_need_resched(p);
+			goto out_unlock;
+		}
+		if (!(--p->time_slice % GRANULARITY)) {
+			requeue_task(p, rq->active);
+			set_tsk_need_resched(p);
+		}
+		if (!p->time_slice)
+			p->time_slice = task_timeslice(p);
+		goto out_unlock;
+	}
+
 	if (!--p->time_slice) {
 		dequeue_task(p, rq->active);
 		set_tsk_need_resched(p);
@@ -2646,6 +2706,20 @@ EXPORT_SYMBOL(sub_preempt_count);
 
 #endif
 
+static inline void expire_all_iso_tasks(prio_array_t *active,
+					prio_array_t *expired)
+{
+	struct list_head *queue;
+	task_t *next;
+
+	queue = active->queue + ISO_PRIO;
+	while (!list_empty(queue)) {
+		next = list_entry(queue->next, task_t, run_list);
+		dequeue_task(next, active);
+		enqueue_task(next, expired);
+	}
+}
+
 /*
  * schedule() is the main scheduler function.
  */
@@ -2753,6 +2827,7 @@ go_idle:
 	}
 
 	array = rq->active;
+switch_to_expired:
 	if (unlikely(!array->nr_active)) {
 		/*
 		 * Switch the active and expired arrays.
@@ -2767,6 +2842,21 @@ go_idle:
 	schedstat_inc(rq, sched_noswitch);
 
 	idx = sched_find_first_bit(array->bitmap);
+	if (unlikely(rq->iso_throttle && (idx == ISO_PRIO))) {
+		idx = find_next_bit(array->bitmap, MAX_PRIO, ISO_PRIO + 1);
+		if (idx >= MAX_PRIO) {
+			/*
+			 * only SCHED_ISO tasks in active array
+			 */
+			if (rq->expired->nr_active) {
+				expire_all_iso_tasks(array, rq->expired);
+				goto switch_to_expired;
+			} else {
+				idx = ISO_PRIO;
+			}
+		}
+	}
+
 	queue = array->queue + idx;
 	next = list_entry(queue->next, task_t, run_list);
 
@@ -3213,7 +3303,8 @@ static void __setscheduler(struct task_s
 	BUG_ON(p->array);
 	p->policy = policy;
 	p->rt_priority = prio;
-	if (policy != SCHED_NORMAL)
+
+	if (SCHED_RT(policy))
 		p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority;
 	else
 		p->prio = p->static_prio;
@@ -3238,9 +3329,8 @@ recheck:
 	/* double check policy once rq lock held */
 	if (policy < 0)
 		policy = oldpolicy = p->policy;
-	else if (policy != SCHED_FIFO && policy != SCHED_RR &&
-			policy != SCHED_NORMAL)
-		return -EINVAL;
+	else if (!SCHED_RANGE(policy))
+		return -EINVAL;
 	/*
 	 * Valid priorities for SCHED_FIFO and SCHED_RR are
 	 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0.
@@ -3248,12 +3338,19 @@ recheck:
 	 */
 	if (param->sched_priority < 0 ||
 	    param->sched_priority > MAX_USER_RT_PRIO-1)
 		return -EINVAL;
-	if ((policy == SCHED_NORMAL) != (param->sched_priority == 0))
+	if ((!SCHED_POSIX_RT(policy)) != (param->sched_priority == 0))
 		return -EINVAL;
 
-	if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
-	    !capable(CAP_SYS_NICE))
-		return -EPERM;
+	if (SCHED_POSIX_RT(policy) && !capable(CAP_SYS_NICE)) {
+		/*
+		 * If the caller requested a POSIX RT policy without
+		 * having the necessary rights, we downgrade the policy
+		 * to SCHED_ISO. Temporary hack for testing.
+		 */
+		policy = SCHED_ISO;
+		param->sched_priority = 0;
+	}
+
 	if ((current->euid != p->euid) && (current->euid != p->uid) &&
 	    !capable(CAP_SYS_NICE))
 		return -EPERM;
@@ -3287,7 +3384,7 @@ recheck:
 	if (task_running(rq, p)) {
 		if (p->prio > oldprio)
 			resched_task(rq->curr);
-	} else if (TASK_PREEMPTS_CURR(p, rq))
+	} else if (task_preempts_curr(p, rq))
 		resched_task(rq->curr);
 	}
 	task_rq_unlock(rq, &flags);
@@ -3714,6 +3811,7 @@ asmlinkage long sys_sched_get_priority_m
 		ret = MAX_USER_RT_PRIO-1;
 		break;
 	case SCHED_NORMAL:
+	case SCHED_ISO:
 		ret = 0;
 		break;
 	}
@@ -3737,6 +3835,7 @@ asmlinkage long sys_sched_get_priority_m
 		ret = 1;
 		break;
 	case SCHED_NORMAL:
+	case SCHED_ISO:
 		ret = 0;
 	}
 	return ret;
@@ -4010,7 +4109,7 @@ static void __migrate_task(struct task_s
 			+ rq_dest->timestamp_last_tick;
 	deactivate_task(p, rq_src);
 	activate_task(p, rq_dest, 0);
-	if (TASK_PREEMPTS_CURR(p, rq_dest))
+	if (task_preempts_curr(p, rq_dest))
 		resched_task(rq_dest->curr);
 }
@@ -4181,7 +4280,7 @@ void sched_idle_next(void)
 	 */
 	spin_lock_irqsave(&rq->lock, flags);
 
-	__setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
+	__setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-2);
 
 	/* Add idle task to _front_ of it's priority queue */
 	__activate_idle_task(p, rq);
@@ -4265,7 +4364,7 @@ static int migration_call(struct notifie
 		kthread_bind(p, cpu);
 		/* Must be high prio: stop_machine expects to yield to it. */
 		rq = task_rq_lock(p, &flags);
-		__setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
+		__setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-2);
 		task_rq_unlock(rq, &flags);
 		cpu_rq(cpu)->migration_thread = p;
 		break;
diff -Nrup linux-2.6.11-rc1/kernel/sysctl.c linux-2.6.11-rc1-uiso2/kernel/sysctl.c
--- linux-2.6.11-rc1/kernel/sysctl.c	2005-01-21 19:46:55.666515504 +0100
+++ linux-2.6.11-rc1-uiso2/kernel/sysctl.c	2005-01-21 20:30:21.820127147 +0100
@@ -219,6 +219,11 @@ static ctl_table root_table[] = {
 	{ .ctl_name = 0 }
 };
 
+/* Constants for minimum and maximum testing in vm_table.
+   We use these as one-element integer vectors. */
+static int zero;
+static int one_hundred = 100;
+
 static ctl_table kern_table[] = {
 	{
 		.ctl_name	= KERN_OSTYPE,
@@ -633,15 +638,36 @@ static ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 #endif
+	{
+		.ctl_name	= KERN_ISO_CPU,
+		.procname	= "iso_cpu",
+		.data		= &iso_cpu,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+		.extra2		= &one_hundred,
+	},
+	{
+		.ctl_name	= KERN_ISO_TIMEOUT,
+		.procname	= "iso_timeout",
+		.data		= &iso_timeout,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= KERN_ISO_THROTTLE_COUNT,
+		.procname	= "iso_throttle_count",
+		.data		= &iso_throttle_count,
+		.maxlen		= sizeof(int),
+		.mode		= 0444,
+		.proc_handler	= &proc_dointvec,
+	},
 	{ .ctl_name = 0 }
 };
 
-/* Constants for minimum and maximum testing in vm_table.
-   We use these as one-element integer vectors. */
-static int zero;
-static int one_hundred = 100;
-
-
 static ctl_table vm_table[] = {
 	{
 		.ctl_name	= VM_OVERCOMMIT_MEMORY,
diff -Nrup linux-2.6.11-rc1/kernel/timer.c linux-2.6.11-rc1-uiso2/kernel/timer.c
--- linux-2.6.11-rc1/kernel/timer.c	2005-01-21 19:46:55.672514892 +0100
+++ linux-2.6.11-rc1-uiso2/kernel/timer.c	2005-01-21 20:30:14.254890301 +0100
@@ -815,6 +815,8 @@ void update_process_times(int user_tick)
 	struct task_struct *p = current;
 	int cpu = smp_processor_id();
 
+	account_iso_ticks(p);
+
 	/* Note: this timer irq context must be accounted for as well. */
 	if (user_tick)
 		account_user_time(p, jiffies_to_cputime(1));
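P.S. To illustrate the arithmetic in account_iso_ticks() (my reading of the code above, with the defaults iso_cpu = 70, iso_timeout = 500 and HZ = 1000): every tick in which an RT/ISO task runs adds 100 - 70 = 30 to rq->iso_ticks, every other tick subtracts 70 (clamped at 0), so the counter only grows while RT/ISO tasks take more than iso_cpu percent of the ticks. The throttle engages above iso_timeout * (100 - iso_cpu) * HZ / 100 + 100 = 500 * 30 * 1000 / 100 + 100 = 150100, which a task hogging 100% of the CPU reaches after about 150100 / 30 ≈ 5000 ticks, i.e. after roughly iso_timeout centisecs (5 seconds).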