timer-hash-04.09.02.patch: This patch changes the implementation of the kernel timer lists from cascaded arrays of lists to a hashed array of lists. This simplifies the management of the lists, and eliminates the overhead of re-cascading the lists. To provide timer scalability, a kernel config option is introduced to set the hash array size. Signed-off-by: Geoff Levand for CELF --- init/Kconfig | 11 +++ kernel/timer.c | 184 ++++++++++++++++++++++----------------------------------- 2 files changed, 83 insertions(+), 112 deletions(-) diff -ruN -X ./dontdiff tag_LINUX_2_6_8_1/init/Kconfig branch_TIMER_HASH/init/Kconfig --- tag_LINUX_2_6_8_1/init/Kconfig 2004-09-02 15:27:15.000000000 -0700 +++ branch_TIMER_HASH/init/Kconfig 2004-09-02 16:43:53.000000000 -0700 @@ -178,6 +178,17 @@ 13 => 8 KB 12 => 4 KB +config TIMER_LIST_COUNT + int "Timer list count (9 => 512, 10 => 1024)" + default 9 + help + This value sets as a power of 2 the size (bucket count) of the + hash array the kernel uses for managing timers. + 8 => 256 + 9 => 512 (default) + 10 => 1024 + If unsure, just use the default value. 
+ config HOTPLUG bool "Support for hot-pluggable devices" if !ARCH_S390 default ARCH_S390 diff -ruN -X ./dontdiff tag_LINUX_2_6_8_1/kernel/timer.c branch_TIMER_HASH/kernel/timer.c --- tag_LINUX_2_6_8_1/kernel/timer.c 2004-09-02 15:27:20.000000000 -0700 +++ branch_TIMER_HASH/kernel/timer.c 2004-09-02 16:43:54.000000000 -0700 @@ -37,33 +37,18 @@ #include #include +#define LIST_COUNT (1 << (CONFIG_TIMER_LIST_COUNT)) +#define HASH_MASK (LIST_COUNT - 1) + /* * per-CPU timer vector definitions: */ -#define TVN_BITS 6 -#define TVR_BITS 8 -#define TVN_SIZE (1 << TVN_BITS) -#define TVR_SIZE (1 << TVR_BITS) -#define TVN_MASK (TVN_SIZE - 1) -#define TVR_MASK (TVR_SIZE - 1) - -typedef struct tvec_s { - struct list_head vec[TVN_SIZE]; -} tvec_t; - -typedef struct tvec_root_s { - struct list_head vec[TVR_SIZE]; -} tvec_root_t; struct tvec_t_base_s { spinlock_t lock; unsigned long timer_jiffies; struct timer_list *running_timer; - tvec_root_t tv1; - tvec_t tv2; - tvec_t tv3; - tvec_t tv4; - tvec_t tv5; + struct list_head tv[LIST_COUNT]; } ____cacheline_aligned_in_smp; typedef struct tvec_t_base_s tvec_base_t; @@ -103,47 +88,50 @@ check_timer_failed(timer); } - static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) { - unsigned long expires = timer->expires; - unsigned long idx = expires - base->timer_jiffies; - struct list_head *vec; - - if (idx < TVR_SIZE) { - int i = expires & TVR_MASK; - vec = base->tv1.vec + i; - } else if (idx < 1 << (TVR_BITS + TVN_BITS)) { - int i = (expires >> TVR_BITS) & TVN_MASK; - vec = base->tv2.vec + i; - } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) { - int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK; - vec = base->tv3.vec + i; - } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) { - int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK; - vec = base->tv4.vec + i; - } else if ((signed long) idx < 0) { + struct list_head *pos; + + if(time_before_eq(timer->expires, base->timer_jiffies)) + { /* - * Can happen if you add 
a timer with expires == jiffies, - * or you set a timer to go off in the past - */ - vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK); - } else { - int i; - /* If the timeout is larger than 0xffffffff on 64-bit - * architectures then we use the maximum timeout: + * Expired, schedule for next tick. */ - if (idx > 0xffffffffUL) { - idx = 0xffffffffUL; - expires = idx + base->timer_jiffies; + + const unsigned int idx = base->timer_jiffies & HASH_MASK; + + pos = (&base->tv[idx])->next; + } + else + { + const unsigned int idx = timer->expires & HASH_MASK; + + if ((timer->expires - base->timer_jiffies) < LIST_COUNT) + { + /* + * Expires within the range of the hash array, + * so put at head of list and avoid doing search. + */ + + pos = (&base->tv[idx])->next; + } + else + { + /* + * Search from the tail, looking for insert point. + */ + + struct list_head *list_start = &base->tv[idx]; + for (pos = list_start->prev; pos != list_start; pos = pos->prev) { + struct timer_list *temp + = list_entry(pos, struct timer_list, entry); + if (time_before(temp->expires, timer->expires)) + break; + } } - i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; - vec = base->tv5.vec + i; } - /* - * Timers are FIFO: - */ - list_add_tail(&timer->entry, vec); + + list_add_tail(&timer->entry, pos); } int __mod_timer(struct timer_list *timer, unsigned long expires) @@ -387,78 +375,55 @@ EXPORT_SYMBOL(del_singleshot_timer_sync); #endif -static int cascade(tvec_base_t *base, tvec_t *tv, int index) -{ - /* cascade all the timers from tv up one level */ - struct list_head *head, *curr; - - head = tv->vec + index; - curr = head->next; - /* - * We are removing _all_ timers from the list, so we don't have to - * detach them individually, just clear the list afterwards. 
- */ - while (curr != head) { - struct timer_list *tmp; - - tmp = list_entry(curr, struct timer_list, entry); - BUG_ON(tmp->base != base); - curr = curr->next; - internal_add_timer(base, tmp); - } - INIT_LIST_HEAD(head); - - return index; -} - /*** * __run_timers - run all expired timers (if any) on this CPU. * @base: the timer vector to be processed. * - * This function cascades all vectors and executes all expired timer - * vectors. + * Called from softirq which calls with irq enabled. + * We may assume this and not save the flags. */ -#define INDEX(N) (base->timer_jiffies >> (TVR_BITS + N * TVN_BITS)) & TVN_MASK -static inline void __run_timers(tvec_base_t *base) +static void __run_timers(tvec_base_t *base) { - struct timer_list *timer; - spin_lock_irq(&base->lock); - while (time_after_eq(jiffies, base->timer_jiffies)) { - struct list_head work_list = LIST_HEAD_INIT(work_list); - struct list_head *head = &work_list; - int index = base->timer_jiffies & TVR_MASK; - + + do { + const unsigned int idx = base->timer_jiffies & HASH_MASK; + struct list_head * const head = &base->tv[idx]; + /* - * Cascade timers: + * Continue to pick the first entry from head until all + * are processed. This allows new entries to be + * added while unlocked during the function call. 
*/ - if (!index && - (!cascade(base, &base->tv2, INDEX(0))) && - (!cascade(base, &base->tv3, INDEX(1))) && - !cascade(base, &base->tv4, INDEX(2))) - cascade(base, &base->tv5, INDEX(3)); - ++base->timer_jiffies; - list_splice_init(base->tv1.vec + index, &work_list); -repeat: - if (!list_empty(head)) { + + while (!list_empty(head)) { + struct timer_list *timer; void (*fn)(unsigned long); unsigned long data; - timer = list_entry(head->next,struct timer_list,entry); - fn = timer->function; - data = timer->data; + timer = list_entry(head->next, struct timer_list, entry); + + if (time_after(timer->expires, jiffies)) + break; + + fn = timer->function; + data = timer->data; list_del(&timer->entry); + timer->entry.next = timer->entry.prev = NULL; set_running_timer(base, timer); smp_wmb(); timer->base = NULL; + spin_unlock_irq(&base->lock); fn(data); spin_lock_irq(&base->lock); - goto repeat; } - } + ++base->timer_jiffies; + + } while(time_before(base->timer_jiffies, jiffies)); + set_running_timer(base, NULL); spin_unlock_irq(&base->lock); } @@ -1315,14 +1280,9 @@ base = &per_cpu(tvec_bases, cpu); spin_lock_init(&base->lock); - for (j = 0; j < TVN_SIZE; j++) { - INIT_LIST_HEAD(base->tv5.vec + j); - INIT_LIST_HEAD(base->tv4.vec + j); - INIT_LIST_HEAD(base->tv3.vec + j); - INIT_LIST_HEAD(base->tv2.vec + j); - } - for (j = 0; j < TVR_SIZE; j++) - INIT_LIST_HEAD(base->tv1.vec + j); + + for (j = 0; j < LIST_COUNT; j++) + INIT_LIST_HEAD(base->tv + j); base->timer_jiffies = jiffies; }