2013/11/01(金)jiffies更新を追いかける

Tickless kernelにより、既存の和文資料の示すところから離れだす。
仕方が無いのでソースコードを頼りに追いかけることにする。

主目的は最低限必要なタイマリソースはなにか、の確認。
scheduler、jiffiesの更新がなされれば、要件を満たすので、更新処理を遡る。

kernelはv3.10を参照した(LTSI候補だというのを見かけたはず?)

jiffies更新を追いかける

システム時刻:xtime だったはず

jiffies更新箇所
/*
 * The 64-bit jiffies value is not atomic - you MUST NOT read it
 * without sampling the sequence number in xtime_lock.
 * jiffies is defined in the linker script...
 */
void do_timer(unsigned long ticks)
{
	jiffies_64 += ticks;
	update_wall_time();
	calc_global_load(ticks);
}
コイツが大ボス. jiffiesそのものじゃないけどいいんか( system call "times"で返している)
jiffies更新は別か..

→一緒だった。物理メモリで同一アドレスにしてあるので、64の更新がjiffies更新と一致する(Little Endian)。
FILE: System.map
8xxxxxxx D jiffies
8xxxxxxx D jiffies_64

void xtime_update(unsigned long ticks) write_seqlock(&xtime_lock); do_timer(ticks); write_sequnlock(&xtime_lock);

FILE: kernel/time/tick-sched.c
/*
 * Must be called with interrupts disabled !
 */
static void tick_do_update_jiffies64(ktime_t now)
{
	unsigned long ticks = 0;
	ktime_t delta;

	/*
	 * Do a quick check without holding xtime_lock:
	 */
	delta = ktime_sub(now, last_jiffies_update);
	if (delta.tv64 < tick_period.tv64)
		return;

	/* Reevalute with xtime_lock held */
	write_seqlock(&xtime_lock);

	delta = ktime_sub(now, last_jiffies_update);
	if (delta.tv64 >= tick_period.tv64) {

		delta = ktime_sub(delta, tick_period);
		last_jiffies_update = ktime_add(last_jiffies_update,
						tick_period);

		/* Slow path for long timeouts */
		if (unlikely(delta.tv64 >= tick_period.tv64)) {
			s64 incr = ktime_to_ns(tick_period);

			ticks = ktime_divns(delta, incr);

			last_jiffies_update = ktime_add_ns(last_jiffies_update,
							   incr * ticks);
		}
		do_timer(++ticks);

		/* Keep the tick_next_period variable up to date */
		tick_next_period = ktime_add(last_jiffies_update, tick_period);
	}
	write_sequnlock(&xtime_lock);
}
tick_do_update_jiffies64()を呼び出すヒト.
 static void tick_nohz_update_jiffies(ktime_t now)
 void tick_nohz_stop_sched_tick(int inidle)
  tick_program_event()って?
 static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
 void tick_nohz_restart_sched_tick(void)
 static void tick_nohz_handler(struct clock_event_device *dev)  // The nohz low res interrupt handler
 static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) // high-res. timer持っているとき.
いずれも、ktime_get();で現時刻をとってきている(CPU寝ていても回っているカウンタを想定)


FILE: kernel/time/timekeeping.c
ktime_t ktime_get(void)
{
	unsigned int seq;
	s64 secs, nsecs;

	WARN_ON(timekeeping_suspended);

	do {
		seq = read_seqbegin(&xtime_lock);
		secs = xtime.tv_sec + wall_to_monotonic.tv_sec;
		nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec;
		nsecs += timekeeping_get_ns();
		/* If arch requires, add in gettimeoffset() */
		nsecs += arch_gettimeoffset();

	} while (read_seqretry(&xtime_lock, seq));
	/*
	 * Use ktime_set/ktime_add_ns to create a proper ktime on
	 * 32-bit architectures without CONFIG_KTIME_SCALAR.
	 */
	return ktime_add_ns(ktime_set(secs, 0), nsecs);
}
EXPORT_SYMBOL_GPL(ktime_get);
ktime_get()は、xtimeを見ている。あれ..?

コチラを見ると、http://d.hatena.ne.jp/enakai00/20111117/1321508379
別の所でハードウェアリソースを参照していた。


FILE: kernel/time/timekeeping.c
/**
 * update_wall_time - Uses the current clocksource to increment the wall time
 *
 * Called from the timer interrupt, must hold a write on xtime_lock.
 */
static void update_wall_time(void)
{
	struct clocksource *clock;
	cycle_t offset;
	int shift = 0, maxshift;

	/* Make sure we're fully resumed: */
	if (unlikely(timekeeping_suspended))
		return;

	clock = timekeeper.clock;        // ★★コレがSoC全域で使える、cpu idleでも時間を刻むクロックソースを握っている。

#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
	offset = timekeeper.cycle_interval;
#else
	offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
#endif
	timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift;

	/*
	 * With NO_HZ we may have to accumulate many cycle_intervals
	 * (think "ticks") worth of time at once. To do this efficiently,
	 * we calculate the largest doubling multiple of cycle_intervals
	 * that is smaller then the offset. We then accumulate that
	 * chunk in one go, and then try to consume the next smaller
	 * doubled multiple.
	 */
	shift = ilog2(offset) - ilog2(timekeeper.cycle_interval);
	shift = max(0, shift);
	/* Bound shift to one less then what overflows tick_length */
	maxshift = (8*sizeof(tick_length) - (ilog2(tick_length)+1)) - 1;
	shift = min(shift, maxshift);
	while (offset >= timekeeper.cycle_interval) {
		offset = logarithmic_accumulation(offset, shift);
		if(offset < timekeeper.cycle_interval<<shift)
			shift--;
	}

	/* correct the clock when NTP error is too big */
	timekeeping_adjust(offset);

	/*
	 * Since in the loop above, we accumulate any amount of time
	 * in xtime_nsec over a second into xtime.tv_sec, its possible for
	 * xtime_nsec to be fairly small after the loop. Further, if we're
	 * slightly speeding the clocksource up in timekeeping_adjust(),
	 * its possible the required corrective factor to xtime_nsec could
	 * cause it to underflow.
	 *
	 * Now, we cannot simply roll the accumulated second back, since
	 * the NTP subsystem has been notified via second_overflow. So
	 * instead we push xtime_nsec forward by the amount we underflowed,
	 * and add that amount into the error.
	 *
	 * We'll correct this error next time through this function, when
	 * xtime_nsec is not as small.
	 */
	if (unlikely((s64)timekeeper.xtime_nsec < 0)) {
		s64 neg = -(s64)timekeeper.xtime_nsec;
		timekeeper.xtime_nsec = 0;
		timekeeper.ntp_error += neg << timekeeper.ntp_error_shift;
	}


	/*
	 * Store full nanoseconds into xtime after rounding it up and
	 * add the remainder to the error difference.
	 */
	xtime.tv_nsec =	((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1;
	timekeeper.xtime_nsec -= (s64) xtime.tv_nsec << timekeeper.shift;
	timekeeper.ntp_error +=	timekeeper.xtime_nsec <<
				timekeeper.ntp_error_shift;

	/*
	 * Finally, make sure that after the rounding
	 * xtime.tv_nsec isn't larger then NSEC_PER_SEC
	 */
	if (unlikely(xtime.tv_nsec >= NSEC_PER_SEC)) {
		int leap;
		xtime.tv_nsec -= NSEC_PER_SEC;
		xtime.tv_sec++;
		leap = second_overflow(xtime.tv_sec);
		xtime.tv_sec += leap;
		wall_to_monotonic.tv_sec -= leap;
		if (leap)
			clock_was_set_delayed();
	}

	timekeeping_update(false);
}

FILE: kernel/time/timekeeping.c
/**
 * timekeeper_setup_internals - Set up internals to use clocksource clock.
 *
 * @clock:		Pointer to clocksource.
 *
 * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment
 * pair and interval request.
 *
 * Unless you're the timekeeping code, you should not be using this!
 */
static void timekeeper_setup_internals(struct clocksource *clock)
{
	cycle_t interval;
	u64 tmp, ntpinterval;

	timekeeper.clock = clock;
	clock->cycle_last = clock->read(clock);
以下略
この関数でセットしている。張り替えもできる模様。
static int change_clocksource(void *data)
FILE: kernel/time/timekeeping.c
/*
 * timekeeping_init - Initializes the clocksource and common timekeeping values
 */
void __init timekeeping_init(void)
{
	struct clocksource *clock;
	unsigned long flags;
	struct timespec now, boot;

	read_persistent_clock(&now);
	read_boot_clock(&boot);

	write_seqlock_irqsave(&xtime_lock, flags);

	ntp_init();

	clock = clocksource_default_clock();
	if (clock->enable)
		clock->enable(clock);
	timekeeper_setup_internals(clock);
...
おうふ...
clocksource_default_clock()のデフォルトは jiffiesを返すだけの論理的なクロックソースだ。
kernel/time/jiffies.c

ドライバの初期化処理で、タイマリソースを追加して、張り替える処理が走っていると想像。
/**
 * timekeeping_notify - Install a new clock source
 * @clock:		pointer to the clock source
 *
 * This function is called from clocksource.c after a new, better clock
 * source has been registered. The caller holds the clocksource_mutex.
 */
void timekeeping_notify(struct clocksource *clock)
{
	if (timekeeper.clock == clock)
		return;
	stop_machine(change_clocksource, clock, NULL);
	tick_clock_notify();
}
これか. inline関数になってる。SMPとSTOP_MACHINEが定義されていなければ関数呼び出しに置換される。
FILE: include/linux/stop_machine.h
change_clocksource( clock );


FILE: kernel/time/clocksource.c
#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET

/**
 * clocksource_select - Select the best clocksource available
 *
 * Private function. Must hold clocksource_mutex when called.
 *
 * Select the clocksource with the best rating, or the clocksource,
 * which is selected by userspace override.
 */
static void clocksource_select(void)
コレ呼び出しているところが多数ある。。
clocksource_listの先頭がbest ratingになるらしい?
→追加するときに大きい物順になるようにコーディングしてある。
static void clocksource_enqueue(struct clocksource *cs)


カウントオーバーするまでに、起き上がる保証は?

なんとなく、登録情報で上限が定まるから、ここからタイマーを張っているような気がする。
タイマを貼ってしまったらTicklessじゃないような気がしないでもないよなぁ。

FILE: include/linux/clocksource.h
/*
 * Don't call __clocksource_register_scale directly, use
 * clocksource_register_hz/khz
 */
extern int
__clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq);
extern void
__clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq);

static inline int clocksource_register_hz(struct clocksource *cs, u32 hz)
{
	return __clocksource_register_scale(cs, 1, hz);
}
FILE: kernel/time/clocksource.c
/**
 * __clocksource_register_scale - Used to install new clocksources
 * @t:		clocksource to be registered
 * @scale:	Scale factor multiplied against freq to get clocksource hz
 * @freq:	clocksource frequency (cycles per second) divided by scale
 *
 * Returns -EBUSY if registration fails, zero otherwise.
 *
 * This *SHOULD NOT* be called directly! Please use the
 * clocksource_register_hz() or clocksource_register_khz helper functions.
 */
int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
{

	/* Initialize mult/shift and max_idle_ns */
	__clocksource_updatefreq_scale(cs, scale, freq);   // ★★★ここでcs->max_idle_nsに最長時間を設定してくれる。

	/* Add clocksource to the clcoksource list */
	mutex_lock(&clocksource_mutex);
	clocksource_enqueue(cs);
	clocksource_enqueue_watchdog(cs);
	clocksource_select();
	mutex_unlock(&clocksource_mutex);
	return 0;
}
EXPORT_SYMBOL_GPL(__clocksource_register_scale);
この最長時間を返す関数が以下にある。(timerクロックリソースを保持するオブジェクト(ファイルスコープ))
FILE: kernel/time/timekeeping.c
u64 timekeeping_max_deferment(void)
これを使っていて、スケジューラ絡みのソースは以下。
FILE: kernel/time/tick-sched.c
/**
 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
 *
 * When the next event is more than a tick into the future, stop the idle tick
 * Called either from the idle loop or from irq_exit() when an idle period was
 * just interrupted by an interrupt which did not cause a reschedule.
 */
void tick_nohz_stop_sched_tick(int inidle)
{

		if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
			hrtimer_start(&ts->sched_timer, expires,
				      HRTIMER_MODE_ABS_PINNED);
			/* Check, if the timer was already in the past */
			if (hrtimer_active(&ts->sched_timer))
				goto out;
		} else if (!tick_program_event(expires, 0))
				goto out;
tickを止める前に、タイマを貼ってますなぁ。ハイレゾが有効ならそれ。違えば普通の。

ハイレゾタイマが存在している場合。
FILE: kernel/hrtimer.c
/**
 * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
 * @timer:	the timer to be added
 * @tim:	expiry time
 * @delta_ns:	"slack" range for the timer
 * @mode:	expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
 *
 * Returns:
 *  0 on success
 *  1 when the timer was active
 */
int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
		unsigned long delta_ns, const enum hrtimer_mode mode)
{
	return __hrtimer_start_range_ns(timer, tim, delta_ns, mode, 1);
}
EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
"nohz_mode = NOHZ_MODE_HIGHRES"を設定しているのは、以下。
/**
 * tick_setup_sched_timer - setup the tick emulation timer
 */
void tick_setup_sched_timer(void)
{
	struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
	ktime_t now = ktime_get();

	/*
	 * Emulate tick processing via per-CPU hrtimers:
	 */
	hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
	ts->sched_timer.function = tick_sched_timer;

	/* Get the next period (per cpu) */
	hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());

	for (;;) {
		hrtimer_forward(&ts->sched_timer, now, tick_period);
		hrtimer_start_expires(&ts->sched_timer,
				      HRTIMER_MODE_ABS_PINNED);
		/* Check, if the timer was already in the past */
		if (hrtimer_active(&ts->sched_timer))
			break;
		now = ktime_get();
	}

#ifdef CONFIG_NO_HZ
	if (tick_nohz_enabled) {
		ts->nohz_mode = NOHZ_MODE_HIGHRES;
		printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id());
	}
#endif
}
#endif /* HIGH_RES_TIMERS */

普通の。
/**
 * tick_program_event
 */
int tick_program_event(ktime_t expires, int force)
{
	struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);

	return tick_dev_program_event(dev, expires, force);
}

タイマリソースの張替えが自由にできてしまうので、
AMPで同一カウンタを参照するような実装をしたければ、
抜けないようにしておくのが良い。stop requestでnack返せばいいかなぁ?(要確認)





ハイレゾタイマがあれば、それを使ってしまうぽい?
FILE: kernel/timer.c
/*
 * This function runs timers and the timer-tq in bottom half context.
 */
static void run_timer_softirq(struct softirq_action *h)
{
	struct tvec_base *base = __this_cpu_read(tvec_bases);

	hrtimer_run_pending();

	if (time_after_eq(jiffies, base->timer_jiffies))
		__run_timers(base);
}
void __init init_timers(void)
{
	int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
				(void *)(long)smp_processor_id());

	init_timer_stats();

	BUG_ON(err != NOTIFY_OK);
	register_cpu_notifier(&timers_nb);
	open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
}

FILE: kernel/timer.c
/*
 * Called from timer softirq every jiffy, expire hrtimers:
 *
 * For HRT its the fall back code to run the softirq in the timer
 * softirq context in case the hrtimer initialization failed or has
 * not been done yet.
 */
void hrtimer_run_pending(void)
{
	if (hrtimer_hres_active())   //★★ ->hres_activeを返す。
		return;

	/*
	 * This _is_ ugly: We have to check in the softirq context,
	 * whether we can switch to highres and / or nohz mode. The
	 * clocksource switch happens in the timer interrupt with
	 * xtime_lock held. Notification from there only sets the
	 * check bit in the tick_oneshot code, otherwise we might
	 * deadlock vs. xtime_lock.
	 */
	if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) // hrtimer_is_hres_enabled()は真になりやすい(hrtimer_hres_enabledを返す)
		hrtimer_switch_to_hres();
}
hrtimer_hres_active()が
/*
 * Switch to high resolution mode
 */
static int hrtimer_switch_to_hres(void)
{
	int i, cpu = smp_processor_id();
	struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu);
	unsigned long flags;

	if (base->hres_active)
		return 1;

	local_irq_save(flags);

	if (tick_init_highres()) {
		local_irq_restore(flags);
		printk(KERN_WARNING "Could not switch to high resolution "
				    "mode on CPU %d\n", cpu);
		return 0;
	}
	base->hres_active = 1;
	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
		base->clock_base[i].resolution = KTIME_HIGH_RES;

	tick_setup_sched_timer();
	/* "Retrigger" the interrupt to get things going */
	retrigger_next_event(NULL);
	local_irq_restore(flags);
	return 1;
}
clocksourceとclockeventとhrtimerと。
clockeventがhrtimerの代わりになれる?
clocksourceは、永続的に回るカウンタで、NO_HZに必要、か。
起き上がるのに別途タイマが必要になってくる。
(compare "以上"で割込みが出せるなら、一発でいける、か?)
hrtimerとしては登録しない

参考
 http://www.spinics.net/lists/linux-rt-users/msg06427.html


clocksource
 カレンダ時計みたいなもん。NO_HZで起き上がった時にデルタを見るために使う。
登録したratioの最大値のものを採用する(名前を指定して動的にもとれる)
デフォルトがjiffiesになっているので、昔ながらの定周期割込みでも運用できるぽい。

clockevent
 hrtimerにも使われる(どこでそうしているか未確認)
 属性をしっかりと設定して、PERIODIC,ONESHOTがアレば良さそう。
 インタフェースと使われ方を照会して、インプリする。

 ドキュメントはあまりない
(2013/03のコミットで、timerまわりの設計思想はthomasが書いているけど、ドライバの実装の話はナシ)



これ、ticklessだと?無効ですよ..
FILE: arm/kernel/time.c
#ifndef CONFIG_GENERIC_CLOCKEVENTS
/*
 * Kernel system timer support.
 */
void timer_tick(void)
{
	profile_tick(CPU_PROFILING);
	do_leds();
	xtime_update(1);
#ifndef CONFIG_SMP
	update_process_times(user_mode(get_irq_regs()));
#endif
}
#endif


別件

ARMのbootメモは、以下も詳しい。なぞる感じになるなぁ
http://amitshah.bizhat.com/arm/arm_linux_boot-1.html