2012/08/15(水)[ppc32]do_IRQまでの道のり(暫定メモ)

この記事は、めっさ中途半端です。
個人メモとして引用したり妄想事を書いています。
マクロ展開も脳内処理なので、適切なconfigurationでcompilerを通すと
期待していない出力を得ることがあります。

あくまでも自己責任で参考にお使いください。
コードを読んで、こうですよ、と判ればシェアいただけると幸いです。

powerpc architectureの割り込みハンドラを追う

ことの発端は、kernel2.6.33.9+rt31で、ベンダパッチやオリジナルのドライバを運用したときに
BUG()などに引っかかっていたのが要因。
in_atomic()が真で、schedule()関数が呼ばれていたのが問題という判定。
その経緯を追ってみると、spinlock()からきていたり、もともとはkememalloc()だったり。
ディスパッチしない要求でまわってきていたのに、さすがにおかしいので、まぁちょっと
このあたりの詳細を依存部に頼っても明確にしていこうと考えたわけですね。
ある程度見て、主要なarchのものも同様に見ていくと、どういうモードでどこを走行するのかが
わかるかと思います。

割り込みハンドラ

PowerPC e500アーキについて考えます
powerpcの割り込みヴェクタ

kernel bootからdo_IRQまで。

ここから先頭になりますね。割り込みヴェクタの登録などやっているので追いかけましょう。
@linux-2.6.33.9/arch/powerpc/kernel/head_fsl_booke.S
_ENTRY(_stext);
_ENTRY(_start);

_ENTRY(__early_start)
 ...

	/* Establish the interrupt vector offsets */
	SET_IVOR(0,  CriticalInput);
	SET_IVOR(1,  MachineCheck);
	SET_IVOR(2,  DataStorage);
	SET_IVOR(3,  InstructionStorage);
	SET_IVOR(4,  ExternalInput);
	SET_IVOR(5,  Alignment);
	SET_IVOR(6,  Program);
	SET_IVOR(7,  FloatingPointUnavailable);
	SET_IVOR(8,  SystemCall);
	SET_IVOR(9,  AuxillaryProcessorUnavailable);
	SET_IVOR(10, Decrementer);
	SET_IVOR(11, FixedIntervalTimer);
	SET_IVOR(12, WatchdogTimer);
	SET_IVOR(13, DataTLBError);
	SET_IVOR(14, InstructionTLBError);
	SET_IVOR(15, DebugCrit);

	/* Establish the interrupt vector base */
	lis	r4,interrupt_base@h	/* IVPR only uses the high 16-bits */
	mtspr	SPRN_IVPR,r4
/*
 * Interrupt vector entry code
 *
 * The Book E MMUs are always on so we don't need to handle
 * interrupts in real mode as with previous PPC processors. In
 * this case we handle interrupts in the kernel virtual address
 * space.
 *
 * Interrupt vectors are dynamically placed relative to the
 * interrupt prefix as determined by the address of interrupt_base.
 * The interrupt vectors offsets are programmed using the labels
 * for each interrupt vector entry.
 *
 * Interrupt vectors must be aligned on a 16 byte boundary.
 * We align on a 32 byte cache line boundary for good measure.
 */

interrupt_base:
	/* Critical Input Interrupt */
	CRITICAL_EXCEPTION(0x0100, CriticalInput, unknown_exception)

	/* Machine Check Interrupt */
#ifdef CONFIG_E200
	/* no RFMCI, MCSRRs on E200 */
	CRITICAL_EXCEPTION(0x0200, MachineCheck, machine_check_exception)
#else
	MCHECK_EXCEPTION(0x0200, MachineCheck, machine_check_exception)
#endif


	/* Data Storage Interrupt */
	START_EXCEPTION(DataStorage)
	NORMAL_EXCEPTION_PROLOG
	mfspr	r5,SPRN_ESR		/* Grab the ESR, save it, pass arg3 */
	stw	r5,_ESR(r11)
	mfspr	r4,SPRN_DEAR		/* Grab the DEAR, save it, pass arg2 */
	andis.	r10,r5,(ESR_ILK|ESR_DLK)@h
	bne	1f
	EXC_XFER_EE_LITE(0x0300, handle_page_fault)
1:
	addi	r3,r1,STACK_FRAME_OVERHEAD
	EXC_XFER_EE_LITE(0x0300, CacheLockingException)

	/* Instruction Storage Interrupt */
	INSTRUCTION_STORAGE_EXCEPTION

	/* External Input Interrupt */
	EXCEPTION(0x0500, ExternalInput, do_IRQ, EXC_XFER_LITE)

	/* Alignment Interrupt */
	ALIGNMENT_EXCEPTION

	/* Program Interrupt */
	PROGRAM_EXCEPTION

	/* Floating Point Unavailable Interrupt */
#ifdef CONFIG_PPC_FPU
	FP_UNAVAILABLE_EXCEPTION
#else
#ifdef CONFIG_E200
	/* E200 treats 'normal' floating point instructions as FP Unavail exception */
	EXCEPTION(0x0800, FloatingPointUnavailable, program_check_exception, EXC_XFER_EE)
#else
	EXCEPTION(0x0800, FloatingPointUnavailable, unknown_exception, EXC_XFER_EE)
#endif
#endif

	/* System Call Interrupt */
	START_EXCEPTION(SystemCall)
	NORMAL_EXCEPTION_PROLOG
	EXC_XFER_EE_LITE(0x0c00, DoSyscall)


@linux-2.6.33.9/arch/powerpc/kernel/head_booke.h
これも、ppcのアーキテクチャによって異なるので、注意な。
/*
 * Exception vectors.
 */
/*
 * START_EXCEPTION(label): open an exception vector entry.  Emits `.align 5`
 * and then defines `label`, so each entry starts on an aligned boundary.
 */
#define	START_EXCEPTION(label)						     \
        .align 5;              						     \
label:

/*
 * FINISH_EXCEPTION(func): branch-and-link to transfer_to_handler_full and
 * place two words right after the call site: the handler `func` and
 * ret_from_except_full.  The callee reads both words through LR.
 */
#define FINISH_EXCEPTION(func)					\
	bl	transfer_to_handler_full;			\
	.long	func;						\
	.long	ret_from_except_full

/*
 * EXCEPTION(n, label, hdlr, xfer): build a complete vector entry.
 *   n     - trap number handed to the xfer macro
 *   label - name of the vector entry
 *   hdlr  - handler the xfer macro dispatches to
 *   xfer  - transfer macro (e.g. EXC_XFER_LITE) invoked as xfer(n, hdlr)
 * Runs the normal prolog, sets r3 = r1 + STACK_FRAME_OVERHEAD as the first
 * argument for the handler, then expands the transfer macro.
 */
#define EXCEPTION(n, label, hdlr, xfer)				\
	START_EXCEPTION(label);					\
	NORMAL_EXCEPTION_PROLOG;				\
	addi	r3,r1,STACK_FRAME_OVERHEAD;			\
	xfer(n, hdlr)

まとめ
EXCEPTION(0x0500, ExternalInput, do_IRQ, EXC_XFER_LITE)
展開する
// Hand expansion of EXCEPTION(0x0500, ExternalInput, do_IRQ, EXC_XFER_LITE).
// START_EXCEPTION(ExternalInput)
.align 5;
ExternalInput:

// NORMAL_EXCEPTION_PROLOG
	mtspr	SPRN_SPRG_WSCRATCH0,r10;	/* save two registers to work with */
	mtspr	SPRN_SPRG_WSCRATCH1,r11;
	mtspr	SPRN_SPRG_WSCRATCH2,r1;
// Saved: {SPRG0, SPRG1, SPRG4 (write alias SPRG4W)} <= {r10, r11, r1}

	mfcr	r10;						/* save CR in r10 for now	   */
	mfspr	r11,SPRN_SRR1;				/* check whether user or kernel    */
 // Author's note: SRR1 reportedly holds the saved machine state; see the
 // reference manual section "2.7.1.1 Save/Restore Register 0/1 (SRR0 and SRR1)".
	andi.	r11,r11,MSR_PR;
	beq	1f;
	// Path taken only when the interrupt came from user mode (MSR_PR set)
	 mfspr	r1,SPRN_SPRG_THREAD;		/* if from user, start at top of   */
	 lwz	r1,THREAD_INFO-THREAD(r1);		/* this thread's kernel stack   */
	 ALLOC_STACK_FRAME(r1, THREAD_SIZE);	// THREAD_SIZE is (1<<13) with THREAD_SHIFT 13; presumably sets up the stack top - TODO confirm ALLOC_STACK_FRAME

	 // Common path; reached directly when interrupted in supervisor mode
1:	subi	r1,r1,INT_FRAME_SIZE;		/* Allocate an exception frame     */
	mr	r11,r1;
	stw	r10,_CCR(r11);					/* save various registers	   */
	stw	r12,GPR12(r11);
	stw	r9,GPR9(r11);
	mfspr	r10,SPRN_SPRG_RSCRATCH0;	// original r10, parked in the scratch SPRG above
	stw	r10,GPR10(r11);
	mfspr	r12,SPRN_SPRG_RSCRATCH1;	// original r11
	stw	r12,GPR11(r11);
	mflr	r10;
	stw	r10,_LINK(r11);
	mfspr	r10,SPRN_SPRG_RSCRATCH2;	// original r1 (stack pointer at interrupt time)
	mfspr	r12,SPRN_SRR0;			// stays in r12; transfer_to_handler stores it to _NIP
	stw	r10,GPR1(r11);
	mfspr	r9,SPRN_SRR1;			// stays in r9; transfer_to_handler stores it to _MSR
	stw	r10,0(r11);
	rlwinm	r9,r9,0,14,12;				/* clear MSR_WE (necessary?)	   */
	stw	r0,GPR0(r11);
	SAVE_4GPRS(3, r11);
	SAVE_2GPRS(7, r11)

	addi	r3,r1,STACK_FRAME_OVERHEAD

	xfer(n, hdlr)
User mode (problem state)
 0: The processor is in supervisor mode, can execute any instruction,
  and can access any resource (for example, GPRs, SPRs, and the MSR).
 1: The processor is in user mode,
  cannot execute any privileged instruction, and cannot access any privileged resource.

 PR also affects memory access control.

SPRGxの使いかたがコメントにかかれている。R/Wの話もかかれているので便利ね〜。
 * All 32-bit:
 *	- SPRG3 current thread_info pointer
 *        (virtual on BookE, physical on others)
 * 32-bit 440 and FSL BookE:
 *	- SPRG0 scratch for exception vectors
 *	- SPRG1 scratch for exception vectors (*)
 *	- SPRG2 scratch for crit interrupts handler
 *	- SPRG4 scratch for exception vectors
 *	- SPRG5 scratch for exception vectors
 *	- SPRG6 scratch for machine check handler
 *	- SPRG7 scratch for exception vectors
 *	- SPRG9 scratch for debug vectors (e500 only)
 *
 *      Additionally, BookE separates "read" and "write"
 *      of those registers. That allows to use the userspace
 *      readable variant for reads, which can avoid a fault
 *      with KVM type virtualization.


@linux-2.6.33.9/arch/powerpc/include/asm/thread_info.h
/* We have 8k stacks on ppc32 and 16k on ppc64 */

#if defined(CONFIG_PPC64)
#define THREAD_SHIFT		14
#elif defined(CONFIG_PPC_256K_PAGES)
#define THREAD_SHIFT		15
#else
#define THREAD_SHIFT		13
#endif

#define THREAD_SIZE		(1 << THREAD_SHIFT)
@linux-2.6.33.9/arch/powerpc/include/asm/reg.h
#ifdef CONFIG_BOOKE
/*
 * Write-side aliases of the scratch SPRGs used by the exception prologs.
 * WSCRATCH2 maps to the write variant of SPRG4 (SPRG4W).
 */
#define SPRN_SPRG_WSCRATCH0	SPRN_SPRG0
#define SPRN_SPRG_WSCRATCH1	SPRN_SPRG1
#define SPRN_SPRG_WSCRATCH2	SPRN_SPRG4W
Special Purpose Register Generalは、0-2,3はSupervisor mode用。3だけインプリマター。
4-7はUser read only, Supervisor modeはRead/Write.
と、これだけだとSPR addressでread/writeを分けていると読み取れないのだが、
ヘッダファイルを見る限りは書き込みは高位アドレス、読み出しはSuperVisorでも低位アドレスを使うようだな。

FILE: arch/powerpc/kernel/head_fsl_booke.S
	/* External Input Interrupt */
	EXCEPTION(0x0500, ExternalInput, do_IRQ, EXC_XFER_LITE)
	/* System Call Interrupt */
	START_EXCEPTION(SystemCall)
	NORMAL_EXCEPTION_PROLOG
	EXC_XFER_EE_LITE(0x0c00, DoSyscall)

/*
 * Exception vectors.
 */
#define	START_EXCEPTION(label)						     \
        .align 5;              						     \
label:

// Hand expansion of a prolog variant that parks r11/r13 in THREAD_NORMSAVE
// slots of the thread struct, followed by the EXC_XFER_LITE transfer sequence.
//#define NORMAL_EXCEPTION_PROLOG
	mtspr	SPRN_SPRG_WSCRATCH0, r10;	/* save one register */
	mfspr	r10, SPRN_SPRG_THREAD;
	stw	r11, THREAD_NORMSAVE(0)(r10);
	stw	r13, THREAD_NORMSAVE(2)(r10);
	mfcr	r13;			/* save CR in r13 for now	   */
	mfspr	r11,SPRN_SRR1;		/* check whether user or kernel    */
	andi.	r11,r11,MSR_PR;
	mr	r11, r1;		// default: keep building on the current stack
	beq	1f;
	/* if from user, start at top of this thread's kernel stack */
	lwz	r11, THREAD_INFO-THREAD(r10);
	ALLOC_STACK_FRAME(r11, THREAD_SIZE);
1 :	subi	r11, r11, INT_FRAME_SIZE; /* Allocate exception frame (author's note: struct pt_regs plus some extra) */

	stw	r13, _CCR(r11);		/* save various registers */
	stw	r12,GPR12(r11);
	stw	r9,GPR9(r11);
	mfspr	r13, SPRN_SPRG_RSCRATCH0;
	stw	r13, GPR10(r11);
	lwz	r12, THREAD_NORMSAVE(0)(r10);	// NOTE(author): this macro seems to be missing from the tree being read - TODO confirm
	stw	r12,GPR11(r11);
	lwz	r13, THREAD_NORMSAVE(2)(r10); /* restore r13 */
	mflr	r10;
	stw	r10,_LINK(r11);
	mfspr	r12,SPRN_SRR0;		// input to transfer_to_handler: the instruction address to return to
	stw	r1, GPR1(r11);
	mfspr	r9,SPRN_SRR1;
	stw	r1, 0(r11);
	mr	r1, r11;		// r1 <= r11 as SP
	rlwinm	r9,r9,0,14,12;		/* clear MSR_WE (necessary?)  */ // author's note: clear it; needed to shut off the power-management wait state
	stw	r0,GPR0(r11);
	SAVE_4GPRS(3, r11);	// store r3,r4,r5,r6
	SAVE_2GPRS(7, r11)	// store r7,r8


	addi	r3,r1,STACK_FRAME_OVERHEAD;

	li	r10,trap;	// trap is n+1 under EXC_XFER_LITE, i.e. 0x0500+1 for ExternalInput
	stw	r10,_TRAP(r11);
	lis	r10,MSR_KERNEL@h;
	ori	r10,r10,MSR_KERNEL@l;
	NOCOPY(r10, r9);	// author's note: expands to no instruction
	bl	transfer_to_handler // r11 = r1 = base of the exception frame
	.long	hdlr;
	.long	ret



 => EXC_XFER_LITE(0x0500,do_IRQ)
EXC_XFER_TEMPLATE(do_IRQ, 0x0500+1, MSR_KERNEL, NOCOPY, transfer_to_handler, ret_from_except)

#define EXCEPTION(n, label, hdlr, xfer)				\
	START_EXCEPTION(label);					\
	NORMAL_EXCEPTION_PROLOG;				\
	addi	r3,r1,STACK_FRAME_OVERHEAD;			\
	xfer(n, hdlr)



/*
 * EXC_XFER_TEMPLATE(hdlr, trap, msr, copyee, tfer, ret): common tail of an
 * exception entry.
 *   - stores `trap` into _TRAP of the frame addressed by r11
 *   - loads the constant `msr` into r10 and applies copyee(r10, r9)
 *   - branches-and-links to `tfer`, with two words placed after the call
 *     site: the handler `hdlr` and the return path `ret`
 */
#define EXC_XFER_TEMPLATE(hdlr, trap, msr, copyee, tfer, ret)	\
	li	r10,trap;					\
	stw	r10,_TRAP(r11);					\
	lis	r10,msr@h;					\
	ori	r10,r10,msr@l;					\
	copyee(r10, r9);					\
	bl	tfer;		 				\
	.long	hdlr;						\
	.long	ret

/*
 * EXC_XFER_LITE(n, hdlr): transfer through transfer_to_handler with
 * MSR_KERNEL and NOCOPY, returning through ret_from_except.  The trap value
 * is n+1; do_user_signal in entry_32.S tests that low bit before saving
 * r13-r31 ("if not already done") and then clears it.
 */
#define EXC_XFER_LITE(n, hdlr)		\
	EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL, NOCOPY, transfer_to_handler, \
			  ret_from_except)


@linux-2.6.33.9/arch/powerpc/kernel/entry_32.S
割り込みハンドラで使うハンドラぽいやつ。
/*
 * This code finishes saving the registers to the exception frame
 * and jumps to the appropriate handler for the exception, turning
 * on address translation.
 * Note that we rely on the caller having set cr0.eq iff the exception
 * occurred in kernel mode (i.e. MSR:PR = 0).
 */
	.globl	transfer_to_handler_full
transfer_to_handler_full:
	SAVE_NVGPRS(r11)
	/* fall through */

	.globl	transfer_to_handler
 // Register state on entry, as used below:
 //   r9  = MSR of the interrupted context (stored to _MSR)
 //   r10 = MSR value for the handler (moved to SRR1 before RFI)
 //   r11 = pointer to the exception frame (saved register structure)
 //   r12 = NIP of the interrupted context (stored to _NIP)
transfer_to_handler:
	stw	r2,GPR2(r11)
	stw	r12,_NIP(r11)	// save the address to resume at after the interrupt
	stw	r9,_MSR(r11)
	andi.	r2,r9,MSR_PR	// test MSR_PR of the interrupted context; cr0 feeds the beq below (r2 is reloaded right after)
	mfctr	r12		// r12 <= ctr(count register)
	mfspr	r2,SPRN_XER	// "Integer Exception Register"
	stw	r12,_CTR(r11)
	stw	r2,_XER(r11)
	mfspr	r12,SPRN_SPRG_THREAD
	addi	r2,r12,-THREAD
	tovirt(r2,r2)			/* set r2 to current */
	beq	2f			/* if from user, fix up THREAD.regs */
	addi	r11,r1,STACK_FRAME_OVERHEAD
	stw	r11,PT_REGS(r12)
#if defined(CONFIG_40x) || defined(CONFIG_BOOKE)
	/* Check to see if the dbcr0 register is set up to debug.  Use the
	   internal debug mode bit to do this. */
	lwz	r12,THREAD_DBCR0(r12)
	andis.	r12,r12,DBCR0_IDM@h
	beq+	3f
	/* From user and task is ptraced - load up global dbcr0 */
	li	r12,-1			/* clear all pending debug events */
	mtspr	SPRN_DBSR,r12
	lis	r11,global_dbcr0@ha
	tophys(r11,r11)
	addi	r11,r11,global_dbcr0@l
#ifdef CONFIG_SMP
	rlwinm	r9,r1,0,0,(31-THREAD_SHIFT)
	lwz	r9,TI_CPU(r9)
	slwi	r9,r9,3
	add	r11,r11,r9
#endif
	lwz	r12,0(r11)
	mtspr	SPRN_DBCR0,r12
	lwz	r12,4(r11)
	addi	r12,r12,-1
	stw	r12,4(r11)
#endif
	b	3f

2:	/* if from kernel, check interrupted DOZE/NAP mode and
         * check for stack overflow
         */
	lwz	r9,KSP_LIMIT(r12)
	cmplw	r1,r9			/* if r1 <= ksp_limit */
	ble-	stack_ovf		/* then the kernel stack overflowed */
5:
#if defined(CONFIG_6xx) || defined(CONFIG_E500)
	rlwinm	r9,r1,0,0,31-THREAD_SHIFT
	tophys(r9,r9)			/* check local flags */
	lwz	r12,TI_LOCAL_FLAGS(r9)
	mtcrf	0x01,r12
	bt-	31-TLF_NAPPING,4f
	bt-	31-TLF_SLEEPING,7f
#endif /* CONFIG_6xx || CONFIG_E500 */
	.globl transfer_to_handler_cont
transfer_to_handler_cont:
3:
	mflr	r9		// reached via bl, so LR points at the two words after the call site: handler address, then return address
	lwz	r11,0(r9)		/* virtual address of handler */
	lwz	r9,4(r9)		/* where to go when done */
#ifdef CONFIG_TRACE_IRQFLAGS
	lis	r12,reenable_mmu@h
	ori	r12,r12,reenable_mmu@l
	mtspr	SPRN_SRR0,r12		// address of reenable_mmu
	mtspr	SPRN_SRR1,r10		// MSR value loaded by the caller, e.g. MSR_KERNEL (author's note: MSR_ME|MSR_RI|MSR_CE)
	SYNC
	RFI
reenable_mmu:				/* re-enable mmu so we can */
	mfmsr	r10
	lwz	r12,_MSR(r1)
	xor	r10,r10,r12
	andi.	r10,r10,MSR_EE		/* Did EE change? */
	beq	1f
 // Author's note: the tracing path below runs only when EE differs between
 // the current MSR and the saved one. EE is 0 by default here, so this is
 // the case where the interrupted context was running with EE=1.
	/* Save handler and return address into the 2 unused words
	 * of the STACK_FRAME_OVERHEAD (sneak sneak sneak). Everything
	 * else can be recovered from the pt_regs except r3 which for
	 * normal interrupts has been set to pt_regs and for syscalls
	 * is an argument, so we temporarily use ORIG_GPR3 to save it
	 */
  // TODO(author): check the ABI to confirm why these registers are saved and reloaded
	stw	r9,8(r1)
	stw	r11,12(r1)
	stw	r3,ORIG_GPR3(r1)
	bl	trace_hardirqs_off	// author's note: only present when tracing is enabled; otherwise empty
	lwz	r0,GPR0(r1)
	lwz	r3,ORIG_GPR3(r1)
	lwz	r4,GPR4(r1)
	lwz	r5,GPR5(r1)
	lwz	r6,GPR6(r1)
	lwz	r7,GPR7(r1)
	lwz	r8,GPR8(r1)
	lwz	r9,8(r1)
	lwz	r11,12(r1)

1:	mtctr	r11
	mtlr	r9
	bctr				/* jump to handler */
#else /* CONFIG_TRACE_IRQFLAGS */
	mtspr	SPRN_SRR0,r11
	mtspr	SPRN_SRR1,r10
	mtlr	r9
	SYNC
	RFI				/* jump to handler, enable MMU */
#endif /* CONFIG_TRACE_IRQFLAGS */

#if defined (CONFIG_6xx) || defined(CONFIG_E500)
4:	rlwinm	r12,r12,0,~_TLF_NAPPING
	stw	r12,TI_LOCAL_FLAGS(r9)
	b	power_save_ppc32_restore

7:	rlwinm	r12,r12,0,~_TLF_SLEEPING
	stw	r12,TI_LOCAL_FLAGS(r9)
	lwz	r9,_MSR(r11)		/* if sleeping, clear MSR.EE */
	rlwinm	r9,r9,0,~MSR_EE
	lwz	r12,_LINK(r11)		/* and return to address in LR */
	b	fast_exception_return
#endif


@arch/powerpc/include/asm/ptrace.h
#define STACK_FRAME_OVERHEAD	16	/* size of minimum stack frame */
#define STACK_FRAME_LR_SAVE	1	/* Location of LR in stack frame */
#define STACK_FRAME_REGS_MARKER	ASM_CONST(0x72656773)
#define STACK_INT_FRAME_SIZE	(sizeof(struct pt_regs) + STACK_FRAME_OVERHEAD)
#define STACK_FRAME_MARKER	2

/* Size of stack frame allocated when calling signal handler. */
#define __SIGNAL_FRAMESIZE	64
struct pt_regs {
	unsigned long gpr[32];
	unsigned long nip;
	unsigned long msr;
	unsigned long orig_gpr3;	/* Used for restarting system calls */
	unsigned long ctr;
	unsigned long link;
	unsigned long xer;
	unsigned long ccr;
#ifdef __powerpc64__
	unsigned long softe;		/* Soft enabled/disabled */
#else
	unsigned long mq;		/* 601 only (not used at present) */
					/* Used on APUS to hold IPL value. */
#endif
	unsigned long trap;		/* Reason for being here */
	/* N.B. for critical exceptions on 4xx, the dar and dsisr
	   fields are overloaded to hold srr0 and srr1. */
	unsigned long dar;		/* Fault registers */
	unsigned long dsisr;		/* on 4xx/Book-E used for ESR */
	unsigned long result;		/* Result of a system call */
};
@arch/powerpc/kernel/asm-offsets.c
	DEFINE(THREAD, offsetof(struct task_struct, thread));

なんちゃってで追いかけると、以下の差分を取り込んでないと納得できないわけだが、
とりあえずスルー。必要なレジスタをタスク構造体にあるスレッド構造体に放りこんでいる、と解釈。
thread_struct内部のkspがいつセットされるのかが気になるわけだがナー。

http://git.opencores.org/?a=commitdiff&p=linux&h=1325a684b553d4b5c41ae0482f8991b43f945746

 rlwinm	r9,r9,0,14,12;		/* clear MSR_WE (necessary?)  */
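rlwinm rA,rS,SH,MB,ME は、rSをSHビット左ローテートした結果を、bit MB..ME を1としたマスク(bit0がMSB)とandする命令。ここではSH=0なので、中のゼロはシフト(ローテート)量で、ローテートはしない。
MB=14 > ME=12 なのでマスクは回り込み、14..31, 0..12を1にするから13だけが0。つまりr9のbit13(MSR_WE)だけをクリアしている。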

SPRメモ

MSR

WE : bit45(64bit表記ではMSBがbit32になるので、32bit accessなら bit13な)
Wait state enable. On the e500, this allows the core complex to signal a request for power management,
according to the states of HID0[DOZE], HID0[NAP], and HID0[SLEEP].
 0: The processor is not in wait state and continues processing.
    On the e500, no power management request is signaled to external logic.

 1: The processor enters wait state by ceasing to execute instructions and entering low-power mode.
  Details of how wait state is entered and exited and how the processor behaves in the wait state are implementation dependent.
  On the e500, MSR[WE] gates the DOZE, NAP, and SLEEP outputs from the core complex; as a result, these outputs negate to the external power management logic on entry to the interrupt and then return to their previous state on return from the interrupt.
  WE is cleared on entry to any interrupt and restored to its previous state upon return.
CE : bit 46 Critical Interrupt Enable
Critical enable. Book E defines this bit as an enable for the critical input, watchdog timer, and machine check
interrupts. On the e500, this bit does not affect machine check interrupts.
 0: Critical input and watchdog timer interrupts are disabled.
 1: Critical input and watchdog timer interrupts are enabled.
EE : bit48
External enable
 0: External input, decrementer, fixed-interval timer, and performance monitor interrupts are disabled.
 1: External input, decrementer, fixed-interval timer, and performance monitor interrupts are enabled.
PR : bit 49(bit17)
User mode (problem state)
 0: The processor is in supervisor mode, can execute any instruction, and can access any resource (for example, GPRs, SPRs, and the MSR).
 1: The processor is in user mode, cannot execute any privileged instruction, and cannot access any privileged resource.

PR also affects memory access control.
ME : bit 51 Machine Check Enable
Machine check enable.
 0: Machine check interrupts are disabled. On e500 cores, a machine check condition causes a checkstop.
 1: Machine check interrupts are enabled.
RI : bit 62 Recoverable Exception // should be zeroってかいてあるぜ?
 e500core rmには記載がなく、reserved。ただし、割り込み復帰でクリアするような記載があるので、無視していいか?
@RM: "5.7.5 External Input Interrupt"
EEは降りるし、PRは 0になるので、割り込み禁止で飛んでくる。
CE, ME, and DE are unchanged. All other MSR bits are cleared

で?

綺麗に読み切れてません...
結局、user modeへ割り込む場合は、そのtaskが持つkernel stack
プロセッサ毎に用意したkernel専用 stackを使っているようです。
*1
thread_info構造体は、プロセッサ毎に1つだけ存在するモノで、
プロセス・スレッドのbody?となるtask構造体のポインタを持っている作りのようです。
多重割り込みに耐えられる程度のスタックを用意するとも書いてあるので、
ppcではEE以外のexceptionなどもコレをシェアするのでしょう。
EEに関しては、PICで優先度制御を任せてしまうんですかね。要調査。

とりあえず専用のスタックを用いることは判ったのですが、
x86だと, 奇妙なことが書かれています*2
動作モードに応じたスタックを用意しているとあります。
HARD-IRQ/SOFT-IRQで、それぞれスタックをCPU毎に用意し、例外スタックはprocess毎にあるようです。

例外スタックは、ユーザ空間の実行時エラーを拾うモノなので良いでしょう。


抜ける側のコード

do_IRQ()の返値でscheduleかsignalかナニもせずに戻るか、を選択しているくさい。

@linux-2.6.33.9/arch/powerpc/kernel/entry_32.S

	.globl	ret_from_except_full
ret_from_except_full:
	REST_NVGPRS(r1)
	/* fall through */

	.globl	ret_from_except
ret_from_except:
	/* Hard-disable interrupts so that current_thread_info()->flags
	 * can't change between when we test it and when we return
	 * from the interrupt. */
	/* Note: We don't bother telling lockdep about it */
	LOAD_MSR_KERNEL(r10,MSR_KERNEL)
	SYNC			/* Some chip revs have problems here... */
	MTMSRD(r10)		/* disable interrupts */

	lwz	r3,_MSR(r1)	/* Returning to user mode? */
	andi.	r0,r3,MSR_PR
	beq	resume_kernel

user_exc_return:		/* r10 contains MSR_KERNEL here */
	/* Check current_thread_info()->flags */
	rlwinm	r9,r1,0,0,(31-THREAD_SHIFT)
	lwz	r9,TI_FLAGS(r9)
	andi.	r0,r9,_TIF_USER_WORK_MASK
	bne	do_work

restore_user:
#if defined(CONFIG_4xx) || defined(CONFIG_BOOKE)
	/* Check whether this process has its own DBCR0 value.  The internal
	   debug mode bit tells us that dbcr0 should be loaded. */
	lwz	r0,THREAD+THREAD_DBCR0(r2)
	andis.	r10,r0,DBCR0_IDM@h
	bnel-	load_dbcr0
#endif

#ifdef CONFIG_PREEMPT
	b	restore

/* N.B. the only way to get here is from the beq following ret_from_except. */
resume_kernel:
	/* check current_thread_info->preempt_count */
	rlwinm	r9,r1,0,0,(31-THREAD_SHIFT)
	lwz	r0,TI_PREEMPT(r9)
	cmpwi	0,r0,0		/* if non-zero, just restore regs and return */
	bne	restore
	lwz	r0,TI_FLAGS(r9)
	andi.	r0,r0,_TIF_NEED_RESCHED
	beq+	restore
	andi.	r0,r3,MSR_EE	/* interrupts off? */
	beq	restore		/* don't schedule if so */
#ifdef CONFIG_TRACE_IRQFLAGS
	/* Lockdep thinks irqs are enabled, we need to call
	 * preempt_schedule_irq with IRQs off, so we inform lockdep
	 * now that we -did- turn them off already
	 */
	bl	trace_hardirqs_off
#endif
1:	bl	preempt_schedule_irq
	rlwinm	r9,r1,0,0,(31-THREAD_SHIFT)
	lwz	r3,TI_FLAGS(r9)
	andi.	r0,r3,_TIF_NEED_RESCHED
	bne-	1b
#ifdef CONFIG_TRACE_IRQFLAGS
	/* And now, to properly rebalance the above, we tell lockdep they
	 * are being turned back on, which will happen when we return
	 */
	bl	trace_hardirqs_on
#endif
#else
resume_kernel:
#endif /* CONFIG_PREEMPT */

	/* interrupts are hard-disabled at this point */
restore:
#ifdef CONFIG_44x
	lis	r4,icache_44x_need_flush@ha
	lwz	r5,icache_44x_need_flush@l(r4)
	cmplwi	cr0,r5,0
	beq+	1f
	li	r6,0
	iccci	r0,r0
	stw	r6,icache_44x_need_flush@l(r4)
1:
#endif  /* CONFIG_44x */

	lwz	r9,_MSR(r1)
#ifdef CONFIG_TRACE_IRQFLAGS
	/* Lockdep doesn't know about the fact that IRQs are temporarily turned
	 * off in this assembly code while peeking at TI_FLAGS() and such. However
	 * we need to inform it if the exception turned interrupts off, and we
	 * are about to trun them back on.
	 *
	 * The problem here sadly is that we don't know whether the exceptions was
	 * one that turned interrupts off or not. So we always tell lockdep about
	 * turning them on here when we go back to wherever we came from with EE
	 * on, even if that may meen some redudant calls being tracked. Maybe later
	 * we could encode what the exception did somewhere or test the exception
	 * type in the pt_regs but that sounds overkill
	 */
	andi.	r10,r9,MSR_EE
	beq	1f
	bl	trace_hardirqs_on
	lwz	r9,_MSR(r1)
1:
#endif /* CONFIG_TRACE_IRQFLAGS */

	lwz	r0,GPR0(r1)
	lwz	r2,GPR2(r1)
	REST_4GPRS(3, r1)
	REST_2GPRS(7, r1)

	lwz	r10,_XER(r1)
	lwz	r11,_CTR(r1)
	mtspr	SPRN_XER,r10
	mtctr	r11

	PPC405_ERR77(0,r1)
BEGIN_FTR_SECTION
	lwarx	r11,0,r1
END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX)
	stwcx.	r0,0,r1			/* to clear the reservation */

#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
	andi.	r10,r9,MSR_RI		/* check if this exception occurred */
	beql	nonrecoverable		/* at a bad place (MSR:RI = 0) */

	lwz	r10,_CCR(r1)
	lwz	r11,_LINK(r1)
	mtcrf	0xFF,r10
	mtlr	r11

	/*
	 * Once we put values in SRR0 and SRR1, we are in a state
	 * where exceptions are not recoverable, since taking an
	 * exception will trash SRR0 and SRR1.  Therefore we clear the
	 * MSR:RI bit to indicate this.  If we do take an exception,
	 * we can't return to the point of the exception but we
	 * can restart the exception exit path at the label
	 * exc_exit_restart below.  -- paulus
	 */
	LOAD_MSR_KERNEL(r10,MSR_KERNEL & ~MSR_RI)
	SYNC
	MTMSRD(r10)		/* clear the RI bit */
	.globl exc_exit_restart
exc_exit_restart:
	lwz	r12,_NIP(r1)
	FIX_SRR1(r9,r10)
	mtspr	SPRN_SRR0,r12
	mtspr	SPRN_SRR1,r9
	REST_4GPRS(9, r1)
	lwz	r1,GPR1(r1)
	.globl exc_exit_restart_end
exc_exit_restart_end:
	SYNC
	RFI

#else /* !(CONFIG_4xx || CONFIG_BOOKE) */
	/*
	 * This is a bit different on 4xx/Book-E because it doesn't have
	 * the RI bit in the MSR.
	 * The TLB miss handler checks if we have interrupted
	 * the exception exit path and restarts it if so
	 * (well maybe one day it will... :).
	 */
	lwz	r11,_LINK(r1)
	mtlr	r11
	lwz	r10,_CCR(r1)
	mtcrf	0xff,r10
	REST_2GPRS(9, r1)
	.globl exc_exit_restart
exc_exit_restart:
	lwz	r11,_NIP(r1)
	lwz	r12,_MSR(r1)
exc_exit_start:
	mtspr	SPRN_SRR0,r11
	mtspr	SPRN_SRR1,r12
	REST_2GPRS(11, r1)
	lwz	r1,GPR1(r1)
	.globl exc_exit_restart_end
exc_exit_restart_end:
	PPC405_ERR77_SYNC
	rfi
	b	.			/* prevent prefetch past rfi */

/*
 * Returning from a critical interrupt in user mode doesn't need
 * to be any different from a normal exception.  For a critical
 * interrupt in the kernel, we just return (without checking for
 * preemption) since the interrupt may have happened at some crucial
 * place (e.g. inside the TLB miss handler), and because we will be
 * running with r1 pointing into critical_stack, not the current
 * process's kernel stack (and therefore current_thread_info() will
 * give the wrong answer).
 * We have to restore various SPRs that may have been in use at the
 * time of the critical interrupt.
 *
 */
#ifdef CONFIG_40x
#define PPC_40x_TURN_OFF_MSR_DR						    \
	/* avoid any possible TLB misses here by turning off MSR.DR, we	    \
	 * assume the instructions here are mapped by a pinned TLB entry */ \
	li	r10,MSR_IR;						    \
	mtmsr	r10;							    \
	isync;								    \
	tophys(r1, r1);
#else
#define PPC_40x_TURN_OFF_MSR_DR
#endif

#define RET_FROM_EXC_LEVEL(exc_lvl_srr0, exc_lvl_srr1, exc_lvl_rfi)	\
	REST_NVGPRS(r1);						\
	lwz	r3,_MSR(r1);						\
	andi.	r3,r3,MSR_PR;						\
	LOAD_MSR_KERNEL(r10,MSR_KERNEL);				\
	bne	user_exc_return;					\
	lwz	r0,GPR0(r1);						\
	lwz	r2,GPR2(r1);						\
	REST_4GPRS(3, r1);						\
	REST_2GPRS(7, r1);						\
	lwz	r10,_XER(r1);						\
	lwz	r11,_CTR(r1);						\
	mtspr	SPRN_XER,r10;						\
	mtctr	r11;							\
	PPC405_ERR77(0,r1);						\
	stwcx.	r0,0,r1;		/* to clear the reservation */	\
	lwz	r11,_LINK(r1);						\
	mtlr	r11;							\
	lwz	r10,_CCR(r1);						\
	mtcrf	0xff,r10;						\
	PPC_40x_TURN_OFF_MSR_DR;					\
	lwz	r9,_DEAR(r1);						\
	lwz	r10,_ESR(r1);						\
	mtspr	SPRN_DEAR,r9;						\
	mtspr	SPRN_ESR,r10;						\
	lwz	r11,_NIP(r1);						\
	lwz	r12,_MSR(r1);						\
	mtspr	exc_lvl_srr0,r11;					\
	mtspr	exc_lvl_srr1,r12;					\
	lwz	r9,GPR9(r1);						\
	lwz	r12,GPR12(r1);						\
	lwz	r10,GPR10(r1);						\
	lwz	r11,GPR11(r1);						\
	lwz	r1,GPR1(r1);						\
	PPC405_ERR77_SYNC;						\
	exc_lvl_rfi;							\
	b	.;		/* prevent prefetch past exc_lvl_rfi */

#define	RESTORE_xSRR(exc_lvl_srr0, exc_lvl_srr1)			\
	lwz	r9,_##exc_lvl_srr0(r1);					\
	lwz	r10,_##exc_lvl_srr1(r1);				\
	mtspr	SPRN_##exc_lvl_srr0,r9;					\
	mtspr	SPRN_##exc_lvl_srr1,r10;

#if defined(CONFIG_PPC_BOOK3E_MMU)
#ifdef CONFIG_PHYS_64BIT
#define	RESTORE_MAS7							\
	lwz	r11,MAS7(r1);						\
	mtspr	SPRN_MAS7,r11;
#else
#define	RESTORE_MAS7
#endif /* CONFIG_PHYS_64BIT */
#define RESTORE_MMU_REGS						\
	lwz	r9,MAS0(r1);						\
	lwz	r10,MAS1(r1);						\
	lwz	r11,MAS2(r1);						\
	mtspr	SPRN_MAS0,r9;						\
	lwz	r9,MAS3(r1);						\
	mtspr	SPRN_MAS1,r10;						\
	lwz	r10,MAS6(r1);						\
	mtspr	SPRN_MAS2,r11;						\
	mtspr	SPRN_MAS3,r9;						\
	mtspr	SPRN_MAS6,r10;						\
	RESTORE_MAS7;
#elif defined(CONFIG_44x)
#define RESTORE_MMU_REGS						\
	lwz	r9,MMUCR(r1);						\
	mtspr	SPRN_MMUCR,r9;
#else
#define RESTORE_MMU_REGS
#endif

#ifdef CONFIG_40x
	.globl	ret_from_crit_exc
ret_from_crit_exc:
	mfspr	r9,SPRN_SPRG_THREAD
	lis	r10,saved_ksp_limit@ha;
	lwz	r10,saved_ksp_limit@l(r10);
	tovirt(r9,r9);
	stw	r10,KSP_LIMIT(r9)
	lis	r9,crit_srr0@ha;
	lwz	r9,crit_srr0@l(r9);
	lis	r10,crit_srr1@ha;
	lwz	r10,crit_srr1@l(r10);
	mtspr	SPRN_SRR0,r9;
	mtspr	SPRN_SRR1,r10;
	RET_FROM_EXC_LEVEL(SPRN_CSRR0, SPRN_CSRR1, PPC_RFCI)
#endif /* CONFIG_40x */

#ifdef CONFIG_BOOKE
	.globl	ret_from_crit_exc
ret_from_crit_exc:
	mfspr	r9,SPRN_SPRG_THREAD
	lwz	r10,SAVED_KSP_LIMIT(r1)
	stw	r10,KSP_LIMIT(r9)
	RESTORE_xSRR(SRR0,SRR1);
	RESTORE_MMU_REGS;
	RET_FROM_EXC_LEVEL(SPRN_CSRR0, SPRN_CSRR1, PPC_RFCI)

	.globl	ret_from_debug_exc
ret_from_debug_exc:
	mfspr	r9,SPRN_SPRG_THREAD
	lwz	r10,SAVED_KSP_LIMIT(r1)
	stw	r10,KSP_LIMIT(r9)
	lwz	r9,THREAD_INFO-THREAD(r9)
	rlwinm	r10,r1,0,0,(31-THREAD_SHIFT)
	lwz	r10,TI_PREEMPT(r10)
	stw	r10,TI_PREEMPT(r9)
	RESTORE_xSRR(SRR0,SRR1);
	RESTORE_xSRR(CSRR0,CSRR1);
	RESTORE_MMU_REGS;
	RET_FROM_EXC_LEVEL(SPRN_DSRR0, SPRN_DSRR1, PPC_RFDI)

	.globl	ret_from_mcheck_exc
ret_from_mcheck_exc:
	mfspr	r9,SPRN_SPRG_THREAD
	lwz	r10,SAVED_KSP_LIMIT(r1)
	stw	r10,KSP_LIMIT(r9)
	RESTORE_xSRR(SRR0,SRR1);
	RESTORE_xSRR(CSRR0,CSRR1);
	RESTORE_xSRR(DSRR0,DSRR1);
	RESTORE_MMU_REGS;
	RET_FROM_EXC_LEVEL(SPRN_MCSRR0, SPRN_MCSRR1, PPC_RFMCI)
#endif /* CONFIG_BOOKE */

/*
 * Load the DBCR0 value for a task that is being ptraced,
 * having first saved away the global DBCR0.  Note that r0
 * has the dbcr0 value to set upon entry to this.
 */
load_dbcr0:
	mfmsr	r10		/* first disable debug exceptions */
	rlwinm	r10,r10,0,~MSR_DE
	mtmsr	r10
	isync
	mfspr	r10,SPRN_DBCR0
	lis	r11,global_dbcr0@ha
	addi	r11,r11,global_dbcr0@l
#ifdef CONFIG_SMP
	rlwinm	r9,r1,0,0,(31-THREAD_SHIFT)
	lwz	r9,TI_CPU(r9)
	slwi	r9,r9,3
	add	r11,r11,r9
#endif
	stw	r10,0(r11)
	mtspr	SPRN_DBCR0,r0
	lwz	r10,4(r11)
	addi	r10,r10,1
	stw	r10,4(r11)
	li	r11,-1
	mtspr	SPRN_DBSR,r11	/* clear all pending debug events */
	blr

	.section .bss
	.align	4
global_dbcr0:
	.space	8*NR_CPUS
	.previous
#endif /* !(CONFIG_4xx || CONFIG_BOOKE) */

do_work:			/* r10 contains MSR_KERNEL here */
	andi.	r0,r9,_TIF_NEED_RESCHED
	beq	do_user_signal

do_resched:			/* r10 contains MSR_KERNEL here */
	/* Note: We don't need to inform lockdep that we are enabling
	 * interrupts here. As far as it knows, they are already enabled
	 */
	ori	r10,r10,MSR_EE
	SYNC
	MTMSRD(r10)		/* hard-enable interrupts */
	bl	schedule
recheck:
	/* Note: And we don't tell it we are disabling them again
	 * neither. Those disable/enable cycles used to peek at
	 * TI_FLAGS aren't advertised.
	 */
	LOAD_MSR_KERNEL(r10,MSR_KERNEL)
	SYNC
	MTMSRD(r10)		/* disable interrupts */
	rlwinm	r9,r1,0,0,(31-THREAD_SHIFT)
	lwz	r9,TI_FLAGS(r9)
	andi.	r0,r9,_TIF_NEED_RESCHED
	bne-	do_resched
	andi.	r0,r9,_TIF_USER_WORK_MASK
	beq	restore_user
do_user_signal:			/* r10 contains MSR_KERNEL here */
	ori	r10,r10,MSR_EE
	SYNC
	MTMSRD(r10)		/* hard-enable interrupts */
	/* save r13-r31 in the exception frame, if not already done */
	lwz	r3,_TRAP(r1)
	andi.	r0,r3,1
	beq	2f
	SAVE_NVGPRS(r1)
	rlwinm	r3,r3,0,0,30
	stw	r3,_TRAP(r1)
2:	addi	r3,r1,STACK_FRAME_OVERHEAD
	mr	r4,r9
	bl	do_signal
	REST_NVGPRS(r1)
	b	recheck

/*
 * We come here when we are at the end of handling an exception
 * that occurred at a place where taking an exception will lose
 * state information, such as the contents of SRR0 and SRR1.
 */
nonrecoverable:
	lis	r10,exc_exit_restart_end@ha
	addi	r10,r10,exc_exit_restart_end@l
	cmplw	r12,r10
	bge	3f
	lis	r11,exc_exit_restart@ha
	addi	r11,r11,exc_exit_restart@l
	cmplw	r12,r11
	blt	3f
	lis	r10,ee_restarts@ha
	lwz	r12,ee_restarts@l(r10)
	addi	r12,r12,1
	stw	r12,ee_restarts@l(r10)
	mr	r12,r11		/* restart at exc_exit_restart */
	blr
3:	/* OK, we can't recover, kill this process */
	/* but the 601 doesn't implement the RI bit, so assume it's OK */
BEGIN_FTR_SECTION
	blr
END_FTR_SECTION_IFSET(CPU_FTR_601)
	lwz	r3,_TRAP(r1)
	andi.	r0,r3,1
	beq	4f
	SAVE_NVGPRS(r1)
	rlwinm	r3,r3,0,0,30
	stw	r3,_TRAP(r1)
4:	addi	r3,r1,STACK_FRAME_OVERHEAD
	bl	nonrecoverable_exception
	/* shouldn't return */
	b	4b

	.section .bss
	.align	2
ee_restarts:
	.space	4
	.previous

*1 : 詳解Linuxカーネル 第3版 "3.2 プロセスディスクリプタ", "3.3 プロセス切り替え"

*2 : 詳解Linuxカーネル 第3版 "4.6.1.4 複数のカーネルモードスタック"