Issue
I was debugging Linux boot and trying to understand how percpu variables work on arm64. As a test, I added a function called read_pkcontext1 which returns the percpu variable printk_context (this value is used by printk), and I found something I can't understand. (This is from Linux 5.4.21.)
==== kernel/printk/printk_safe.c ====
int read_pkcontext1(void) /* function I added for test */
{
        return this_cpu_read(printk_context);
}
==== include/linux/percpu-defs.h ====
/*
* Operations with implied preemption/interrupt protection. These
* operations can be used without worrying about preemption or interrupt.
*/
#define this_cpu_read(pcp) __pcpu_size_call_return(this_cpu_read_, pcp)
==== include/linux/percpu-defs.h ====
#define __pcpu_size_call_return(stem, variable) \
({ \
        typeof(variable) pscr_ret__; \
        __verify_pcpu_ptr(&(variable)); \
        switch(sizeof(variable)) { \
        case 1: pscr_ret__ = stem##1(variable); break; \
        case 2: pscr_ret__ = stem##2(variable); break; \
        case 4: pscr_ret__ = stem##4(variable); break; \
        case 8: pscr_ret__ = stem##8(variable); break; \
        default: \
                __bad_size_call_parameter(); break; \
        } \
        pscr_ret__; \
})
This is the result of aarch64-none-elf-objdump -S vmlinux for the read_pkcontext1 function and the functions it uses (compiled with optimization off).
ffffffc0100f0dc0 <read_pkcontext1>:
void write_pkcontext(void);
#pragma GCC push_options
#pragma GCC optimize ("O0")
int read_pkcontext1(void)
{
ffffffc0100f0dc0: a9bd7bfd stp x29, x30, [sp, #-48]!
ffffffc0100f0dc4: 910003fd mov x29, sp
return this_cpu_read(printk_context);
ffffffc0100f0dc8: f9000fff str xzr, [sp, #24]
ffffffc0100f0dcc: 52800020 mov w0, #0x1 // #1
ffffffc0100f0dd0: 94000018 bl ffffffc0100f0e30 <__preempt_count_add>
... skip ...
ffffffc0100f0e30 <__preempt_count_add>:
ffffffc0100f0e30: d5384101 mrs x1, sp_el0
ffffffc0100f0e34: b9401022 ldr w2, [x1, #16]
pc += val;
ffffffc0100f0e38: 0b020000 add w0, w0, w2
case 4: *(volatile __u32 *)p = *(__u32 *)res; break;
ffffffc0100f0e3c: b9001020 str w0, [x1, #16]
}
ffffffc0100f0e40: d65f03c0 ret
In the code above, read_pkcontext1 calls __preempt_count_add with w0 = #1 (incrementing the preemption count), and __preempt_count_add adds that value (w0) to a variable at offset #16 from sp_el0 and writes it back. So this variable looks like the preemption count, apparently stored on the stack (I guess this prevents preemption). My question is: when was this value on the stack defined and initialized? I couldn't find it in the Linux source. (Using QEMU, I can see this value is 1 and is incremented to 2 by __preempt_count_add. Of course it is decremented back to 1 after the percpu variable access.)
Solution
this_cpu_read(printk_context) expands to:
⇒ __pcpu_size_call_return(this_cpu_read_, printk_context)
⇒
({
        typeof(printk_context) pscr_ret__;
        __verify_pcpu_ptr(&(printk_context));
        switch(sizeof(printk_context)) {
        case 1: pscr_ret__ = this_cpu_read_1(printk_context); break;
        case 2: pscr_ret__ = this_cpu_read_2(printk_context); break;
        case 4: pscr_ret__ = this_cpu_read_4(printk_context); break;
        case 8: pscr_ret__ = this_cpu_read_8(printk_context); break;
        default:
                __bad_size_call_parameter(); break;
        }
        pscr_ret__;
})
sizeof(printk_context) is 4, so the case 4 branch is used: pscr_ret__ = this_cpu_read_4(printk_context);
The this_cpu_read_4() macro is defined by #include <asm/percpu.h>:
==== arch/arm64/include/asm/percpu.h ====
#define this_cpu_read_4(pcp) \
        _pcp_protect_return(__percpu_read_32, pcp)
#define _pcp_protect_return(op, pcp, args...) \
({ \
        typeof(pcp) __retval; \
        preempt_disable_notrace(); \
        __retval = (typeof(pcp))op(raw_cpu_ptr(&(pcp)), ##args); \
        preempt_enable_notrace(); \
        __retval; \
})
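Substituting the two definitions above, this_cpu_read_4(printk_context) works out to roughly the following (a simplified sketch of the expansion, not the verbatim preprocessor output):
({
        typeof(printk_context) __retval;
        preempt_disable_notrace();
        __retval = (typeof(printk_context))__percpu_read_32(raw_cpu_ptr(&(printk_context)));
        preempt_enable_notrace();
        __retval;
})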
That is where the preempt count manipulation occurs. The preempt_disable_notrace() and preempt_enable_notrace() macros are defined by #include <linux/preempt.h>.
==== include/linux/preempt.h ====
#define preempt_enable_notrace() \
do { \
        barrier(); \
        __preempt_count_dec(); \
} while (0)
#define preempt_disable_notrace() \
do { \
        __preempt_count_inc(); \
        barrier(); \
} while (0)
#define __preempt_count_inc() __preempt_count_add(1)
#define __preempt_count_dec() __preempt_count_sub(1)
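So, around the per-CPU read, the code effectively does the following (sketch; it matches the mov w0, #1 / bl __preempt_count_add seen in the question's disassembly, since the helpers were not inlined there):
__preempt_count_add(1);   /* preempt_disable_notrace() */
barrier();                /* keeps the compiler from moving the access out of the region */
/* ... __percpu_read_32(raw_cpu_ptr(&printk_context)) ... */
barrier();
__preempt_count_sub(1);   /* preempt_enable_notrace() */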
__preempt_count_add() and __preempt_count_sub() are defined by #include <asm/preempt.h>.
==== arch/arm64/include/asm/preempt.h ====
static inline void __preempt_count_add(int val)
{
        u32 pc = READ_ONCE(current_thread_info()->preempt.count);
        pc += val;
        WRITE_ONCE(current_thread_info()->preempt.count, pc);
}
static inline void __preempt_count_sub(int val)
{
        u32 pc = READ_ONCE(current_thread_info()->preempt.count);
        pc -= val;
        WRITE_ONCE(current_thread_info()->preempt.count, pc);
}
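That lines up with the disassembly of __preempt_count_add in the question (annotated sketch; the exact register allocation is the compiler's choice, and the rest of this answer explains where sp_el0 and the #16 offset come from):
mrs     x1, sp_el0        // current_thread_info(): the task pointer lives in sp_el0
ldr     w2, [x1, #16]     // READ_ONCE(current_thread_info()->preempt.count), at offset 16
add     w0, w0, w2        // pc += val  (val was passed in w0)
str     w0, [x1, #16]     // WRITE_ONCE(current_thread_info()->preempt.count, pc)
ret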
For arm64, CONFIG_THREAD_INFO_IN_TASK is enabled, so current_thread_info() is defined as a macro by #include <linux/thread_info.h>.
==== include/linux/thread_info.h ====
#ifdef CONFIG_THREAD_INFO_IN_TASK
/*
* For CONFIG_THREAD_INFO_IN_TASK kernels we need <asm/current.h> for the
* definition of current, but for !CONFIG_THREAD_INFO_IN_TASK kernels,
* including <asm/current.h> can cause a circular dependency on some platforms.
*/
#include <asm/current.h>
#define current_thread_info() ((struct thread_info *)current)
#endif
The current macro is defined by #include <asm/current.h>.
==== arch/arm64/include/asm/current.h ====
#define current get_current()
/*
* We don't use read_sysreg() as we want the compiler to cache the value where
* possible.
*/
static __always_inline struct task_struct *get_current(void)
{
        unsigned long sp_el0;
        asm ("mrs %0, sp_el0" : "=r" (sp_el0));
        return (struct task_struct *)sp_el0;
}
There is some magic in arch/arm64/kernel/entry.S relating to the use of the sp_el0 stack pointer to point to the current thread_info / task_struct. Sorry, I do not have time to study the gory details, but it was introduced by commit 6cdf9c7ca687 ("arm64: Store struct thread_info in sp_el0").
The key thing is that the sp_el0 register is not the same as sp. The kernel does not run at EL0, so sp_el0 is available as a "scratch" register, and the kernel uses it to point to the current thread_info / task_struct.
struct task_struct is defined by #include <linux/sched.h>.
==== include/linux/sched.h ====
struct task_struct {
#ifdef CONFIG_THREAD_INFO_IN_TASK
        /*
         * For reasons of header soup (see current_thread_info()), this
         * must be the first element of task_struct.
         */
        struct thread_info thread_info;
#endif
        /* -1 unrunnable, 0 runnable, >0 stopped: */
        volatile long state;
Since CONFIG_THREAD_INFO_IN_TASK is selected, the first member is struct thread_info thread_info, and current_thread_info() points to that member in the current task.
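In other words, the cast in current_thread_info() works because thread_info sits at offset 0 of task_struct (a minimal sketch, not actual kernel code):
struct task_struct *tsk = get_current();              /* task pointer read from sp_el0 */
struct thread_info *ti  = (struct thread_info *)tsk;  /* same address as &tsk->thread_info */
u32 count = READ_ONCE(ti->preempt.count);             /* the word at [tsk, #16] */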
struct thread_info is defined by #include <asm/thread_info.h>.
==== arch/arm64/include/asm/thread_info.h ====
/*
* low level task data that entry.S needs immediate access to.
*/
struct thread_info {
        unsigned long flags;            /* low level flags */
        mm_segment_t addr_limit;        /* address limit */
#ifdef CONFIG_ARM64_SW_TTBR0_PAN
        u64 ttbr0;                      /* saved TTBR0_EL1 */
#endif
        union {
                u64 preempt_count;      /* 0 => preemptible, <0 => bug */
                struct {
#ifdef CONFIG_CPU_BIG_ENDIAN
                        u32 need_resched;
                        u32 count;
#else
                        u32 count;
                        u32 need_resched;
#endif
                } preempt;
        };
};
When CONFIG_ARM64_SW_TTBR0_PAN is not selected and the CPU is little-endian, the preempt.count member will be at offset 16 from the start of the structure.
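If you want to double-check that arithmetic, here is a standalone user-space sketch of the same layout (the struct name is made up for illustration; it assumes unsigned long and mm_segment_t are 8 bytes each, CONFIG_ARM64_SW_TTBR0_PAN is off, and the little-endian member order). It prints 16, the offset used by ldr w2, [x1, #16] in the disassembly:
#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

/* Cut-down copy of arm64's struct thread_info, for illustration only. */
struct thread_info_sketch {
        uint64_t flags;                 /* unsigned long on arm64: offset 0, 8 bytes */
        uint64_t addr_limit;            /* mm_segment_t on arm64:  offset 8, 8 bytes */
        union {                         /* union starts at offset 16 */
                uint64_t preempt_count;
                struct {
                        uint32_t count;         /* little-endian: the low 32 bits */
                        uint32_t need_resched;
                } preempt;
        };
};

int main(void)
{
        printf("%zu\n", offsetof(struct thread_info_sketch, preempt.count));
        return 0;
}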
Answered By - Ian Abbott