Saturday, February 26, 2022

[SOLVED] Can't find where preempt count in the stack is declared for a percpu variable access. (linux kernel)

Issue

I was debugging the Linux boot process and tried to understand how per-CPU variables work on arm64. As a test, I added a function called read_pkcontext1 which returns the per-CPU variable printk_context (this value is used by printk), and I found something I can't understand.
(this is from linux 5.4.21)

==== kernel/printk/printk_safe.c ====

int read_pkcontext1(void)   /* function I added for test */
{
    return this_cpu_read(printk_context);
}

==== include/linux/percpu-defs.h ====
/*
 * Operations with implied preemption/interrupt protection.  These
 * operations can be used without worrying about preemption or interrupt.
 */
#define this_cpu_read(pcp)      __pcpu_size_call_return(this_cpu_read_, pcp)

==== include/linux/percpu-defs.h ====
#define __pcpu_size_call_return(stem, variable)             \
({                                  \
    typeof(variable) pscr_ret__;                    \
    __verify_pcpu_ptr(&(variable));                 \
    switch(sizeof(variable)) {                  \
    case 1: pscr_ret__ = stem##1(variable); break;          \
    case 2: pscr_ret__ = stem##2(variable); break;          \
    case 4: pscr_ret__ = stem##4(variable); break;          \
    case 8: pscr_ret__ = stem##8(variable); break;          \
    default:                            \
        __bad_size_call_parameter(); break;         \
    }                               \
    pscr_ret__;                         \
})

This is the output of aarch64-none-elf-objdump -S vmlinux for the read_pkcontext1 function and the functions it calls (with optimization off).

ffffffc0100f0dc0 <read_pkcontext1>:
void write_pkcontext(void);

#pragma GCC push_options
#pragma GCC optimize ("O0")
int read_pkcontext1(void)
{
ffffffc0100f0dc0:   a9bd7bfd    stp x29, x30, [sp, #-48]!
ffffffc0100f0dc4:   910003fd    mov x29, sp
    return this_cpu_read(printk_context);
ffffffc0100f0dc8:   f9000fff    str xzr, [sp, #24]
ffffffc0100f0dcc:   52800020    mov w0, #0x1                    // #1
ffffffc0100f0dd0:   94000018    bl  ffffffc0100f0e30 <__preempt_count_add>

... skip ... 

ffffffc0100f0e30 <__preempt_count_add>:
ffffffc0100f0e30:   d5384101    mrs x1, sp_el0
ffffffc0100f0e34:   b9401022    ldr w2, [x1, #16]
    pc += val;
ffffffc0100f0e38:   0b020000    add w0, w0, w2
    case 4: *(volatile __u32 *)p = *(__u32 *)res; break;
ffffffc0100f0e3c:   b9001020    str w0, [x1, #16]
}
ffffffc0100f0e40:   d65f03c0    ret

In the code above, read_pkcontext1 calls __preempt_count_add with w0 = #1 (incrementing the preemption count), and __preempt_count_add adds that value (w0) to a variable at sp + #16 and writes it back. So this variable on the stack looks like the preemption count (I guess this prevents preemption). My question is: when was this value on the stack defined and initialized? I couldn't find it in the Linux source. (Using QEMU, I see that this value is 1 and is incremented to 2 after __preempt_count_add. Of course, it is decremented back to 1 after the per-CPU variable access.)


Solution

this_cpu_read(printk_context) expands to:
__pcpu_size_call_return(this_cpu_read_, printk_context)

({
    typeof(printk_context) pscr_ret__;
    __verify_pcpu_ptr(&(printk_context));
    switch(sizeof(printk_context)) {
    case 1: pscr_ret__ = this_cpu_read_1(printk_context); break;
    case 2: pscr_ret__ = this_cpu_read_2(printk_context); break;
    case 4: pscr_ret__ = this_cpu_read_4(printk_context); break;
    case 8: pscr_ret__ = this_cpu_read_8(printk_context); break;
    default:
        __bad_size_call_parameter(); break;
    }
    pscr_ret__;
})

sizeof(printk_context) is 4, so pscr_ret__ = this_cpu_read_4(printk_context);.

The this_cpu_read_4() macro is defined by #include <asm/percpu.h>:

==== arch/arm64/include/asm/percpu.h ====

#define this_cpu_read_4(pcp)        \
    _pcp_protect_return(__percpu_read_32, pcp)
#define _pcp_protect_return(op, pcp, args...)               \
({                                  \
    typeof(pcp) __retval;                       \
    preempt_disable_notrace();                  \
    __retval = (typeof(pcp))op(raw_cpu_ptr(&(pcp)), ##args);    \
    preempt_enable_notrace();                   \
    __retval;                           \
})

That is where the preempt count manipulation occurs.

The preempt_disable_notrace() and preempt_enable_notrace() macros are defined by #include <linux/preempt.h>.

==== include/linux/preempt.h ====

#define preempt_enable_notrace() \
do { \
    barrier(); \
    __preempt_count_dec(); \
} while (0)
#define preempt_disable_notrace() \
do { \
    __preempt_count_inc(); \
    barrier(); \
} while (0)
#define __preempt_count_inc() __preempt_count_add(1)
#define __preempt_count_dec() __preempt_count_sub(1)

__preempt_count_add() and __preempt_count_sub() are defined by #include <asm/preempt.h>.

==== arch/arm64/include/asm/preempt.h ====

static inline void __preempt_count_add(int val)
{
    u32 pc = READ_ONCE(current_thread_info()->preempt.count);
    pc += val;
    WRITE_ONCE(current_thread_info()->preempt.count, pc);
}

static inline void __preempt_count_sub(int val)
{
    u32 pc = READ_ONCE(current_thread_info()->preempt.count);
    pc -= val;
    WRITE_ONCE(current_thread_info()->preempt.count, pc);
}

For arm64, CONFIG_THREAD_INFO_IN_TASK is enabled so current_thread_info() is defined as a macro by #include <linux/thread_info.h>.

==== include/linux/thread_info.h ====

#ifdef CONFIG_THREAD_INFO_IN_TASK
/*
 * For CONFIG_THREAD_INFO_IN_TASK kernels we need <asm/current.h> for the
 * definition of current, but for !CONFIG_THREAD_INFO_IN_TASK kernels,
 * including <asm/current.h> can cause a circular dependency on some platforms.
 */
#include <asm/current.h>
#define current_thread_info() ((struct thread_info *)current)
#endif

The current macro is defined by #include <asm/current.h>.

==== arch/arm64/include/asm/current.h ====

#define current get_current()
/*
 * We don't use read_sysreg() as we want the compiler to cache the value where
 * possible.
 */
static __always_inline struct task_struct *get_current(void)
{
    unsigned long sp_el0;

    asm ("mrs %0, sp_el0" : "=r" (sp_el0));

    return (struct task_struct *)sp_el0;
}

There is some magic in arch/arm64/kernel/entry.S relating to the use of the sp_el0 stack pointer to point to the current thread_info / task_struct. Sorry, I do not have time to study the gory details, but it was introduced by commit 6cdf9c7ca687 ("arm64: Store struct thread_info in sp_el0").

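The key thing is that the sp_el0 register is not the same as sp. The kernel does not run in EL0 mode, so sp_el0 is available as a "scratch" register. The kernel uses it to point to the current thread_info / task_struct. In the disassembly above, __preempt_count_add loads sp_el0 into x1 (mrs x1, sp_el0) and then accesses [x1, #16], so the offset of 16 is relative to the current task_struct, not to the stack pointer.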

struct task_struct is defined by #include <linux/sched.h>.

==== include/linux/sched.h ====

struct task_struct {
#ifdef CONFIG_THREAD_INFO_IN_TASK
    /*
     * For reasons of header soup (see current_thread_info()), this
     * must be the first element of task_struct.
     */
    struct thread_info      thread_info;
#endif
    /* -1 unrunnable, 0 runnable, >0 stopped: */
    volatile long           state;

Since CONFIG_THREAD_INFO_IN_TASK is selected, the first member is struct thread_info thread_info. current_thread_info() points to that member in the current task.

struct thread_info is defined by #include <asm/thread_info.h>.

==== arch/arm64/include/asm/thread_info.h ====

/*
 * low level task data that entry.S needs immediate access to.
 */
struct thread_info {
    unsigned long       flags;      /* low level flags */
    mm_segment_t        addr_limit; /* address limit */
#ifdef CONFIG_ARM64_SW_TTBR0_PAN
    u64         ttbr0;      /* saved TTBR0_EL1 */
#endif
    union {
        u64     preempt_count;  /* 0 => preemptible, <0 => bug */
        struct {
#ifdef CONFIG_CPU_BIG_ENDIAN
            u32 need_resched;
            u32 count;
#else
            u32 count;
            u32 need_resched;
#endif
        } preempt;
    };
};

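When CONFIG_ARM64_SW_TTBR0_PAN is not selected and the CPU is little-endian, the preempt.count member will be at offset 16 from the start of the structure: flags and addr_limit each occupy 8 bytes on arm64, so the union starts at offset 16, and on little-endian count is its first member. This matches the ldr w2, [x1, #16] and str w0, [x1, #16] instructions in the disassembly of __preempt_count_add.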



Answered By - Ian Abbott
Answer Checked By - Senaida (WPSolving Volunteer)