swapper_pg_dir页表

四 23 五月 2024 | tags: Linux

页表分配

swapper_pg_dir 的L0页表(page table)是在Linux kernel的linker script中分配的:

swapper_pg_dir = .;
. += PAGE_SIZE;

这里只分配了L0的页表,其他级别的页表要么使用memblock来分配,要么使用正常的逻辑来分配。 只有FIXADDR的页表例外,它是静态分配的。

内存布局

根据内核文档Documentation/arch/arm64/memory.rst:

AArch64 Linux memory layout with 4KB pages + 4 levels (48-bit)::

  Start                 End                     Size            Use
  -----------------------------------------------------------------------
  0000000000000000      0000ffffffffffff         256TB          user
  ffff000000000000      ffff7fffffffffff         128TB          kernel logical memory map
 [ffff600000000000      ffff7fffffffffff]         32TB          [kasan shadow region]
  ffff800000000000      ffff80007fffffff           2GB          modules
  ffff800080000000      fffffbffefffffff         124TB          vmalloc
  fffffbfff0000000      fffffbfffdffffff         224MB          fixed mappings (top down)
  fffffbfffe000000      fffffbfffe7fffff           8MB          [guard region]
  fffffbfffe800000      fffffbffff7fffff          16MB          PCI I/O space
  fffffbffff800000      fffffbffffffffff           8MB          [guard region]
  fffffc0000000000      fffffdffffffffff           2TB          vmemmap
  fffffe0000000000      ffffffffffffffff           2TB          [guard region]

这里面对应的有几个重要的定义所对应的值:

PAGE_OFFSET = 0xffff000000000000
VMEMMAP_START = 0xfffffc0000000000
VMALLOC_START = MODULES_END = 0xffff800080000000
VMALLOC_END = (VMEMMAP_START - SZ_256M) = 0xfffffbfff0000000
FIXADDR_TOP = (VMEMMAP_START - SZ_32M) = 0xfffffbfffe000000
KIMAGE_VADDR = MODULES_END = 0xffff800080000000

kernel image 页表

在使能swapper_pg_dir页表之前,所有的kernel映射都没有区分得很细致,使用的都是2M的页表,并且全部映射成可执行的。进入到swapper_pg_dir页表阶段,这个时候我们就要对它进行细分。

可以看到KIMAGE_VADDR 是落在VMALLOC区域的,从函数 setup_arch->paging_init->map_kernel 执行之后的结果也可以看出KIMAGE_VADDR是在vmalloc区域:

static void __init map_kernel(pgd_t *pgdp)
{
        /....
        map_kernel_segment(pgdp, _stext, _etext, text_prot, &vmlinux_text, 0,
                           VM_NO_GUARD);
        map_kernel_segment(pgdp, __start_rodata, __inittext_begin, PAGE_KERNEL,
                           &vmlinux_rodata, NO_CONT_MAPPINGS, VM_NO_GUARD);
        map_kernel_segment(pgdp, __inittext_begin, __inittext_end, text_prot,
                           &vmlinux_inittext, 0, VM_NO_GUARD);
        map_kernel_segment(pgdp, __initdata_begin, __initdata_end, PAGE_KERNEL,
                           &vmlinux_initdata, 0, VM_NO_GUARD);
        map_kernel_segment(pgdp, _data, _end, PAGE_KERNEL, &vmlinux_data, 0, 0);

        fixmap_copy(pgdp);
        kasan_copy_shadow(pgdp);
}
>mmu memory-map EL1N_S1 TTBR1_EL1=0x859A5000
Virtual Range                              | Physical Range                           | Type   | AP | C    | S    | X
-------------------------------------------------------------------------------------------------------------------------
EL1N:0x0000000000000000-0x0000FFFFFFFFFFFF | <unmapped>                               |        |    |      |      |
EL1N:0xFFFF000000000000-0xFFFF80000800FFFF | <unmapped>                               |        |    |      |      |
EL1N:0xFFFF800008010000-0xFFFF80000902FFFF | NP:0x0000000084010000-0x000000008502FFFF | Normal | RO | True | True | True
EL1N:0xFFFF800009030000-0xFFFF8000099AFFFF | NP:0x0000000085030000-0x00000000859AFFFF | Normal | RW | True | True | False
EL1N:0xFFFF8000099B0000-0xFFFF800009A8FFFF | NP:0x00000000859B0000-0x0000000085A8FFFF | Normal | RO | True | True | True
EL1N:0xFFFF800009A90000-0xFFFF80000A5DFFFF | NP:0x0000000085A90000-0x00000000865DFFFF | Normal | RW | True | True | False
EL1N:0xFFFF80000A5E0000-0xFFFFFFFFFFFFFFFF | <unmapped>                               |        |    |      |      |

运行fixmap_copy(pgdp)时,其实就是把之前建好的那些fixmap页表跟swapper_pg_dir 对接上,运行完成之后可以看到device tree 和串口的映射就有了:

>mmu memory-map EL1N_S1 TTBR1_EL1=0x859A5000
Virtual Range                              | Physical Range                           | Type         | AP | C     | S     | X
---------------------------------------------------------------------------------------------------------------------------------
EL1N:0x0000000000000000-0x0000FFFFFFFFFFFF | <unmapped>                               |              |    |       |       |
EL1N:0xFFFF000000000000-0xFFFF80000800FFFF | <unmapped>                               |              |    |       |       |
EL1N:0xFFFF800008010000-0xFFFF80000902FFFF | NP:0x0000000084010000-0x000000008502FFFF | Normal       | RO | True  | True  | True
EL1N:0xFFFF800009030000-0xFFFF8000099AFFFF | NP:0x0000000085030000-0x00000000859AFFFF | Normal       | RW | True  | True  | False
EL1N:0xFFFF8000099B0000-0xFFFF800009A8FFFF | NP:0x00000000859B0000-0x0000000085A8FFFF | Normal       | RO | True  | True  | True
EL1N:0xFFFF800009A90000-0xFFFF80000A5DFFFF | NP:0x0000000085A90000-0x00000000865DFFFF | Normal       | RW | True  | True  | False
EL1N:0xFFFF80000A5E0000-0xFFFFFBFFFDA33FFF | <unmapped>                               |              |    |       |       |
EL1N:0xFFFFFBFFFDA34000-0xFFFFFBFFFDA34FFF | NP:0x00000000859A5000-0x00000000859A5FFF | Normal       | RW | True  | True  | False
EL1N:0xFFFFFBFFFDA35000-0xFFFFFBFFFDBFEFFF | <unmapped>                               |              |    |       |       |
EL1N:0xFFFFFBFFFDBFF000-0xFFFFFBFFFDBFFFFF | NP:0x000000001C090000-0x000000001C090FFF | Device-nGnRE | RW | False | False | False
EL1N:0xFFFFFBFFFDC00000-0xFFFFFBFFFDDFFFFF | NP:0x0000000082000000-0x00000000821FFFFF | Normal       | RO | True  | True  | False
EL1N:0xFFFFFBFFFDE00000-0xFFFFFFFFFFFFFFFF | <unmapped>                               |              |    |       |       |

线性地址映射

在运行到paging_init 的时候,系统已经收集到了可用的memblock信息,这时函数 paging_init->map_mem 就根据memblock的信息,给系统所有的物理内存建立线性映射。

memblock信息是根据device tree来建立的,memblock的信息如下:

# cat /sys/kernel/debug/memblock/memory
   0: 0x0000000080000000..0x00000000fbffffff
   1: 0x0000000880000000..0x00000008ffffffff

对于kernel image本身所占用的这块内存,必须把它在线性映射上设置成不可执行并且不可写。 在 init_idmap_pg_dir 中有提到,kernel image 有一部分(init 段)是需要被回收的, 这也就是为什么这里 kernel_end 取的是 __init_begin。

static void __init map_mem(pgd_t *pgdp)
{
        static const u64 direct_map_end = _PAGE_END(VA_BITS_MIN);
        phys_addr_t kernel_start = __pa_symbol(_stext);
        phys_addr_t kernel_end = __pa_symbol(__init_begin);

        //....
        /*
         * Take care not to create a writable alias for the
         * read-only text and rodata sections of the kernel image.
         * So temporarily mark them as NOMAP to skip mappings in
         * the following for-loop
         */
        memblock_mark_nomap(kernel_start, kernel_end - kernel_start);

        /* map all the memory banks */
        for_each_mem_range(i, &start, &end) {
                if (start >= end)
                        break;
                /*
                 * The linear map must allow allocation tags reading/writing
                 * if MTE is present. Otherwise, it has the same attributes as
                 * PAGE_KERNEL.
                 */
                __map_memblock(pgdp, start, end, pgprot_tagged(PAGE_KERNEL),
                               flags);
        }

        /*
         * Map the linear alias of the [_stext, __init_begin) interval
         * as non-executable now, and remove the write permission in
         * mark_linear_text_alias_ro() below (which will be called after
         * alternative patching has completed). This makes the contents
         * of the region accessible to subsystems such as hibernate,
         * but protects it from inadvertent modification or execution.
         * Note that contiguous mappings cannot be remapped in this way,
         * so we should avoid them here.
         */
        __map_memblock(pgdp, kernel_start, kernel_end,
                       PAGE_KERNEL, NO_CONT_MAPPINGS);
        memblock_clear_nomap(kernel_start, kernel_end - kernel_start);
        //...

运行完成这个函数之后,可以看到对应的memory-map已经修改成如下,不过kernel对应的线性map还是可以写的,后面会修改成只读的,可以参考上面的注释。

>mmu memory-map EL1N_S1 TTBR1_EL1=0x859A5000
Virtual Range                              | Physical Range                           | Type         | AP | C     | S     | X
---------------------------------------------------------------------------------------------------------------------------------
EL1N:0x0000000000000000-0x0000FFFFFFFFFFFF | <unmapped>                               |              |    |       |       |
EL1N:0xFFFF000000000000-0xFFFF00000400FFFF | NP:0x0000000080000000-0x000000008400FFFF | Normal       | RW | True  | True  | False
EL1N:0xFFFF000004010000-0xFFFF0000059AFFFF | NP:0x0000000084010000-0x00000000859AFFFF | Normal       | RW | True  | True  | False
EL1N:0xFFFF0000059B0000-0xFFFF00007BFFFFFF | NP:0x00000000859B0000-0x00000000FBFFFFFF | Normal       | RW | True  | True  | False
EL1N:0xFFFF00007C000000-0xFFFF0007FFFFFFFF | <unmapped>                               |              |    |       |       |
EL1N:0xFFFF000800000000-0xFFFF00087FFFFFFF | NP:0x0000000880000000-0x00000008FFFFFFFF | Normal       | RW | True  | True  | False
EL1N:0xFFFF000880000000-0xFFFF80000800FFFF | <unmapped>                               |              |    |       |       |
EL1N:0xFFFF800008010000-0xFFFF80000902FFFF | NP:0x0000000084010000-0x000000008502FFFF | Normal       | RO | True  | True  | True
EL1N:0xFFFF800009030000-0xFFFF8000099AFFFF | NP:0x0000000085030000-0x00000000859AFFFF | Normal       | RW | True  | True  | False
EL1N:0xFFFF8000099B0000-0xFFFF800009A8FFFF | NP:0x00000000859B0000-0x0000000085A8FFFF | Normal       | RO | True  | True  | True
EL1N:0xFFFF800009A90000-0xFFFF80000A5DFFFF | NP:0x0000000085A90000-0x00000000865DFFFF | Normal       | RW | True  | True  | False
EL1N:0xFFFF80000A5E0000-0xFFFFFBFFFDBFEFFF | <unmapped>                               |              |    |       |       |
EL1N:0xFFFFFBFFFDBFF000-0xFFFFFBFFFDBFFFFF | NP:0x000000001C090000-0x000000001C090FFF | Device-nGnRE | RW | False | False | False
EL1N:0xFFFFFBFFFDC00000-0xFFFFFBFFFDDFFFFF | NP:0x0000000082000000-0x00000000821FFFFF | Normal       | RO | True  | True  | False
EL1N:0xFFFFFBFFFDE00000-0xFFFFFFFFFFFFFFFF | <unmapped>                               |              |    |       |       |

对于一般的系统,线性映射到这里就结束了,不会再改变;但是像NUMA的系统或者内存可以热插拔的系统,这里应该还会有一些变化。

TODO: arch_add_memory numa_add_memblk

使能页表swapper_pg_dir

因为这个时候所有kernel image 的映射都已经在swapper_pg_dir 里面准备就绪,所以就可以从init_pg_dir 切换到swapper_pg_dir页表,调用过程: paging_init -> __cpu_replace_ttbr1 -> idmap_cpu_replace_ttbr1

在函数__cpu_replace_ttbr1 中需要切换TTBR1。 但是当前kernel代码使用的映射正是通过页表TTBR1=init_pg_dir转换而来的,而在切换的过程中TTBR1要被替换,所以不能直接用当前映射下的代码来切换这个TTBR1,需要用到之前1:1的映射。 而在setup_arch->cpu_uninstall_idmap的时候,init_idmap_pg_dir这个页表已经从TTBR0里面被卸载了。 这也就是__cpu_replace_ttbr1 函数需要重新安装idmap的原因:通过idmap_cpu_replace_ttbr1 在1:1映射下的VA(也就是它的PA)来执行这个函数:

static inline void __cpu_replace_ttbr1(pgd_t *pgdp, pgd_t *idmap, bool cnp)
{
        //...

        /* phys_to_ttbr() zeros lower 2 bits of ttbr with 52-bit PA */
        phys_addr_t ttbr1 = phys_to_ttbr(virt_to_phys(pgdp));

        if (cnp)
                ttbr1 |= TTBR_CNP_BIT;

        replace_phys = (void *)__pa_symbol(idmap_cpu_replace_ttbr1);

        __cpu_install_idmap(idmap);

        /*
         * We really don't want to take *any* exceptions while TTBR1 is
         * in the process of being replaced so mask everything.
         */
        daif = local_daif_save();
        replace_phys(ttbr1);
        local_daif_restore(daif);

        cpu_uninstall_idmap();
}

设置这个TTBR1页表的时候,同样遵循break-before-make的原则:先把TTBR1切换到reserved_pg_dir,并把TLB给invalidate一下,然后再写入新的TTBR1:

        offset_ttbr1 x0, x3
        msr     ttbr1_el1, x0
        isb

        ret
SYM_FUNC_END(idmap_cpu_replace_ttbr1)

.macro  __idmap_cpu_set_reserved_ttbr1, tmp1, tmp2
        adrp    \tmp1, reserved_pg_dir
        phys_to_ttbr \tmp2, \tmp1
        offset_ttbr1 \tmp2, \tmp1
        msr     ttbr1_el1, \tmp2
        isb
        tlbi    vmalle1
        dsb     nsh
        isb
.endm

执行完上面的代码,接下来kernel 就一直使用页表swapper_pg_dir,init_pg_dir 就可以被释放。

vmemmap区域映射

vmemmap区域是一块起始地址为VMEMMAP_START、大小为2TB的虚拟地址区域。它是以section为单位来存放struct page结构的虚拟地址空间,然后每个page再线性映射到物理内存。 系统在执行完 setup_arch -> bootmem_init -> sparse_init 之后,可以看到vmemmap 已经增加了映射:

>mmu memory-map EL1N_S1 TTBR1_EL1=0x859A5000
Virtual Range                              | Physical Range                           | Type         | AP | C     | S     | X
---------------------------------------------------------------------------------------------------------------------------------
EL1N:0x0000000000000000-0x0000FFFFFFFFFFFF | <unmapped>                               |              |    |       |       |
EL1N:0xFFFF000000000000-0xFFFF00000400FFFF | NP:0x0000000080000000-0x000000008400FFFF | Normal       | RW | True  | True  | False
EL1N:0xFFFF000004010000-0xFFFF0000059AFFFF | NP:0x0000000084010000-0x00000000859AFFFF | Normal       | RW | True  | True  | False
EL1N:0xFFFF0000059B0000-0xFFFF00007BFFFFFF | NP:0x00000000859B0000-0x00000000FBFFFFFF | Normal       | RW | True  | True  | False
EL1N:0xFFFF00007C000000-0xFFFF0007FFFFFFFF | <unmapped>                               |              |    |       |       |
EL1N:0xFFFF000800000000-0xFFFF00087FFFFFFF | NP:0x0000000880000000-0x00000008FFFFFFFF | Normal       | RW | True  | True  | False
EL1N:0xFFFF000880000000-0xFFFF80000800FFFF | <unmapped>                               |              |    |       |       |
EL1N:0xFFFF800008010000-0xFFFF80000902FFFF | NP:0x0000000084010000-0x000000008502FFFF | Normal       | RO | True  | True  | True
EL1N:0xFFFF800009030000-0xFFFF8000099AFFFF | NP:0x0000000085030000-0x00000000859AFFFF | Normal       | RW | True  | True  | False
EL1N:0xFFFF8000099B0000-0xFFFF800009A8FFFF | NP:0x00000000859B0000-0x0000000085A8FFFF | Normal       | RO | True  | True  | True
EL1N:0xFFFF800009A90000-0xFFFF80000A5DFFFF | NP:0x0000000085A90000-0x00000000865DFFFF | Normal       | RW | True  | True  | False
EL1N:0xFFFF80000A5E0000-0xFFFFFBFFFDBFEFFF | <unmapped>                               |              |    |       |       |
EL1N:0xFFFFFBFFFDBFF000-0xFFFFFBFFFDBFFFFF | NP:0x000000001C090000-0x000000001C090FFF | Device-nGnRE | RW | False | False | False
EL1N:0xFFFFFBFFFDC00000-0xFFFFFBFFFDDFFFFF | NP:0x0000000082000000-0x00000000821FFFFF | Normal       | RO | True  | True  | False
EL1N:0xFFFFFBFFFDE00000-0xFFFFFBFFFFFFFFFF | <unmapped>                               |              |    |       |       |
EL1N:0xFFFFFC0000000000-0xFFFFFC0001FFFFFF | NP:0x00000008FB600000-0x00000008FD5FFFFF | Normal       | RW | True  | True  | False
EL1N:0xFFFFFC0002000000-0xFFFFFC001FFFFFFF | <unmapped>                               |              |    |       |       |
EL1N:0xFFFFFC0020000000-0xFFFFFC0021FFFFFF | NP:0x00000008FD600000-0x00000008FF5FFFFF | Normal       | RW | True  | True  | False
EL1N:0xFFFFFC0022000000-0xFFFFFFFFFFFFFFFF | <unmapped>                               |              |    |       |       |

第二个CPU启动

在第二个CPU启动的时候,直接把这个页表给到CPU然后切换到kernel的VA继续执行:

SYM_FUNC_START_LOCAL(secondary_startup)
        /*
         * Common entry point for secondary CPUs.
         */
        mov     x20, x0                         // preserve boot mode
        bl      __cpu_secondary_check52bitva
#if VA_BITS > 48
        ldr_l   x0, vabits_actual
#endif
        bl      __cpu_setup                     // initialise processor
        adrp    x1, swapper_pg_dir
        adrp    x2, idmap_pg_dir
        bl      __enable_mmu
        ldr     x8, =__secondary_switched
        br      x8
SYM_FUNC_END(secondary_startup)

以及CPU从off 状态重新启动起来的时候,也需要把这个页表设置给CPU:

SYM_CODE_START(cpu_resume)
        mov     x0, xzr
        bl      init_kernel_el
        mov     x19, x0                 // preserve boot mode
#if VA_BITS > 48
        ldr_l   x0, vabits_actual
#endif
        bl      __cpu_setup
        /* enable the MMU early - so we can access sleep_save_stash by va */
        adrp    x1, swapper_pg_dir
        adrp    x2, idmap_pg_dir
        bl      __enable_mmu
        ldr     x8, =_cpu_resume
        br      x8
SYM_CODE_END(cpu_resume)

Comments !