Diffstat (limited to 'arch/s390/boot')
-rw-r--r--  arch/s390/boot/Makefile  |   2
-rw-r--r--  arch/s390/boot/boot.h    |   6
-rw-r--r--  arch/s390/boot/startup.c |  45
-rw-r--r--  arch/s390/boot/vmem.c    | 254
4 files changed, 298 insertions, 9 deletions
diff --git a/arch/s390/boot/Makefile b/arch/s390/boot/Makefile
index d52c3e2e16bc..47a397da0498 100644
--- a/arch/s390/boot/Makefile
+++ b/arch/s390/boot/Makefile
@@ -35,7 +35,7 @@ endif
 CFLAGS_sclp_early_core.o += -I$(srctree)/drivers/s390/char
 
-obj-y	:= head.o als.o startup.o mem_detect.o ipl_parm.o ipl_report.o
+obj-y	:= head.o als.o startup.o mem_detect.o ipl_parm.o ipl_report.o vmem.o
 obj-y	+= string.o ebcdic.o sclp_early_core.o mem.o ipl_vmparm.o cmdline.o
 obj-y	+= version.o pgm_check_info.o ctype.o ipl_data.o machine_kexec_reloc.o
 obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE))	+= uv.o
diff --git a/arch/s390/boot/boot.h b/arch/s390/boot/boot.h
index 286441cf3bf0..547614496e7e 100644
--- a/arch/s390/boot/boot.h
+++ b/arch/s390/boot/boot.h
@@ -16,7 +16,7 @@ struct machine_info {
 
 struct vmlinux_info {
 	unsigned long default_lma;
-	void (*entry)(void);
+	unsigned long entry;
 	unsigned long image_size;	/* does not include .bss */
 	unsigned long bss_size;		/* uncompressed image .bss size */
 	unsigned long bootdata_off;
@@ -27,6 +27,9 @@ struct vmlinux_info {
 	unsigned long rela_dyn_start;
 	unsigned long rela_dyn_end;
 	unsigned long amode31_size;
+	unsigned long init_mm_off;
+	unsigned long swapper_pg_dir_off;
+	unsigned long invalid_pg_dir_off;
 };
 
 void startup_kernel(void);
@@ -41,6 +44,7 @@ void print_missing_facilities(void);
 void sclp_early_setup_buffer(void);
 void print_pgm_check_info(void);
 unsigned long get_random_base(unsigned long safe_addr);
+void setup_vmem(unsigned long online_end, unsigned long asce_limit);
 void __printf(1, 2) decompressor_printk(const char *fmt, ...);
 void error(char *m);
diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c
index da6ee587fe9a..c5d59df9fa62 100644
--- a/arch/s390/boot/startup.c
+++ b/arch/s390/boot/startup.c
@@ -11,6 +11,7 @@
 #include <asm/diag.h>
 #include <asm/uv.h>
 #include <asm/abs_lowcore.h>
+#include <asm/mem_detect.h>
 #include "decompressor.h"
 #include "boot.h"
 #include "uv.h"
@@ -166,9 +167,10 @@ static void setup_ident_map_size(unsigned long max_physmem_end)
 #endif
 }
 
-static void setup_kernel_memory_layout(void)
+static unsigned long setup_kernel_memory_layout(void)
 {
 	unsigned long vmemmap_start;
+	unsigned long asce_limit;
 	unsigned long rte_size;
 	unsigned long pages;
 	unsigned long vmax;
@@ -183,10 +185,10 @@ static void setup_kernel_memory_layout(void)
 	    vmalloc_size > _REGION2_SIZE ||
 	    vmemmap_start + vmemmap_size + vmalloc_size +
 	    MODULES_LEN > _REGION2_SIZE) {
-		vmax = _REGION1_SIZE;
+		asce_limit = _REGION1_SIZE;
 		rte_size = _REGION2_SIZE;
 	} else {
-		vmax = _REGION2_SIZE;
+		asce_limit = _REGION2_SIZE;
 		rte_size = _REGION3_SIZE;
 	}
 	/*
 	 * secure storage limit, so that any vmalloc allocation
 	 * we do could be used to back secure guest storage.
 	 */
-	vmax = adjust_to_uv_max(vmax);
+	vmax = adjust_to_uv_max(asce_limit);
 #ifdef CONFIG_KASAN
 	/* force vmalloc and modules below kasan shadow */
 	vmax = min(vmax, KASAN_SHADOW_START);
@@ -223,6 +225,8 @@ static void setup_kernel_memory_layout(void)
 	/* make sure vmemmap doesn't overlay with vmalloc area */
 	VMALLOC_START = max(vmemmap_start + vmemmap_size, VMALLOC_START);
 	vmemmap = (struct page *)vmemmap_start;
+
+	return asce_limit;
 }
 
 /*
@@ -256,6 +260,9 @@ static void offset_vmlinux_info(unsigned long offset)
 	vmlinux.rela_dyn_start += offset;
 	vmlinux.rela_dyn_end += offset;
 	vmlinux.dynsym_start += offset;
+	vmlinux.init_mm_off += offset;
+	vmlinux.swapper_pg_dir_off += offset;
+	vmlinux.invalid_pg_dir_off += offset;
 }
 
 static unsigned long reserve_amode31(unsigned long safe_addr)
@@ -268,7 +275,10 @@ void startup_kernel(void)
 {
 	unsigned long random_lma;
 	unsigned long safe_addr;
+	unsigned long asce_limit;
+	unsigned long online_end;
 	void *img;
+	psw_t psw;
 
 	detect_facilities();
 
@@ -290,7 +300,8 @@ void startup_kernel(void)
 	sanitize_prot_virt_host();
 	setup_ident_map_size(detect_memory());
 	setup_vmalloc_size();
-	setup_kernel_memory_layout();
+	asce_limit = setup_kernel_memory_layout();
+	online_end = min(get_mem_detect_end(), ident_map_size);
 
 	if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && kaslr_enabled) {
 		random_lma = get_random_base(safe_addr);
@@ -307,9 +318,23 @@ void startup_kernel(void)
 	} else if (__kaslr_offset)
 		memcpy((void *)vmlinux.default_lma, img, vmlinux.image_size);
 
+	/*
+	 * The order of the following operations is important:
+	 *
+	 * - handle_relocs() must follow clear_bss_section() to establish static
+	 *   memory references to data in .bss to be used by setup_vmem()
+	 *   (i.e. init_mm.pgd)
+	 *
+	 * - setup_vmem() must follow handle_relocs() to be able to use
+	 *   static memory references to data in .bss (i.e. init_mm.pgd)
+	 *
+	 * - copy_bootdata() must follow setup_vmem() to propagate changes to
+	 *   bootdata made by setup_vmem()
+	 */
 	clear_bss_section();
-	copy_bootdata();
 	handle_relocs(__kaslr_offset);
+	setup_vmem(online_end, asce_limit);
+	copy_bootdata();
 
 	if (__kaslr_offset) {
 		/*
@@ -321,5 +346,11 @@ void startup_kernel(void)
 		if (IS_ENABLED(CONFIG_KERNEL_UNCOMPRESSED))
 			memset(img, 0, vmlinux.image_size);
 	}
-	vmlinux.entry();
+
+	/*
+	 * Jump to the decompressed kernel entry point and switch DAT mode on.
+	 */
+	psw.addr = vmlinux.entry;
+	psw.mask = PSW_KERNEL_BITS;
+	__load_psw(psw);
 }
diff --git a/arch/s390/boot/vmem.c b/arch/s390/boot/vmem.c
new file mode 100644
index 000000000000..db1469c17289
--- /dev/null
+++ b/arch/s390/boot/vmem.c
@@ -0,0 +1,254 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/sched/task.h>
+#include <linux/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/facility.h>
+#include <asm/sections.h>
+#include <asm/mem_detect.h>
+#include "decompressor.h"
+#include "boot.h"
+
+#define init_mm		(*(struct mm_struct *)vmlinux.init_mm_off)
+#define swapper_pg_dir	vmlinux.swapper_pg_dir_off
+#define invalid_pg_dir	vmlinux.invalid_pg_dir_off
+
+unsigned long __bootdata_preserved(s390_invalid_asce);
+unsigned long __bootdata(pgalloc_pos);
+unsigned long __bootdata(pgalloc_end);
+unsigned long __bootdata(pgalloc_low);
+
+static void boot_check_oom(void)
+{
+	if (pgalloc_pos < pgalloc_low)
+		error("out of memory on boot\n");
+}
+
+static void pgtable_populate_begin(unsigned long online_end)
+{
+	unsigned long initrd_end;
+	unsigned long kernel_end;
+
+	kernel_end = vmlinux.default_lma + vmlinux.image_size + vmlinux.bss_size;
+	pgalloc_low = round_up(kernel_end, PAGE_SIZE);
+	if (IS_ENABLED(CONFIG_BLK_DEV_INITRD)) {
+		initrd_end = round_up(initrd_data.start + initrd_data.size, _SEGMENT_SIZE);
+		pgalloc_low = max(pgalloc_low, initrd_end);
+	}
+
+	pgalloc_end = round_down(online_end, PAGE_SIZE);
+	pgalloc_pos = pgalloc_end;
+
+	boot_check_oom();
+}
+
+static void *boot_alloc_pages(unsigned int order)
+{
+	unsigned long size = PAGE_SIZE << order;
+
+	pgalloc_pos -= size;
+	pgalloc_pos = round_down(pgalloc_pos, size);
+
+	boot_check_oom();
+
+	return (void *)pgalloc_pos;
+}
+
+static void *boot_crst_alloc(unsigned long val)
+{
+	unsigned long *table;
+
+	table = boot_alloc_pages(CRST_ALLOC_ORDER);
+	if (table)
+		crst_table_init(table, val);
+	return table;
+}
+
+static pte_t *boot_pte_alloc(void)
+{
+	static void *pte_leftover;
+	pte_t *pte;
+
+	BUILD_BUG_ON(_PAGE_TABLE_SIZE * 2 != PAGE_SIZE);
+
+	if (!pte_leftover) {
+		pte_leftover = boot_alloc_pages(0);
+		pte = pte_leftover + _PAGE_TABLE_SIZE;
+	} else {
+		pte = pte_leftover;
+		pte_leftover = NULL;
+	}
+	memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE);
+	return pte;
+}
+
+static bool can_large_pud(pud_t *pu_dir, unsigned long addr, unsigned long end)
+{
+	return machine.has_edat2 &&
+	       IS_ALIGNED(addr, PUD_SIZE) && (end - addr) >= PUD_SIZE;
+}
+
+static bool can_large_pmd(pmd_t *pm_dir, unsigned long addr, unsigned long end)
+{
+	return machine.has_edat1 &&
+	       IS_ALIGNED(addr, PMD_SIZE) && (end - addr) >= PMD_SIZE;
+}
+
+static void pgtable_pte_populate(pmd_t *pmd, unsigned long addr, unsigned long end)
+{
+	unsigned long next;
+	pte_t *pte, entry;
+
+	pte = pte_offset_kernel(pmd, addr);
+	for (; addr < end; addr += PAGE_SIZE, pte++) {
+		if (pte_none(*pte)) {
+			entry = __pte(__pa(addr));
+			entry = set_pte_bit(entry, PAGE_KERNEL_EXEC);
+			set_pte(pte, entry);
+		}
+	}
+}
+
+static void pgtable_pmd_populate(pud_t *pud, unsigned long addr, unsigned long end)
+{
+	unsigned long next;
+	pmd_t *pmd, entry;
+	pte_t *pte;
+
+	pmd = pmd_offset(pud, addr);
+	for (; addr < end; addr = next, pmd++) {
+		next = pmd_addr_end(addr, end);
+		if (pmd_none(*pmd)) {
+			if (can_large_pmd(pmd, addr, next)) {
+				entry = __pmd(__pa(addr));
+				entry = set_pmd_bit(entry, SEGMENT_KERNEL_EXEC);
+				set_pmd(pmd, entry);
+				continue;
+			}
+			pte = boot_pte_alloc();
+			pmd_populate(&init_mm, pmd, pte);
+		} else if (pmd_large(*pmd)) {
+			continue;
+		}
+		pgtable_pte_populate(pmd, addr, next);
+	}
+}
+
+static void pgtable_pud_populate(p4d_t *p4d, unsigned long addr, unsigned long end)
+{
+	unsigned long next;
+	pud_t *pud, entry;
+	pmd_t *pmd;
+
+	pud = pud_offset(p4d, addr);
+	for (; addr < end; addr = next, pud++) {
+		next = pud_addr_end(addr, end);
+		if (pud_none(*pud)) {
+			if (can_large_pud(pud, addr, next)) {
+				entry = __pud(__pa(addr));
+				entry = set_pud_bit(entry, REGION3_KERNEL_EXEC);
+				set_pud(pud, entry);
+				continue;
+			}
+			pmd = boot_crst_alloc(_SEGMENT_ENTRY_EMPTY);
+			pud_populate(&init_mm, pud, pmd);
+		} else if (pud_large(*pud)) {
+			continue;
+		}
+		pgtable_pmd_populate(pud, addr, next);
+	}
+}
+
+static void pgtable_p4d_populate(pgd_t *pgd, unsigned long addr, unsigned long end)
+{
+	unsigned long next;
+	p4d_t *p4d;
+	pud_t *pud;
+
+	p4d = p4d_offset(pgd, addr);
+	for (; addr < end; addr = next, p4d++) {
+		next = p4d_addr_end(addr, end);
+		if (p4d_none(*p4d)) {
+			pud = boot_crst_alloc(_REGION3_ENTRY_EMPTY);
+			p4d_populate(&init_mm, p4d, pud);
+		}
+		pgtable_pud_populate(p4d, addr, next);
+	}
+}
+
+static void pgtable_populate(unsigned long addr, unsigned long end)
+{
+	unsigned long next;
+	pgd_t *pgd;
+	p4d_t *p4d;
+
+	pgd = pgd_offset(&init_mm, addr);
+	for (; addr < end; addr = next, pgd++) {
+		next = pgd_addr_end(addr, end);
+		if (pgd_none(*pgd)) {
+			p4d = boot_crst_alloc(_REGION2_ENTRY_EMPTY);
+			pgd_populate(&init_mm, pgd, p4d);
+		}
+		pgtable_p4d_populate(pgd, addr, next);
+	}
+}
+
+/*
+ * The pgtables are located in the range [pgalloc_pos, pgalloc_end).
+ * That range must stay intact and is later reserved in the memblock.
+ * Therefore pgtable_populate(pgalloc_pos, pgalloc_end) is needed to
+ * finalize the pgalloc_pos pointer. However, that call can itself
+ * decrease the value of pgalloc_pos. Therefore, pgtable_populate()
+ * needs to be called repeatedly until the page tables are complete
+ * and pgalloc_pos does not move down any further.
+ */
+static void pgtable_populate_end(void)
+{
+	unsigned long pgalloc_end_curr = pgalloc_end;
+	unsigned long pgalloc_pos_prev;
+
+	do {
+		pgalloc_pos_prev = pgalloc_pos;
+		pgtable_populate(pgalloc_pos, pgalloc_end_curr);
+		pgalloc_end_curr = pgalloc_pos_prev;
+	} while (pgalloc_pos < pgalloc_pos_prev);
+}
+
+void setup_vmem(unsigned long online_end, unsigned long asce_limit)
+{
+	unsigned long asce_type;
+	unsigned long asce_bits;
+
+	if (asce_limit == _REGION1_SIZE) {
+		asce_type = _REGION2_ENTRY_EMPTY;
+		asce_bits = _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH;
+	} else {
+		asce_type = _REGION3_ENTRY_EMPTY;
+		asce_bits = _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH;
+	}
+	s390_invalid_asce = invalid_pg_dir | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH;
+
+	crst_table_init((unsigned long *)swapper_pg_dir, asce_type);
+	crst_table_init((unsigned long *)invalid_pg_dir, _REGION3_ENTRY_EMPTY);
+
+	/*
+	 * To allow prefixing the lowcore must be mapped with 4KB pages.
+	 * To prevent creation of a large page at address 0 first map
+	 * the lowcore and create the identity mapping only afterwards.
+	 *
+	 * No further pgtable_populate() calls are allowed after the value
+	 * of pgalloc_pos has been finalized with a call to pgtable_populate_end().
+	 */
+	pgtable_populate_begin(online_end);
+	pgtable_populate(0, sizeof(struct lowcore));
+	pgtable_populate(0, online_end);
+	pgtable_populate_end();
+
+	S390_lowcore.kernel_asce = swapper_pg_dir | asce_bits;
+	S390_lowcore.user_asce = s390_invalid_asce;
+
+	__ctl_load(S390_lowcore.kernel_asce, 1, 1);
+	__ctl_load(S390_lowcore.user_asce, 7, 7);
+	__ctl_load(S390_lowcore.kernel_asce, 13, 13);
+
+	init_mm.context.asce = S390_lowcore.kernel_asce;
+}
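
The boot-time allocator introduced in vmem.c hands out page-table pages from the top of online memory downwards: pgalloc_pos starts at pgalloc_end and is only ever decremented, and boot_check_oom() fails the boot if it would dip below pgalloc_low (the end of the kernel image and initrd). boot_pte_alloc() additionally exploits the fact that an s390 page table (_PAGE_TABLE_SIZE) is half a page, so every page taken from the allocator yields two page tables. The following stand-alone sketch models just that allocation scheme in plain C; the names, sizes and addresses (MODEL_PAGE_SIZE, the 0x100000/0x800000 bounds, and so on) are invented for illustration and are not taken from the kernel sources.

#include <stdio.h>
#include <stdlib.h>

/* Illustrative sizes only: s390 uses 4 KiB pages and 2 KiB page tables. */
#define MODEL_PAGE_SIZE   0x1000UL
#define MODEL_PTABLE_SIZE (MODEL_PAGE_SIZE / 2)

static unsigned long pgalloc_pos;	/* next free byte, grows downward */
static unsigned long pgalloc_low;	/* lowest address we may touch    */

/* Models boot_check_oom(): allocations must never reach pgalloc_low. */
static void check_oom(void)
{
	if (pgalloc_pos < pgalloc_low) {
		fprintf(stderr, "out of memory on boot\n");
		exit(1);
	}
}

/* Models boot_alloc_pages(): carve naturally aligned blocks off the top. */
static unsigned long alloc_pages(unsigned int order)
{
	unsigned long size = MODEL_PAGE_SIZE << order;

	pgalloc_pos -= size;
	pgalloc_pos &= ~(size - 1);	/* round down to the block size */
	check_oom();
	return pgalloc_pos;
}

/* Models boot_pte_alloc(): one 4 KiB page provides two 2 KiB page tables. */
static unsigned long pte_alloc(void)
{
	static unsigned long leftover;
	unsigned long pte;

	if (!leftover) {
		leftover = alloc_pages(0);
		pte = leftover + MODEL_PTABLE_SIZE;	/* hand out upper half */
	} else {
		pte = leftover;				/* then the lower half */
		leftover = 0;
	}
	return pte;
}

int main(void)
{
	pgalloc_low = 0x100000;		/* pretend the kernel image ends here */
	pgalloc_pos = 0x800000;		/* pretend online memory ends here    */

	for (int i = 0; i < 4; i++)
		printf("pte table %d at %#lx\n", i, pte_alloc());
	printf("allocator position now %#lx\n", pgalloc_pos);
	return 0;
}

Running it shows the first two tables coming out of the same 4 KiB page (upper half first), with the allocator position dropping by one page for every two page tables handed out.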
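The populate walk above decides at each level whether a larger mapping can be used: a 2 GB region-third entry when EDAT2 is available and the remaining range is 2 GB aligned and at least 2 GB long, a 1 MB segment entry under the analogous EDAT1 test, and 4 KB PTEs otherwise. The sketch below applies the same alignment-and-size test (IS_ALIGNED(addr, SIZE) && (end - addr) >= SIZE) to a flat walk over one address range and prints the granularity chosen at each step; it collapses the pgd/p4d/pud/pmd hierarchy into a single loop, and the MODEL_* sizes, the hard-coded has_edat flags and the example range are assumptions made for the demonstration only.

#include <stdbool.h>
#include <stdio.h>

/* s390 mapping granularities: 4 KiB pages, 1 MiB segments, 2 GiB regions. */
#define MODEL_PAGE_SIZE 0x1000ULL
#define MODEL_PMD_SIZE  0x100000ULL	/* segment / EDAT1 large page */
#define MODEL_PUD_SIZE  0x80000000ULL	/* region-third / EDAT2 page  */

/* Stand-ins for machine.has_edat1 / machine.has_edat2. */
static const bool has_edat1 = true;
static const bool has_edat2 = true;

static bool aligned(unsigned long long addr, unsigned long long size)
{
	return (addr & (size - 1)) == 0;
}

/* Same test shape as can_large_pud()/can_large_pmd() in vmem.c above. */
static unsigned long long map_step(unsigned long long addr, unsigned long long end)
{
	if (has_edat2 && aligned(addr, MODEL_PUD_SIZE) && end - addr >= MODEL_PUD_SIZE) {
		printf("%#llx: 2 GiB region-third entry\n", addr);
		return addr + MODEL_PUD_SIZE;
	}
	if (has_edat1 && aligned(addr, MODEL_PMD_SIZE) && end - addr >= MODEL_PMD_SIZE) {
		printf("%#llx: 1 MiB segment entry\n", addr);
		return addr + MODEL_PMD_SIZE;
	}
	printf("%#llx: 4 KiB pte\n", addr);
	return addr + MODEL_PAGE_SIZE;
}

int main(void)
{
	/* Start just below a 2 GiB boundary to show the mix of granularities. */
	unsigned long long addr = 0x7fffe000ULL;
	unsigned long long end  = 0x100101000ULL;

	while (addr < end)
		addr = map_step(addr, end);
	return 0;
}

Compiled and run as is, this prints two 4 KiB steps up to the 2 GiB boundary, one 2 GiB region step, one 1 MiB segment step and one final 4 KiB step, mirroring how the real walk falls back to smaller mappings whenever alignment or remaining length rules out a larger one.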