diff options
author | Luca Dariz <luca@orpolo.org> | 2023-04-19 21:47:02 +0200 |
---|---|---|
committer | Samuel Thibault <samuel.thibault@ens-lyon.org> | 2023-05-01 02:00:28 +0200 |
commit | 660bc8ab3813737b3857648b7ec60d88494aeed1 (patch) | |
tree | 1566958542f4af9707992aa02faac3c925d08186 | |
parent | 589735c3220793d1e9423bf6ec751b4625309aac (diff) | |
download | gnumach-660bc8ab3813737b3857648b7ec60d88494aeed1.tar.gz gnumach-660bc8ab3813737b3857648b7ec60d88494aeed1.tar.bz2 gnumach-660bc8ab3813737b3857648b7ec60d88494aeed1.zip |
x86_64: add 64-bit syscall entry point
While theoretically we could still use the same call gate as for
32-bit userspace, that doesn't seem very common, and gcc doesn't seem
to encode the instruction properly. Instead we use syscall/sysret, as
other kernels do (e.g. XNU, Linux). This version still has some
limitations, but should be enough to start working on the 64-bit user
space.
* i386/i386/i386asm.sym: add more constants to fill pcb->iss
* i386/i386/ldt.c: configure 64-bit syscall entry point. We can just
check for the SEP bit as MSRs are always available on x86_64.
* i386/i386/ldt.h: swap CS/DS segments order if !USER32 as required by
sysret
* i386/i386/locore.h: add syscall64 prototype
* i386/i386/msr.h: add MSR definitions and C read/write helpers
* i386/include/mach/i386/syscall_sw.h: remove old BSD_TRAP
* x86_64/Makefrag.am: selectively install syscall_sw.h depending on
USER32
* x86_64/include/syscall_sw.h: add entry point template from user
space
* x86_64/locore.S: implement syscall64 entry point and use it when a
64-bit user-space is configured
Message-Id: <20230419194703.410575-4-luca@orpolo.org>
-rw-r--r-- | i386/i386/i386asm.sym | 15 | ||||
-rw-r--r-- | i386/i386/ldt.c | 16 | ||||
-rw-r--r-- | i386/i386/ldt.h | 9 | ||||
-rw-r--r-- | i386/i386/locore.h | 1 | ||||
-rw-r--r-- | i386/i386/msr.h | 56 | ||||
-rw-r--r-- | i386/include/mach/i386/syscall_sw.h | 12 | ||||
-rw-r--r-- | x86_64/Makefrag.am | 7 | ||||
-rw-r--r-- | x86_64/include/syscall_sw.h | 40 | ||||
-rw-r--r-- | x86_64/locore.S | 158 |
9 files changed, 294 insertions, 20 deletions
diff --git a/i386/i386/i386asm.sym b/i386/i386/i386asm.sym index 8317db6c..1b9b40bb 100644 --- a/i386/i386/i386asm.sym +++ b/i386/i386/i386asm.sym @@ -52,6 +52,8 @@ expr CALL_SINGLE_FUNCTION_BASE offset ApicLocalUnit lu apic_id APIC_ID +offset pcb pcb iss + offset thread th pcb offset thread th task offset thread th recover @@ -82,16 +84,29 @@ size i386_kernel_state iks size i386_exception_link iel +offset i386_saved_state r gs +offset i386_saved_state r fs offset i386_saved_state r cs offset i386_saved_state r uesp offset i386_saved_state r eax +offset i386_saved_state r ebx +offset i386_saved_state r ecx +offset i386_saved_state r edx +offset i386_saved_state r ebp offset i386_saved_state r trapno offset i386_saved_state r err offset i386_saved_state r efl R_EFLAGS offset i386_saved_state r eip offset i386_saved_state r cr2 offset i386_saved_state r edi +offset i386_saved_state r esi #ifdef __x86_64__ +offset i386_saved_state r r8 +offset i386_saved_state r r9 +offset i386_saved_state r r10 +offset i386_saved_state r r12 +offset i386_saved_state r r13 +offset i386_saved_state r r14 offset i386_saved_state r r15 #endif diff --git a/i386/i386/ldt.c b/i386/i386/ldt.c index b86a0e3c..4d7ec19a 100644 --- a/i386/i386/ldt.c +++ b/i386/i386/ldt.c @@ -31,6 +31,7 @@ #include <mach/xen.h> #include <intel/pmap.h> +#include <kern/debug.h> #include "vm_param.h" #include "seg.h" @@ -38,6 +39,7 @@ #include "ldt.h" #include "locore.h" #include "mp_desc.h" +#include "msr.h" #ifdef MACH_PV_DESCRIPTORS /* It is actually defined in xen_boothdr.S */ @@ -65,10 +67,22 @@ ldt_fill(struct real_descriptor *myldt, struct real_descriptor *mygdt) ACC_PL_K|ACC_LDT, 0); #endif /* MACH_PV_DESCRIPTORS */ - /* Initialize the 32bit LDT descriptors. */ + /* Initialize the syscall entry point */ +#if defined(__x86_64__) && ! 
defined(USER32) + if (!CPU_HAS_FEATURE(CPU_FEATURE_SEP)) + panic("syscall support is missing on 64 bit"); + /* Enable 64-bit syscalls */ + wrmsr(MSR_REG_EFER, rdmsr(MSR_REG_EFER) | MSR_EFER_SCE); + wrmsr(MSR_REG_LSTAR, (vm_offset_t)syscall64); + wrmsr(MSR_REG_STAR, ((((long)USER_CS - 16) << 16) | (long)KERNEL_CS) << 32); + wrmsr(MSR_REG_FMASK, 0); // ? +#else /* defined(__x86_64__) && ! defined(USER32) */ fill_ldt_gate(myldt, USER_SCALL, (vm_offset_t)&syscall, KERNEL_CS, ACC_PL_U|ACC_CALL_GATE, 0); +#endif /* defined(__x86_64__) && ! defined(USER32) */ + + /* Initialize the 32bit LDT descriptors. */ fill_ldt_descriptor(myldt, USER_CS, VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS-VM_MIN_USER_ADDRESS-4096, diff --git a/i386/i386/ldt.h b/i386/i386/ldt.h index b15f11a5..51867f47 100644 --- a/i386/i386/ldt.h +++ b/i386/i386/ldt.h @@ -43,11 +43,16 @@ * User descriptors for Mach - 32-bit flat address space */ #define USER_SCALL 0x07 /* system call gate */ -#ifdef __x86_64__ +#if defined(__x86_64__) && ! 
defined(USER32) /* Call gate needs two entries */ -#endif + +/* The sysret instruction puts some constraints on the user segment indexes */ +#define USER_CS 0x1f /* user code segment */ +#define USER_DS 0x17 /* user data segment */ +#else #define USER_CS 0x17 /* user code segment */ #define USER_DS 0x1f /* user data segment */ +#endif #define LDTSZ 4 diff --git a/i386/i386/locore.h b/i386/i386/locore.h index a8807dbf..4388ea28 100644 --- a/i386/i386/locore.h +++ b/i386/i386/locore.h @@ -57,6 +57,7 @@ extern int inst_fetch (int eip, int cs); extern void cpu_shutdown (void); extern int syscall (void); +extern int syscall64 (void); extern unsigned int cpu_features[2]; diff --git a/i386/i386/msr.h b/i386/i386/msr.h new file mode 100644 index 00000000..8f09b80b --- /dev/null +++ b/i386/i386/msr.h @@ -0,0 +1,56 @@ +/* + * Copyright (C) 2023 Free Software Foundation + * + * This program is free software ; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation ; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY ; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with the program ; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#ifndef _MACHINE_MSR_H_ +#define _MACHINE_MSR_H_ + +#define MSR_REG_EFER 0xC0000080 +#define MSR_REG_STAR 0xC0000081 +#define MSR_REG_LSTAR 0xC0000082 +#define MSR_REG_CSTAR 0xC0000083 +#define MSR_REG_FMASK 0xC0000084 +#define MSR_REG_FSBASE 0xC0000100 +#define MSR_REG_GSBASE 0xC0000101 + +#define MSR_EFER_SCE 0x00000001 + +#ifndef __ASSEMBLER__ + +static inline void wrmsr(uint32_t regaddr, uint64_t value) +{ + uint32_t low = (uint32_t) value, high = ((uint32_t) (value >> 32)); + asm volatile("wrmsr" + : + : "c" (regaddr), "a" (low), "d" (high) + : "memory" /* wrmsr may cause a read from memory, so + * make the compiler flush any changes */ + ); +} + +static inline uint64_t rdmsr(uint32_t regaddr) +{ + uint32_t low, high; + asm volatile("rdmsr" + : "=a" (low), "=d" (high) + : "c" (regaddr) + ); + return ((uint64_t)high << 32) | low; +} +#endif /* __ASSEMBLER__ */ + +#endif /* _MACHINE_MSR_H_ */ diff --git a/i386/include/mach/i386/syscall_sw.h b/i386/include/mach/i386/syscall_sw.h index 86f6ff2f..9eeb2939 100644 --- a/i386/include/mach/i386/syscall_sw.h +++ b/i386/include/mach/i386/syscall_sw.h @@ -29,21 +29,11 @@ #include <mach/machine/asm.h> -#if BSD_TRAP -#define kernel_trap(trap_name,trap_number,number_args) \ +#define kernel_trap(trap_name,trap_number,number_args) \ ENTRY(trap_name) \ movl $ trap_number,%eax; \ SVC; \ - jb LCL(cerror); \ ret; \ END(trap_name) -#else -#define kernel_trap(trap_name,trap_number,number_args) \ -ENTRY(trap_name) \ - movl $ trap_number,%eax; \ - SVC; \ - ret; \ -END(trap_name) -#endif #endif /* _MACH_I386_SYSCALL_SW_H_ */ diff --git a/x86_64/Makefrag.am b/x86_64/Makefrag.am index d3735890..fb225aa5 100644 --- a/x86_64/Makefrag.am +++ b/x86_64/Makefrag.am @@ -175,11 +175,16 @@ include_mach_x86_64_HEADERS = \ i386/include/mach/i386/mach_i386_types.h \ i386/include/mach/i386/machine_types.defs \ i386/include/mach/i386/multiboot.h \ - i386/include/mach/i386/syscall_sw.h \ i386/include/mach/i386/thread_status.h \ 
i386/include/mach/i386/trap.h \ i386/include/mach/i386/vm_param.h \ i386/include/mach/i386/vm_types.h + +if enable_user32 +include_mach_x86_64_HEADERS += i386/include/mach/i386/syscall_sw.h +else +include_mach_x86_64_HEADERS += x86_64/include/syscall_sw.h +endif # # Platform specific parts. diff --git a/x86_64/include/syscall_sw.h b/x86_64/include/syscall_sw.h new file mode 100644 index 00000000..4e03f28c --- /dev/null +++ b/x86_64/include/syscall_sw.h @@ -0,0 +1,40 @@ +/* + * Mach Operating System + * Copyright (c) 1991,1990,1989,1988 Carnegie Mellon University + * All Rights Reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR + * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. 
+ */ + +#ifndef _MACH_X86_64_SYSCALL_SW_H_ +#define _MACH_X86_64_SYSCALL_SW_H_ + +#include <mach/machine/asm.h> + +#define kernel_trap(trap_name,trap_number,number_args) \ +ENTRY(trap_name) \ + movq $ trap_number,%rax; \ + movq %rcx,%r10; \ + syscall; \ + ret; \ +END(trap_name) + +#endif /* _MACH_X86_64_SYSCALL_SW_H_ */ diff --git a/x86_64/locore.S b/x86_64/locore.S index bffdea63..0d7cdd0e 100644 --- a/x86_64/locore.S +++ b/x86_64/locore.S @@ -423,13 +423,17 @@ ENTRY(t_debug) /* Note: handling KERNEL_RING value by hand */ testq $2,8(%rsp) /* is trap from kernel mode? */ jnz 0f /* if so: */ +#ifdef USER32 cmpq $syscall_entry,(%rsp) /* system call entry? */ jne 0f /* if so: */ /* flags are sitting where syscall */ /* wants them */ addq $32,%rsp /* remove eip/cs */ jmp syscall_entry_2 /* continue system call entry */ - +#else + // TODO: implement the 64-bit case + ud2 +#endif 0: pushq $0 /* otherwise: */ pushq $(T_DEBUG) /* handle as normal */ jmp EXT(alltraps) /* debug fault */ @@ -497,12 +501,12 @@ trap_from_user: _take_trap: movq %rbx,%rdi /* pass register save area to trap */ call EXT(user_trap) /* call user trap routine */ - +#ifdef USER32 orq %rax,%rax /* emulated syscall? */ jz 1f /* no, just return */ movq R_EAX(%rbx),%rax /* yes, get syscall number */ jmp syscall_entry_3 /* and emulate it */ - +#endif 1: movq (%rsp),%rsp /* switch back to PCB stack */ @@ -1055,6 +1059,7 @@ ud2 #endif /* MACH_TTD */ +#ifdef USER32 /* * System call enters through a call gate. Flags are not saved - * we must shuffle stack to look like trap save area. @@ -1269,7 +1274,152 @@ syscall_addr: movq $(T_PF_USER),R_ERR(%rbx) /* set error code - read user space */ jmp _take_trap /* treat as a trap */ +END(syscall) + +#else /* USER32 */ + +/* Entry point for 64-bit syscalls. + * On entry we're still on the user stack, so better not use it. Instead we + * save the thread state immediately in thread->pcb->iss, then try to invoke + * the syscall. 
+ * Note: emulated syscalls seem to not be used anymore in GNU/Hurd, so they + * are not handled here. + * TODO: + - for now we assume the return address is canonical, but apparently there + can be cases where it's not (see how Linux handles this). Does it apply + here? + - do we need to check for ast on syscalls? Maybe on interrupts is enough + - check that the case where a task is suspended, and later returns via + iretq from return_from_trap, works fine in all combinations + */ +ENTRY(syscall64) + /* RFLAGS[32:63] are reserved, so combine syscall num (32 bit) and + * eflags in RAX to allow using r11 as temporary register + */ + shlq $32,%r11 + shlq $32,%rax /* make sure bits 32:63 of %rax are zero */ + shrq $32,%rax + or %r11,%rax + + /* Save thread state in pcb->iss, as on exception entry. + * Since this is triggered synchronously from userspace, we could + * save only the callee-preserved status according to the C ABI, + * plus RIP and EFLAGS for sysret + */ + CPU_NUMBER(%r11) + movq CX(EXT(active_threads),%r11),%r11 /* point to current thread */ + movq TH_PCB(%r11),%r11 /* point to pcb */ + addq $ PCB_ISS,%r11 /* point to saved state */ + + mov %rsp,R_UESP(%r11) /* callee-preserved register */ + mov %rcx,R_EIP(%r11) /* syscall places user RIP in RCX */ + mov %rbx,R_EBX(%r11) /* callee-preserved register */ + mov %rax,%rbx /* Now we can unpack eflags again */ + shr $32,%rbx + mov %rbx,R_EFLAGS(%r11) /* ... and save them in pcb as well */ + mov %rbp,R_EBP(%r11) /* callee-preserved register */ + mov %r12,R_R12(%r11) /* callee-preserved register */ + mov %r13,R_R13(%r11) /* callee-preserved register */ + mov %r14,R_R14(%r11) /* callee-preserved register */ + mov %r15,R_R15(%r11) /* callee-preserved register */ + + /* Save syscall number and args for SYSCALL_EXAMINE/MSG_EXAMINE in glibc. + * Note: syscall number is only 32 bit, in EAX, so we sign-extend it in + * RAX to mask the EFLAGS bits. 
+ */ + cdqe /* sign-extend EAX in RAX */ + mov %rax,R_EAX(%r11) /* syscall number */ + mov %rdi,R_EDI(%r11) /* syscall arg0 */ + mov %rsi,R_ESI(%r11) /* syscall arg1 */ + mov %rdx,R_EDX(%r11) /* syscall arg2 */ + mov %r10,R_R10(%r11) /* syscall arg3 */ + mov %r8,R_R8(%r11) /* syscall arg4 */ + mov %r9,R_R9(%r11) /* syscall arg5 */ + + mov %r11,%rbx /* prepare for error handling */ + mov %r10,%rcx /* fix arg3 location according to C ABI */ + + /* switch to kernel stack */ + CPU_NUMBER(%r11) + movq CX(EXT(kernel_stack),%r11),%rsp + + /* Now we have saved state and args 1-6 are in place. + * Before invoking the syscall we do some bound checking and, + * if we have more that 6 arguments, we need to copy the + * remaining ones to the kernel stack, handling page faults when + * accessing the user stack. + */ + negl %eax /* get system call number */ + jl _syscall64_range /* out of range if it was positive */ + cmpl EXT(mach_trap_count),%eax /* check system call table bounds */ + jg _syscall64_range /* error if out of range */ + shll $5,%eax /* manual indexing of mach_trap_t */ + + /* check if we need to place some arguments on the stack */ +_syscall64_args_stack: + mov EXT(mach_trap_table)(%rax),%r10 /* get number of arguments */ + subq $6,%r10 /* the first 6 args are already in place */ + jle _syscall64_call /* skip argument copy if num args <= 6 */ + + movq R_UESP(%rbx),%r11 /* get user stack pointer */ + addq $8,%r11 /* Skip user return address */ + lea (%r11,%r10,8),%r11 /* point past last argument */ + +0: subq $8,%r11 + RECOVER(_syscall64_addr_push) + mov (%r11),%r12 + pushq %r12 /* push argument on stack */ + dec %r10 + jnz 0b /* loop for all remaining arguments */ + +_syscall64_call: + call *EXT(mach_trap_table)+8(%rax) /* call procedure */ + // XXX: check ast on exit? + + /* Restore thread state and return to user using sysret. 
*/ + CPU_NUMBER(%r11) + movq CX(EXT(active_threads),%r11),%r11 /* point to current thread */ + movq TH_PCB(%r11),%r11 /* point to pcb */ + addq $ PCB_ISS,%r11 /* point to saved state */ + + /* Restore syscall args. Note: we can't restore the syscall number in + * RAX because it needs to hold the return value.*/ + mov R_EDI(%r11),%rdi /* syscall arg0 */ + mov R_ESI(%r11),%rsi /* syscall arg1 */ + mov R_EDX(%r11),%rdx /* syscall arg2 */ + mov R_R10(%r11),%r10 /* syscall arg3 */ + mov R_R8(%r11),%r8 /* syscall arg4 */ + mov R_R9(%r11),%r9 /* syscall arg5 */ + + mov R_UESP(%r11),%rsp /* callee-preserved register, + * also switch back to user stack */ + mov R_EIP(%r11),%rcx /* sysret convention */ + mov R_EBX(%r11),%rbx /* callee-preserved register */ + mov R_EBP(%r11),%rbp /* callee-preserved register */ + mov R_R12(%r11),%r12 /* callee-preserved register */ + mov R_R13(%r11),%r13 /* callee-preserved register */ + mov R_R14(%r11),%r14 /* callee-preserved register */ + mov R_R15(%r11),%r15 /* callee-preserved register */ + mov R_EFLAGS(%r11),%r11 /* sysret convention */ + + sysretq /* fast return to user-space, the thread didn't block */ + +/* Error handling fragments, from here we jump directly to the trap handler */ +_syscall64_addr_push: + movq %r11,R_CR2(%rbx) /* set fault address */ + movq $(T_PAGE_FAULT),R_TRAPNO(%rbx) /* set page-fault trap */ + movq $(T_PF_USER),R_ERR(%rbx) /* set error code - read user space */ + jmp _take_trap /* treat as a trap */ + +_syscall64_range: + movq $(T_INVALID_OPCODE),R_TRAPNO(%rbx) + /* set invalid-operation trap */ + movq $0,R_ERR(%rbx) /* clear error code */ + jmp _take_trap /* treat as a trap */ + +END(syscall64) +#endif /* USER32 */ .data DATA(cpu_features) @@ -1279,8 +1429,6 @@ DATA(cpu_features_ecx) .long 0 .text -END(syscall) - /* Discover what kind of cpu we have; return the family number (3, 4, 5, 6, for 386, 486, 586, 686 respectively). */ ENTRY(discover_x86_cpu_type) |