From 93d0bfbe8ffa9c3dcbfc8e953216542f500dac07 Mon Sep 17 00:00:00 2001 From: Michael Jeanson Date: Wed, 10 Jul 2024 15:48:49 -0400 Subject: [PATCH] nptl: Move the rseq area to the 'extra TLS' block Move the rseq area to the newly added 'extra TLS' block, this is the last step in adding support for the rseq extended ABI. The size of the rseq area is now dynamic and depends on the rseq features reported by the kernel through the elf auxiliary vector. This will allow applications to use rseq features past the 32 bytes of the original rseq ABI as they become available in future kernels. Signed-off-by: Michael Jeanson Reviewed-by: Mathieu Desnoyers Reviewed-by: Florian Weimer --- nptl/pthread_create.c | 2 +- sysdeps/nptl/dl-tls_init_tp.c | 9 -- sysdeps/unix/sysv/linux/Makefile | 10 ++ sysdeps/unix/sysv/linux/dl-parse_auxv.h | 12 +-- sysdeps/unix/sysv/linux/rseq-internal.h | 49 +++++++--- sysdeps/unix/sysv/linux/sched_getcpu.c | 3 +- .../unix/sysv/linux/tst-rseq-disable-static.c | 1 + sysdeps/unix/sysv/linux/tst-rseq-disable.c | 79 ++++++++++++--- .../unix/sysv/linux/tst-rseq-nptl-static.c | 1 + sysdeps/unix/sysv/linux/tst-rseq-static.c | 1 + sysdeps/unix/sysv/linux/tst-rseq.c | 97 ++++++++++++++++--- sysdeps/unix/sysv/linux/tst-rseq.h | 2 +- 12 files changed, 207 insertions(+), 59 deletions(-) create mode 100644 sysdeps/unix/sysv/linux/tst-rseq-disable-static.c create mode 100644 sysdeps/unix/sysv/linux/tst-rseq-nptl-static.c create mode 100644 sysdeps/unix/sysv/linux/tst-rseq-static.c diff --git a/nptl/pthread_create.c b/nptl/pthread_create.c index 01e8a86980..9ae5423df0 100644 --- a/nptl/pthread_create.c +++ b/nptl/pthread_create.c @@ -696,7 +696,7 @@ __pthread_create_2_1 (pthread_t *newthread, const pthread_attr_t *attr, /* Inherit rseq registration state. Without seccomp filters, rseq registration will either always fail or always succeed. */ - if ((int) THREAD_GETMEM_VOLATILE (self, rseq_area.cpu_id) >= 0) + if ((int) RSEQ_GETMEM_ONCE (cpu_id) >= 0) pd->flags |= ATTR_FLAG_DO_RSEQ; /* Initialize the field for the ID of the thread which is waiting diff --git a/sysdeps/nptl/dl-tls_init_tp.c b/sysdeps/nptl/dl-tls_init_tp.c index 46478324fc..f487bfb66e 100644 --- a/sysdeps/nptl/dl-tls_init_tp.c +++ b/sysdeps/nptl/dl-tls_init_tp.c @@ -109,15 +109,6 @@ __tls_init_tp (void) bool do_rseq = TUNABLE_GET (rseq, int, NULL); if (!rseq_register_current_thread (pd, do_rseq)) _rseq_size = 0; - -#ifdef RSEQ_SIG - /* This should be a compile-time constant, but the current - infrastructure makes it difficult to determine its value. Not - all targets support __thread_pointer, so set __rseq_offset only - if the rseq registration may have happened because RSEQ_SIG is - defined. */ - _rseq_offset = (char *) &pd->rseq_area - (char *) __thread_pointer (); -#endif } /* Set initial thread's stack block from 0 up to __libc_stack_end. diff --git a/sysdeps/unix/sysv/linux/Makefile b/sysdeps/unix/sysv/linux/Makefile index 8a755293b3..c8d30cc405 100644 --- a/sysdeps/unix/sysv/linux/Makefile +++ b/sysdeps/unix/sysv/linux/Makefile @@ -268,6 +268,11 @@ tests-internal += \ tst-rseq-disable \ # tests-internal +tests-static += \ + tst-rseq-disable-static \ + tst-rseq-static \ + # tests-static + tests-time64 += \ tst-adjtimex-time64 \ tst-clock_adjtime-time64 \ @@ -411,6 +416,7 @@ $(objpfx)tst-sched-consts.out: ../sysdeps/unix/sysv/linux/tst-sched-consts.py $(objpfx)tst-sched-consts.out: $(sysdeps-linux-python-deps) tst-rseq-disable-ENV = GLIBC_TUNABLES=glibc.pthread.rseq=0 +tst-rseq-disable-static-ENV = GLIBC_TUNABLES=glibc.pthread.rseq=0 endif # $(subdir) == misc @@ -685,4 +691,8 @@ tests += \ tests-internal += \ tst-rseq-nptl \ # tests-internal + +tests-static += \ + tst-rseq-nptl-static \ + # tests-static endif diff --git a/sysdeps/unix/sysv/linux/dl-parse_auxv.h b/sysdeps/unix/sysv/linux/dl-parse_auxv.h index 2d4243734c..41250c96d0 100644 --- a/sysdeps/unix/sysv/linux/dl-parse_auxv.h +++ b/sysdeps/unix/sysv/linux/dl-parse_auxv.h @@ -61,15 +61,9 @@ void _dl_parse_auxv (ElfW(auxv_t) *av, dl_parse_auxv_t auxv_values) #endif /* Get the rseq feature size, with a minimum of RSEQ_AREA_SIZE_INITIAL_USED - (20) for kernels that don't have AT_RSEQ_FEATURE_SIZE. Limit the feature - size to RSEQ_AREA_SIZE_MAX_USED (28) which fits the rseq area in 'struct - pthread' and represents the maximum feature size of currently released - kernels. Since no kernels currently cross the 32 bytes of the original - ABI, the semantics of a feature size of 32 or more are still undetermined. - */ - _rseq_size = MIN (MAX (auxv_values[AT_RSEQ_FEATURE_SIZE], - RSEQ_AREA_SIZE_INITIAL_USED), - RSEQ_AREA_SIZE_MAX_USED); + (20) for kernels that don't have AT_RSEQ_FEATURE_SIZE. */ + _rseq_size = MAX (auxv_values[AT_RSEQ_FEATURE_SIZE], + RSEQ_AREA_SIZE_INITIAL_USED); _rseq_align = MAX (auxv_values[AT_RSEQ_ALIGN], RSEQ_MIN_ALIGN); DL_PLATFORM_AUXV diff --git a/sysdeps/unix/sysv/linux/rseq-internal.h b/sysdeps/unix/sysv/linux/rseq-internal.h index 00be15cfc8..f89e784243 100644 --- a/sysdeps/unix/sysv/linux/rseq-internal.h +++ b/sysdeps/unix/sysv/linux/rseq-internal.h @@ -26,6 +26,30 @@ #include #include #include +#include + +/* rseq area registered with the kernel. Use a custom definition here to + isolate from the system provided header which could lack some fields of the + Extended ABI. + + This is only used to get the field offsets and sizes, it should never be + used for direct object allocations. + + Access to fields of the Extended ABI beyond the 20 bytes of the original ABI + (after 'flags') must be gated by a check of the feature size. */ +struct rseq_area +{ + /* Original ABI. */ + uint32_t cpu_id_start; + uint32_t cpu_id; + uint64_t rseq_cs; + uint32_t flags; + /* Extended ABI. */ + uint32_t node_id; + uint32_t mm_cid; + /* Flexible array member to discourage direct object allocations. */ + char end[]; +}; /* Minimum size of the rseq area allocation required by the syscall. The actually used rseq feature size may be less (20 bytes initially). */ @@ -47,10 +71,12 @@ extern size_t _rseq_align attribute_hidden; /* Size of the active features in the rseq area. Populated from the auxiliary vector with a minimum of '20'. + Set to '0' on registration failure of the main thread. In .data.relro but not yet write-protected. */ extern unsigned int _rseq_size attribute_hidden; -/* Offset from the thread pointer to the rseq area. +/* Offset from the thread pointer to the rseq area, always set to allow + checking the registration status by reading the 'cpu_id' field. In .data.relro but not yet write-protected. */ extern ptrdiff_t _rseq_offset attribute_hidden; @@ -75,34 +101,35 @@ rseq_register_current_thread (struct pthread *self, bool do_rseq) { unsigned int size = __rseq_size; + /* The feature size can be smaller than the minimum rseq area size of 32 + bytes accepted by the syscall, if this is the case, bump the size of + the registration to the minimum. The 'extra TLS' block is always at + least 32 bytes. */ if (size < RSEQ_AREA_SIZE_INITIAL) - /* The initial implementation used only 20 bytes out of 32, - but still expected size 32. */ size = RSEQ_AREA_SIZE_INITIAL; /* Initialize the rseq fields that are read by the kernel on registration, there is no guarantee that struct pthread is cleared on all architectures. */ - THREAD_SETMEM (self, rseq_area.cpu_id, RSEQ_CPU_ID_UNINITIALIZED); - THREAD_SETMEM (self, rseq_area.cpu_id_start, 0); - THREAD_SETMEM (self, rseq_area.rseq_cs, 0); - THREAD_SETMEM (self, rseq_area.flags, 0); + RSEQ_SETMEM (cpu_id, RSEQ_CPU_ID_UNINITIALIZED); + RSEQ_SETMEM (cpu_id_start, 0); + RSEQ_SETMEM (rseq_cs, 0); + RSEQ_SETMEM (flags, 0); - int ret = INTERNAL_SYSCALL_CALL (rseq, &self->rseq_area, - size, 0, RSEQ_SIG); + int ret = INTERNAL_SYSCALL_CALL (rseq, RSEQ_SELF (), size, 0, RSEQ_SIG); if (!INTERNAL_SYSCALL_ERROR_P (ret)) return true; } /* When rseq is disabled by tunables or the registration fails, inform userspace by setting 'cpu_id' to RSEQ_CPU_ID_REGISTRATION_FAILED. */ - THREAD_SETMEM (self, rseq_area.cpu_id, RSEQ_CPU_ID_REGISTRATION_FAILED); + RSEQ_SETMEM (cpu_id, RSEQ_CPU_ID_REGISTRATION_FAILED); return false; } #else /* RSEQ_SIG */ static inline bool rseq_register_current_thread (struct pthread *self, bool do_rseq) { - THREAD_SETMEM (self, rseq_area.cpu_id, RSEQ_CPU_ID_REGISTRATION_FAILED); + RSEQ_SETMEM (cpu_id, RSEQ_CPU_ID_REGISTRATION_FAILED); return false; } #endif /* RSEQ_SIG */ diff --git a/sysdeps/unix/sysv/linux/sched_getcpu.c b/sysdeps/unix/sysv/linux/sched_getcpu.c index 9a2e2a5663..828b651320 100644 --- a/sysdeps/unix/sysv/linux/sched_getcpu.c +++ b/sysdeps/unix/sysv/linux/sched_getcpu.c @@ -19,6 +19,7 @@ #include #include #include +#include static int vsyscall_sched_getcpu (void) @@ -36,6 +37,6 @@ vsyscall_sched_getcpu (void) int sched_getcpu (void) { - int cpu_id = THREAD_GETMEM_VOLATILE (THREAD_SELF, rseq_area.cpu_id); + int cpu_id = RSEQ_GETMEM_ONCE (cpu_id); return __glibc_likely (cpu_id >= 0) ? cpu_id : vsyscall_sched_getcpu (); } diff --git a/sysdeps/unix/sysv/linux/tst-rseq-disable-static.c b/sysdeps/unix/sysv/linux/tst-rseq-disable-static.c new file mode 100644 index 0000000000..2687d13d3d --- /dev/null +++ b/sysdeps/unix/sysv/linux/tst-rseq-disable-static.c @@ -0,0 +1 @@ +#include "tst-rseq-disable.c" diff --git a/sysdeps/unix/sysv/linux/tst-rseq-disable.c b/sysdeps/unix/sysv/linux/tst-rseq-disable.c index 0e191ffeb9..8e9c11f653 100644 --- a/sysdeps/unix/sysv/linux/tst-rseq-disable.c +++ b/sysdeps/unix/sysv/linux/tst-rseq-disable.c @@ -26,32 +26,82 @@ #include #ifdef RSEQ_SIG +# include +# include +# include "tst-rseq.h" + +/* Used to test private registration with the rseq system call because glibc + rseq is disabled. */ +static __thread struct rseq local_rseq = { + .cpu_id = RSEQ_CPU_ID_REGISTRATION_FAILED, +}; + +/* Used to check if the address of the rseq area comes before or after the tls + blocks depending on the TLS model. */ +static __thread char tls_var __attribute__ ((tls_model ("initial-exec"))); /* Check that rseq can be registered and has not been taken by glibc. */ static void check_rseq_disabled (void) { - struct pthread *pd = THREAD_SELF; + struct rseq *rseq_abi = (struct rseq *) ((char *) __thread_pointer () + + __rseq_offset); +#if TLS_TCB_AT_TP + /* The rseq area block should come before the thread pointer and be at least + 32 bytes. */ + TEST_VERIFY (__rseq_offset <= -RSEQ_AREA_SIZE_INITIAL); + + /* The rseq area block should come before TLS variables. */ + TEST_VERIFY ((intptr_t) rseq_abi < (intptr_t) &tls_var); +#elif TLS_DTV_AT_TP + /* The rseq area block should come after the TCB, add the TLS block offset to + the rseq offset to get a value relative to the TCB and test that it's + non-negative. */ + TEST_VERIFY (__rseq_offset + TLS_TP_OFFSET >= 0); + + /* The rseq area block should come after TLS variables. */ + TEST_VERIFY ((intptr_t) rseq_abi > (intptr_t) &tls_var); +#else +# error "Either TLS_TCB_AT_TP or TLS_DTV_AT_TP must be defined" +#endif + + /* __rseq_flags is unused and should always be '0'. */ TEST_COMPARE (__rseq_flags, 0); - TEST_VERIFY ((char *) __thread_pointer () + __rseq_offset - == (char *) &pd->rseq_area); - TEST_COMPARE (__rseq_size, 0); - TEST_COMPARE ((int) pd->rseq_area.cpu_id, RSEQ_CPU_ID_REGISTRATION_FAILED); - int ret = syscall (__NR_rseq, &pd->rseq_area, sizeof (pd->rseq_area), - 0, RSEQ_SIG); + /* When rseq is not registered, __rseq_size should always be '0'. */ + TEST_COMPARE (__rseq_size, 0); + + /* When rseq is not registered, the 'cpu_id' field should be set to + RSEQ_CPU_ID_REGISTRATION_FAILED. */ + TEST_COMPARE ((int) rseq_abi->cpu_id, RSEQ_CPU_ID_REGISTRATION_FAILED); + + /* Test a rseq registration which should succeed since the internal + registration is disabled. */ + int ret = syscall (__NR_rseq, &local_rseq, RSEQ_AREA_SIZE_INITIAL, 0, RSEQ_SIG); if (ret == 0) { - ret = syscall (__NR_rseq, &pd->rseq_area, sizeof (pd->rseq_area), + /* A successful registration should set the cpu id. */ + TEST_VERIFY (local_rseq.cpu_id >= 0); + + /* Test we can also unregister rseq. */ + ret = syscall (__NR_rseq, &local_rseq, RSEQ_AREA_SIZE_INITIAL, RSEQ_FLAG_UNREGISTER, RSEQ_SIG); TEST_COMPARE (ret, 0); - pd->rseq_area.cpu_id = RSEQ_CPU_ID_REGISTRATION_FAILED; } else { - TEST_VERIFY (errno != -EINVAL); - TEST_VERIFY (errno != -EBUSY); + /* Check if we failed with EINVAL which would mean an invalid rseq flags, + a mis-aligned rseq area address or an incorrect rseq size. */ + TEST_VERIFY (errno != EINVAL); + + /* Check if we failed with EBUSY which means an existing rseq + registration. */ + TEST_VERIFY (errno != EBUSY); + + /* Check if we failed with EFAULT which means an invalid rseq area + address. */ + TEST_VERIFY (errno != EFAULT); } } @@ -71,6 +121,13 @@ proc_func (void *ignored) static int do_test (void) { + printf ("info: __rseq_size: %u\n", __rseq_size); + printf ("info: __rseq_offset: %td\n", __rseq_offset); + printf ("info: __rseq_flags: %u\n", __rseq_flags); + printf ("info: getauxval (AT_RSEQ_FEATURE_SIZE): %ld\n", + getauxval (AT_RSEQ_FEATURE_SIZE)); + printf ("info: getauxval (AT_RSEQ_ALIGN): %ld\n", getauxval (AT_RSEQ_ALIGN)); + puts ("info: checking main thread"); check_rseq_disabled (); diff --git a/sysdeps/unix/sysv/linux/tst-rseq-nptl-static.c b/sysdeps/unix/sysv/linux/tst-rseq-nptl-static.c new file mode 100644 index 0000000000..6e2c923bb9 --- /dev/null +++ b/sysdeps/unix/sysv/linux/tst-rseq-nptl-static.c @@ -0,0 +1 @@ +#include "tst-rseq-nptl.c" diff --git a/sysdeps/unix/sysv/linux/tst-rseq-static.c b/sysdeps/unix/sysv/linux/tst-rseq-static.c new file mode 100644 index 0000000000..1d97f3bd3d --- /dev/null +++ b/sysdeps/unix/sysv/linux/tst-rseq-static.c @@ -0,0 +1 @@ +#include "tst-rseq.c" diff --git a/sysdeps/unix/sysv/linux/tst-rseq.c b/sysdeps/unix/sysv/linux/tst-rseq.c index b152280eff..00181cfefb 100644 --- a/sysdeps/unix/sysv/linux/tst-rseq.c +++ b/sysdeps/unix/sysv/linux/tst-rseq.c @@ -19,6 +19,8 @@ not linked against libpthread. */ #include +#include +#include #include #include #include @@ -32,25 +34,82 @@ # include # include # include +# include +# include # include "tst-rseq.h" +/* Used to check if the address of the rseq area comes before or after the tls + blocks depending on the TLS model. */ +static __thread char tls_var __attribute__ ((tls_model ("initial-exec"))); + static void do_rseq_main_test (void) { - struct pthread *pd = THREAD_SELF; - size_t rseq_feature_size = MIN (MAX (getauxval (AT_RSEQ_FEATURE_SIZE), - RSEQ_AREA_SIZE_INITIAL_USED), - RSEQ_AREA_SIZE_MAX_USED); + size_t rseq_align = MAX (getauxval (AT_RSEQ_ALIGN), RSEQ_MIN_ALIGN); + size_t rseq_feature_size = MAX (getauxval (AT_RSEQ_FEATURE_SIZE), + RSEQ_AREA_SIZE_INITIAL_USED); + size_t rseq_alloc_size = roundup (MAX (rseq_feature_size, + RSEQ_AREA_SIZE_INITIAL_USED), rseq_align); + struct rseq *rseq_abi = __thread_pointer () + __rseq_offset; TEST_VERIFY_EXIT (rseq_thread_registered ()); + + /* __rseq_flags is unused and should always be '0'. */ TEST_COMPARE (__rseq_flags, 0); - TEST_VERIFY ((char *) __thread_pointer () + __rseq_offset - == (char *) &pd->rseq_area); + + /* When rseq is registered, __rseq_size should report the feature size. */ TEST_COMPARE (__rseq_size, rseq_feature_size); + + /* When rseq is registered, the 'cpu_id' field should be set to a valid cpu + * number. */ + TEST_VERIFY ((int32_t) rseq_abi->cpu_id >= 0); + + /* The rseq area address must be aligned. */ + TEST_VERIFY (((unsigned long) rseq_abi % rseq_align) == 0); + +#if TLS_TCB_AT_TP + /* The rseq area block should come before the thread pointer and be at least + 32 bytes. */ + TEST_VERIFY (__rseq_offset <= -RSEQ_AREA_SIZE_INITIAL); + + /* The rseq area block should come before TLS variables. */ + TEST_VERIFY ((intptr_t) rseq_abi < (intptr_t) &tls_var); +#elif TLS_DTV_AT_TP + /* The rseq area block should come after the TCB, add the TLS block offset to + the rseq offset to get a value relative to the TCB and test that it's + non-negative. */ + TEST_VERIFY (__rseq_offset + TLS_TP_OFFSET >= 0); + + /* The rseq area block should come after TLS variables. */ + TEST_VERIFY ((intptr_t) rseq_abi > (intptr_t) &tls_var); +#else +# error "Either TLS_TCB_AT_TP or TLS_DTV_AT_TP must be defined" +#endif + + /* Test a rseq registration with the same arguments as the internal + registration which should fail with errno == EBUSY. */ + TEST_VERIFY (((unsigned long) rseq_abi % rseq_align) == 0); + TEST_VERIFY (__rseq_size <= rseq_alloc_size); + int ret = syscall (__NR_rseq, rseq_abi, rseq_alloc_size, 0, RSEQ_SIG); + TEST_VERIFY (ret != 0); + TEST_COMPARE (errno, EBUSY); +} + +static void * +thread_func (void *ignored) +{ + do_rseq_main_test (); + return NULL; } static void -do_rseq_test (void) +proc_func (void *ignored) +{ + do_rseq_main_test (); +} + +static int +do_test (void) { if (!rseq_available ()) { @@ -62,21 +121,27 @@ do_rseq_test (void) printf ("info: getauxval (AT_RSEQ_FEATURE_SIZE): %ld\n", getauxval (AT_RSEQ_FEATURE_SIZE)); printf ("info: getauxval (AT_RSEQ_ALIGN): %ld\n", getauxval (AT_RSEQ_ALIGN)); + + puts ("info: checking main thread"); do_rseq_main_test (); + + puts ("info: checking main thread (2)"); + do_rseq_main_test (); + + puts ("info: checking new thread"); + xpthread_join (xpthread_create (NULL, thread_func, NULL)); + + puts ("info: checking subprocess"); + support_isolate_in_subprocess (proc_func, NULL); + + return 0; } #else /* RSEQ_SIG */ -static void -do_rseq_test (void) +static int +do_test (void) { FAIL_UNSUPPORTED ("glibc does not define RSEQ_SIG, skipping test"); } #endif /* RSEQ_SIG */ -static int -do_test (void) -{ - do_rseq_test (); - return 0; -} - #include diff --git a/sysdeps/unix/sysv/linux/tst-rseq.h b/sysdeps/unix/sysv/linux/tst-rseq.h index 15f512a983..812accc8f9 100644 --- a/sysdeps/unix/sysv/linux/tst-rseq.h +++ b/sysdeps/unix/sysv/linux/tst-rseq.h @@ -28,7 +28,7 @@ static inline bool rseq_thread_registered (void) { - return THREAD_GETMEM_VOLATILE (THREAD_SELF, rseq_area.cpu_id) >= 0; + return RSEQ_GETMEM_ONCE (cpu_id) >= 0; } static inline int