Merge patch series "acct: don't allow access to internal filesystems"
Christian Brauner <brauner@kernel.org> says: In [1] it was reported that the acct(2) system call can be used to trigger a NULL deref in cases where it is set to write to a file that triggers an internal lookup. This can e.g., happen when pointing acct(2) to /sys/power/resume. At the point the where the write to this file happens the calling task has already exited and called exit_fs() but an internal lookup might be triggered through lookup_bdev(). This may trigger a NULL-deref when accessing current->fs. This series does two things: - Reorganize the code so that the the final write happens from the workqueue but with the caller's credentials. This preserves the (strange) permission model and has almost no regression risk. - Block access to kernel internal filesystems as well as procfs and sysfs in the first place. This api should stop to exist imho. Link: https://lore.kernel.org/r/20250127091811.3183623-1-quzicheng@huawei.com [1] * patches from https://lore.kernel.org/r/20250211-work-acct-v1-0-1c16aecab8b3@kernel.org: acct: block access to kernel internal filesystems acct: perform last write from workqueue Link: https://lore.kernel.org/r/20250211-work-acct-v1-0-1c16aecab8b3@kernel.org Signed-off-by: Christian Brauner <brauner@kernel.org>
This commit is contained in:
commit
240189294f
1 changed files with 84 additions and 50 deletions
134
kernel/acct.c
134
kernel/acct.c
|
@ -103,48 +103,50 @@ struct bsd_acct_struct {
|
|||
atomic_long_t count;
|
||||
struct rcu_head rcu;
|
||||
struct mutex lock;
|
||||
int active;
|
||||
bool active;
|
||||
bool check_space;
|
||||
unsigned long needcheck;
|
||||
struct file *file;
|
||||
struct pid_namespace *ns;
|
||||
struct work_struct work;
|
||||
struct completion done;
|
||||
acct_t ac;
|
||||
};
|
||||
|
||||
static void do_acct_process(struct bsd_acct_struct *acct);
|
||||
static void fill_ac(struct bsd_acct_struct *acct);
|
||||
static void acct_write_process(struct bsd_acct_struct *acct);
|
||||
|
||||
/*
|
||||
* Check the amount of free space and suspend/resume accordingly.
|
||||
*/
|
||||
static int check_free_space(struct bsd_acct_struct *acct)
|
||||
static bool check_free_space(struct bsd_acct_struct *acct)
|
||||
{
|
||||
struct kstatfs sbuf;
|
||||
|
||||
if (time_is_after_jiffies(acct->needcheck))
|
||||
goto out;
|
||||
if (!acct->check_space)
|
||||
return acct->active;
|
||||
|
||||
/* May block */
|
||||
if (vfs_statfs(&acct->file->f_path, &sbuf))
|
||||
goto out;
|
||||
return acct->active;
|
||||
|
||||
if (acct->active) {
|
||||
u64 suspend = sbuf.f_blocks * SUSPEND;
|
||||
do_div(suspend, 100);
|
||||
if (sbuf.f_bavail <= suspend) {
|
||||
acct->active = 0;
|
||||
acct->active = false;
|
||||
pr_info("Process accounting paused\n");
|
||||
}
|
||||
} else {
|
||||
u64 resume = sbuf.f_blocks * RESUME;
|
||||
do_div(resume, 100);
|
||||
if (sbuf.f_bavail >= resume) {
|
||||
acct->active = 1;
|
||||
acct->active = true;
|
||||
pr_info("Process accounting resumed\n");
|
||||
}
|
||||
}
|
||||
|
||||
acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
|
||||
out:
|
||||
return acct->active;
|
||||
}
|
||||
|
||||
|
@ -189,7 +191,11 @@ static void acct_pin_kill(struct fs_pin *pin)
|
|||
{
|
||||
struct bsd_acct_struct *acct = to_acct(pin);
|
||||
mutex_lock(&acct->lock);
|
||||
do_acct_process(acct);
|
||||
/*
|
||||
* Fill the accounting struct with the exiting task's info
|
||||
* before punting to the workqueue.
|
||||
*/
|
||||
fill_ac(acct);
|
||||
schedule_work(&acct->work);
|
||||
wait_for_completion(&acct->done);
|
||||
cmpxchg(&acct->ns->bacct, pin, NULL);
|
||||
|
@ -202,6 +208,9 @@ static void close_work(struct work_struct *work)
|
|||
{
|
||||
struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work);
|
||||
struct file *file = acct->file;
|
||||
|
||||
/* We were fired by acct_pin_kill() which holds acct->lock. */
|
||||
acct_write_process(acct);
|
||||
if (file->f_op->flush)
|
||||
file->f_op->flush(file, NULL);
|
||||
__fput_sync(file);
|
||||
|
@ -234,6 +243,20 @@ static int acct_on(struct filename *pathname)
|
|||
return -EACCES;
|
||||
}
|
||||
|
||||
/* Exclude kernel kernel internal filesystems. */
|
||||
if (file_inode(file)->i_sb->s_flags & (SB_NOUSER | SB_KERNMOUNT)) {
|
||||
kfree(acct);
|
||||
filp_close(file, NULL);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/* Exclude procfs and sysfs. */
|
||||
if (file_inode(file)->i_sb->s_iflags & SB_I_USERNS_VISIBLE) {
|
||||
kfree(acct);
|
||||
filp_close(file, NULL);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (!(file->f_mode & FMODE_CAN_WRITE)) {
|
||||
kfree(acct);
|
||||
filp_close(file, NULL);
|
||||
|
@ -430,13 +453,27 @@ static u32 encode_float(u64 value)
|
|||
* do_exit() or when switching to a different output file.
|
||||
*/
|
||||
|
||||
static void fill_ac(acct_t *ac)
|
||||
static void fill_ac(struct bsd_acct_struct *acct)
|
||||
{
|
||||
struct pacct_struct *pacct = ¤t->signal->pacct;
|
||||
struct file *file = acct->file;
|
||||
acct_t *ac = &acct->ac;
|
||||
u64 elapsed, run_time;
|
||||
time64_t btime;
|
||||
struct tty_struct *tty;
|
||||
|
||||
lockdep_assert_held(&acct->lock);
|
||||
|
||||
if (time_is_after_jiffies(acct->needcheck)) {
|
||||
acct->check_space = false;
|
||||
|
||||
/* Don't fill in @ac if nothing will be written. */
|
||||
if (!acct->active)
|
||||
return;
|
||||
} else {
|
||||
acct->check_space = true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Fill the accounting struct with the needed info as recorded
|
||||
* by the different kernel functions.
|
||||
|
@ -484,64 +521,61 @@ static void fill_ac(acct_t *ac)
|
|||
ac->ac_majflt = encode_comp_t(pacct->ac_majflt);
|
||||
ac->ac_exitcode = pacct->ac_exitcode;
|
||||
spin_unlock_irq(¤t->sighand->siglock);
|
||||
}
|
||||
/*
|
||||
* do_acct_process does all actual work. Caller holds the reference to file.
|
||||
*/
|
||||
static void do_acct_process(struct bsd_acct_struct *acct)
|
||||
{
|
||||
acct_t ac;
|
||||
unsigned long flim;
|
||||
const struct cred *orig_cred;
|
||||
struct file *file = acct->file;
|
||||
|
||||
/*
|
||||
* Accounting records are not subject to resource limits.
|
||||
*/
|
||||
flim = rlimit(RLIMIT_FSIZE);
|
||||
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
|
||||
/* Perform file operations on behalf of whoever enabled accounting */
|
||||
orig_cred = override_creds(file->f_cred);
|
||||
|
||||
/*
|
||||
* First check to see if there is enough free_space to continue
|
||||
* the process accounting system.
|
||||
*/
|
||||
if (!check_free_space(acct))
|
||||
goto out;
|
||||
|
||||
fill_ac(&ac);
|
||||
/* we really need to bite the bullet and change layout */
|
||||
ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid);
|
||||
ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid);
|
||||
ac->ac_uid = from_kuid_munged(file->f_cred->user_ns, current_uid());
|
||||
ac->ac_gid = from_kgid_munged(file->f_cred->user_ns, current_gid());
|
||||
#if ACCT_VERSION == 1 || ACCT_VERSION == 2
|
||||
/* backward-compatible 16 bit fields */
|
||||
ac.ac_uid16 = ac.ac_uid;
|
||||
ac.ac_gid16 = ac.ac_gid;
|
||||
ac->ac_uid16 = ac->ac_uid;
|
||||
ac->ac_gid16 = ac->ac_gid;
|
||||
#elif ACCT_VERSION == 3
|
||||
{
|
||||
struct pid_namespace *ns = acct->ns;
|
||||
|
||||
ac.ac_pid = task_tgid_nr_ns(current, ns);
|
||||
ac->ac_pid = task_tgid_nr_ns(current, ns);
|
||||
rcu_read_lock();
|
||||
ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent),
|
||||
ns);
|
||||
ac->ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns);
|
||||
rcu_read_unlock();
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
static void acct_write_process(struct bsd_acct_struct *acct)
|
||||
{
|
||||
struct file *file = acct->file;
|
||||
const struct cred *cred;
|
||||
acct_t *ac = &acct->ac;
|
||||
|
||||
/* Perform file operations on behalf of whoever enabled accounting */
|
||||
cred = override_creds(file->f_cred);
|
||||
|
||||
/*
|
||||
* Get freeze protection. If the fs is frozen, just skip the write
|
||||
* as we could deadlock the system otherwise.
|
||||
* First check to see if there is enough free_space to continue
|
||||
* the process accounting system. Then get freeze protection. If
|
||||
* the fs is frozen, just skip the write as we could deadlock
|
||||
* the system otherwise.
|
||||
*/
|
||||
if (file_start_write_trylock(file)) {
|
||||
if (check_free_space(acct) && file_start_write_trylock(file)) {
|
||||
/* it's been opened O_APPEND, so position is irrelevant */
|
||||
loff_t pos = 0;
|
||||
__kernel_write(file, &ac, sizeof(acct_t), &pos);
|
||||
__kernel_write(file, ac, sizeof(acct_t), &pos);
|
||||
file_end_write(file);
|
||||
}
|
||||
out:
|
||||
|
||||
revert_creds(cred);
|
||||
}
|
||||
|
||||
static void do_acct_process(struct bsd_acct_struct *acct)
|
||||
{
|
||||
unsigned long flim;
|
||||
|
||||
/* Accounting records are not subject to resource limits. */
|
||||
flim = rlimit(RLIMIT_FSIZE);
|
||||
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
|
||||
fill_ac(acct);
|
||||
acct_write_process(acct);
|
||||
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
|
||||
revert_creds(orig_cred);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
Loading…
Add table
Reference in a new issue