1
0
Fork 0
mirror of synced 2025-03-06 20:59:54 +01:00

Merge patch series "acct: don't allow access to internal filesystems"

Christian Brauner <brauner@kernel.org> says:

In [1] it was reported that the acct(2) system call can be used to
trigger a NULL deref in cases where it is set to write to a file that
triggers an internal lookup.

This can e.g., happen when pointing acct(2) to /sys/power/resume. At the
point the where the write to this file happens the calling task has
already exited and called exit_fs() but an internal lookup might be
triggered through lookup_bdev(). This may trigger a NULL-deref
when accessing current->fs.

This series does two things:

- Reorganize the code so that the the final write happens from the
  workqueue but with the caller's credentials. This preserves the
  (strange) permission model and has almost no regression risk.

- Block access to kernel internal filesystems as well as procfs and
  sysfs in the first place.

This api should stop to exist imho.

Link: https://lore.kernel.org/r/20250127091811.3183623-1-quzicheng@huawei.com [1]

* patches from https://lore.kernel.org/r/20250211-work-acct-v1-0-1c16aecab8b3@kernel.org:
  acct: block access to kernel internal filesystems
  acct: perform last write from workqueue

Link: https://lore.kernel.org/r/20250211-work-acct-v1-0-1c16aecab8b3@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
This commit is contained in:
Christian Brauner 2025-02-12 12:24:28 +01:00
commit 240189294f
No known key found for this signature in database
GPG key ID: 91C61BC06578DCA2

View file

@ -103,48 +103,50 @@ struct bsd_acct_struct {
atomic_long_t count;
struct rcu_head rcu;
struct mutex lock;
int active;
bool active;
bool check_space;
unsigned long needcheck;
struct file *file;
struct pid_namespace *ns;
struct work_struct work;
struct completion done;
acct_t ac;
};
static void do_acct_process(struct bsd_acct_struct *acct);
static void fill_ac(struct bsd_acct_struct *acct);
static void acct_write_process(struct bsd_acct_struct *acct);
/*
* Check the amount of free space and suspend/resume accordingly.
*/
static int check_free_space(struct bsd_acct_struct *acct)
static bool check_free_space(struct bsd_acct_struct *acct)
{
struct kstatfs sbuf;
if (time_is_after_jiffies(acct->needcheck))
goto out;
if (!acct->check_space)
return acct->active;
/* May block */
if (vfs_statfs(&acct->file->f_path, &sbuf))
goto out;
return acct->active;
if (acct->active) {
u64 suspend = sbuf.f_blocks * SUSPEND;
do_div(suspend, 100);
if (sbuf.f_bavail <= suspend) {
acct->active = 0;
acct->active = false;
pr_info("Process accounting paused\n");
}
} else {
u64 resume = sbuf.f_blocks * RESUME;
do_div(resume, 100);
if (sbuf.f_bavail >= resume) {
acct->active = 1;
acct->active = true;
pr_info("Process accounting resumed\n");
}
}
acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
out:
return acct->active;
}
@ -189,7 +191,11 @@ static void acct_pin_kill(struct fs_pin *pin)
{
struct bsd_acct_struct *acct = to_acct(pin);
mutex_lock(&acct->lock);
do_acct_process(acct);
/*
* Fill the accounting struct with the exiting task's info
* before punting to the workqueue.
*/
fill_ac(acct);
schedule_work(&acct->work);
wait_for_completion(&acct->done);
cmpxchg(&acct->ns->bacct, pin, NULL);
@ -202,6 +208,9 @@ static void close_work(struct work_struct *work)
{
struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work);
struct file *file = acct->file;
/* We were fired by acct_pin_kill() which holds acct->lock. */
acct_write_process(acct);
if (file->f_op->flush)
file->f_op->flush(file, NULL);
__fput_sync(file);
@ -234,6 +243,20 @@ static int acct_on(struct filename *pathname)
return -EACCES;
}
/* Exclude kernel kernel internal filesystems. */
if (file_inode(file)->i_sb->s_flags & (SB_NOUSER | SB_KERNMOUNT)) {
kfree(acct);
filp_close(file, NULL);
return -EINVAL;
}
/* Exclude procfs and sysfs. */
if (file_inode(file)->i_sb->s_iflags & SB_I_USERNS_VISIBLE) {
kfree(acct);
filp_close(file, NULL);
return -EINVAL;
}
if (!(file->f_mode & FMODE_CAN_WRITE)) {
kfree(acct);
filp_close(file, NULL);
@ -430,13 +453,27 @@ static u32 encode_float(u64 value)
* do_exit() or when switching to a different output file.
*/
static void fill_ac(acct_t *ac)
static void fill_ac(struct bsd_acct_struct *acct)
{
struct pacct_struct *pacct = &current->signal->pacct;
struct file *file = acct->file;
acct_t *ac = &acct->ac;
u64 elapsed, run_time;
time64_t btime;
struct tty_struct *tty;
lockdep_assert_held(&acct->lock);
if (time_is_after_jiffies(acct->needcheck)) {
acct->check_space = false;
/* Don't fill in @ac if nothing will be written. */
if (!acct->active)
return;
} else {
acct->check_space = true;
}
/*
* Fill the accounting struct with the needed info as recorded
* by the different kernel functions.
@ -484,64 +521,61 @@ static void fill_ac(acct_t *ac)
ac->ac_majflt = encode_comp_t(pacct->ac_majflt);
ac->ac_exitcode = pacct->ac_exitcode;
spin_unlock_irq(&current->sighand->siglock);
}
/*
* do_acct_process does all actual work. Caller holds the reference to file.
*/
static void do_acct_process(struct bsd_acct_struct *acct)
{
acct_t ac;
unsigned long flim;
const struct cred *orig_cred;
struct file *file = acct->file;
/*
* Accounting records are not subject to resource limits.
*/
flim = rlimit(RLIMIT_FSIZE);
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
/* Perform file operations on behalf of whoever enabled accounting */
orig_cred = override_creds(file->f_cred);
/*
* First check to see if there is enough free_space to continue
* the process accounting system.
*/
if (!check_free_space(acct))
goto out;
fill_ac(&ac);
/* we really need to bite the bullet and change layout */
ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid);
ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid);
ac->ac_uid = from_kuid_munged(file->f_cred->user_ns, current_uid());
ac->ac_gid = from_kgid_munged(file->f_cred->user_ns, current_gid());
#if ACCT_VERSION == 1 || ACCT_VERSION == 2
/* backward-compatible 16 bit fields */
ac.ac_uid16 = ac.ac_uid;
ac.ac_gid16 = ac.ac_gid;
ac->ac_uid16 = ac->ac_uid;
ac->ac_gid16 = ac->ac_gid;
#elif ACCT_VERSION == 3
{
struct pid_namespace *ns = acct->ns;
ac.ac_pid = task_tgid_nr_ns(current, ns);
ac->ac_pid = task_tgid_nr_ns(current, ns);
rcu_read_lock();
ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent),
ns);
ac->ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns);
rcu_read_unlock();
}
#endif
}
static void acct_write_process(struct bsd_acct_struct *acct)
{
struct file *file = acct->file;
const struct cred *cred;
acct_t *ac = &acct->ac;
/* Perform file operations on behalf of whoever enabled accounting */
cred = override_creds(file->f_cred);
/*
* Get freeze protection. If the fs is frozen, just skip the write
* as we could deadlock the system otherwise.
* First check to see if there is enough free_space to continue
* the process accounting system. Then get freeze protection. If
* the fs is frozen, just skip the write as we could deadlock
* the system otherwise.
*/
if (file_start_write_trylock(file)) {
if (check_free_space(acct) && file_start_write_trylock(file)) {
/* it's been opened O_APPEND, so position is irrelevant */
loff_t pos = 0;
__kernel_write(file, &ac, sizeof(acct_t), &pos);
__kernel_write(file, ac, sizeof(acct_t), &pos);
file_end_write(file);
}
out:
revert_creds(cred);
}
static void do_acct_process(struct bsd_acct_struct *acct)
{
unsigned long flim;
/* Accounting records are not subject to resource limits. */
flim = rlimit(RLIMIT_FSIZE);
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
fill_ac(acct);
acct_write_process(acct);
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
revert_creds(orig_cred);
}
/**