\n
-----BEGIN PGP SIGNATURE----- iQEzBAABCAAdFiEEq1nRK9aeMoq1VSgcnJ2qBz9kQNkFAmePs7oACgkQnJ2qBz9k QNmHuAf9GkLnY5u1/81xP5V9ukZ4N2yeMW0dydLS5cjWj/St5ELeMAza3jeqtJtD j36vbnmy2c5pPaGLAK8BJpMXT/R2TkmmKD004zcfqF2S3SgbGzdgO1zMZzq9KJpM woRKZtLuglDajedsDEBBcKotBhlN2+C/sQlFuL1mX4zitk9ajr0qYUB1+JqOeg5f qwPsDLT077ADpxd7lVIMcm+OqbduP5KWkBKYHpn7lJcLe1eqVMMzceJroW42zhVG Dq8Iln26bbU9Wx6FSPFCUcHEzHRHUfXmu07HN9U0X++0QgWjrmBQQLooGFB/bR4a edBrPpVas6xE4/brjgFX3gOKtv8xYg== =ewDV -----END PGP SIGNATURE----- Merge tag 'fsnotify_hsm_for_v6.14-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs Pull fsnotify pre-content notification support from Jan Kara: "This introduces a new fsnotify event (FS_PRE_ACCESS) that gets generated before a file's contents are accessed. The event is synchronous, so if there is a listener for this event, the kernel waits for a reply. On success the execution continues as usual; on failure we propagate the error to userspace. This allows userspace to fill in file content on demand from slow storage. The context in which the events are generated has been picked so that we don't hold any locks and thus there's no risk of a deadlock for the userspace handler. The new pre-content event is available only for users with the global CAP_SYS_ADMIN capability (similarly to other parts of fanotify functionality) and it is the administrator's responsibility to make sure the userspace event handler doesn't do stupid stuff that can DoS the system. Based on your feedback from the last submission, the fsnotify code has been improved and now file->f_mode encodes whether a pre-content event needs to be generated for the file, so the fast path when nobody wants a pre-content event for the file just grows the additional file->f_mode check. As a bonus this also removes the checks whether the old FS_ACCESS event needs to be generated from the fast path. 
Also, the place where the event is generated during a page fault has been moved, so now filemap_fault() generates the event if and only if there is no uptodate folio in the page cache. We have also dropped the FS_PRE_MODIFY event, as current real-world users of the pre-content functionality don't really use it, so let's start with the minimal useful feature set" * tag 'fsnotify_hsm_for_v6.14-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs: (21 commits) fanotify: Fix crash in fanotify_init(2) fs: don't block write during exec on pre-content watched files fs: enable pre-content events on supported file systems ext4: add pre-content fsnotify hook for DAX faults btrfs: disable defrag on pre-content watched files xfs: add pre-content fsnotify hook for DAX faults fsnotify: generate pre-content permission event on page fault mm: don't allow huge faults for files with pre content watches fanotify: disable readahead if we have pre-content watches fanotify: allow to set errno in FAN_DENY permission response fanotify: report file range info with pre-content events fanotify: introduce FAN_PRE_ACCESS permission event fsnotify: generate pre-content permission event on truncate fsnotify: pass optional file access range in pre-content event fsnotify: introduce pre-content permission events fanotify: reserve event bit of deprecated FAN_DIR_MODIFY fanotify: rename a misnamed constant fanotify: don't skip extra event info if no info_mode is set fsnotify: check if file is actually being watched for pre-content events on open fsnotify: opt-in for permission events at file open time ...
This commit is contained in:
commit
8883957b3c
28 changed files with 669 additions and 106 deletions
|
@ -1257,7 +1257,7 @@ out_free_interp:
|
|||
}
|
||||
reloc_func_desc = interp_load_addr;
|
||||
|
||||
allow_write_access(interpreter);
|
||||
exe_file_allow_write_access(interpreter);
|
||||
fput(interpreter);
|
||||
|
||||
kfree(interp_elf_ex);
|
||||
|
@ -1354,7 +1354,7 @@ out_free_dentry:
|
|||
kfree(interp_elf_ex);
|
||||
kfree(interp_elf_phdata);
|
||||
out_free_file:
|
||||
allow_write_access(interpreter);
|
||||
exe_file_allow_write_access(interpreter);
|
||||
if (interpreter)
|
||||
fput(interpreter);
|
||||
out_free_ph:
|
||||
|
|
|
@ -394,7 +394,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
|
|||
goto error;
|
||||
}
|
||||
|
||||
allow_write_access(interpreter);
|
||||
exe_file_allow_write_access(interpreter);
|
||||
fput(interpreter);
|
||||
interpreter = NULL;
|
||||
}
|
||||
|
@ -467,7 +467,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
|
|||
|
||||
error:
|
||||
if (interpreter) {
|
||||
allow_write_access(interpreter);
|
||||
exe_file_allow_write_access(interpreter);
|
||||
fput(interpreter);
|
||||
}
|
||||
kfree(interpreter_name);
|
||||
|
|
|
@ -2544,6 +2544,15 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
|
|||
goto out;
|
||||
}
|
||||
|
||||
/*
|
||||
* Don't allow defrag on pre-content watched files, as it could
|
||||
* populate the page cache with 0's via readahead.
|
||||
*/
|
||||
if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) {
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (argp) {
|
||||
if (copy_from_user(&range, argp, sizeof(range))) {
|
||||
ret = -EFAULT;
|
||||
|
|
|
@ -961,7 +961,7 @@ static int btrfs_fill_super(struct super_block *sb,
|
|||
#endif
|
||||
sb->s_xattr = btrfs_xattr_handlers;
|
||||
sb->s_time_gran = 1;
|
||||
sb->s_iflags |= SB_I_CGROUPWB;
|
||||
sb->s_iflags |= SB_I_CGROUPWB | SB_I_ALLOW_HSM;
|
||||
|
||||
err = super_setup_bdi(sb);
|
||||
if (err) {
|
||||
|
|
|
@ -913,7 +913,7 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags)
|
|||
path_noexec(&file->f_path))
|
||||
return ERR_PTR(-EACCES);
|
||||
|
||||
err = deny_write_access(file);
|
||||
err = exe_file_deny_write_access(file);
|
||||
if (err)
|
||||
return ERR_PTR(err);
|
||||
|
||||
|
@ -928,7 +928,7 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags)
|
|||
* Returns ERR_PTR on failure or allocated struct file on success.
|
||||
*
|
||||
* As this is a wrapper for the internal do_open_execat(), callers
|
||||
* must call allow_write_access() before fput() on release. Also see
|
||||
* must call exe_file_allow_write_access() before fput() on release. Also see
|
||||
* do_close_execat().
|
||||
*/
|
||||
struct file *open_exec(const char *name)
|
||||
|
@ -1493,7 +1493,7 @@ static void do_close_execat(struct file *file)
|
|||
{
|
||||
if (!file)
|
||||
return;
|
||||
allow_write_access(file);
|
||||
exe_file_allow_write_access(file);
|
||||
fput(file);
|
||||
}
|
||||
|
||||
|
@ -1822,7 +1822,7 @@ static int exec_binprm(struct linux_binprm *bprm)
|
|||
bprm->file = bprm->interpreter;
|
||||
bprm->interpreter = NULL;
|
||||
|
||||
allow_write_access(exec);
|
||||
exe_file_allow_write_access(exec);
|
||||
if (unlikely(bprm->have_execfd)) {
|
||||
if (bprm->executable) {
|
||||
fput(exec);
|
||||
|
|
|
@ -756,6 +756,9 @@ retry:
|
|||
return VM_FAULT_SIGBUS;
|
||||
}
|
||||
} else {
|
||||
result = filemap_fsnotify_fault(vmf);
|
||||
if (unlikely(result))
|
||||
return result;
|
||||
filemap_invalidate_lock_shared(mapping);
|
||||
}
|
||||
result = dax_iomap_fault(vmf, order, &pfn, &error, &ext4_iomap_ops);
|
||||
|
|
|
@ -5301,6 +5301,9 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
|
|||
/* i_version is always enabled now */
|
||||
sb->s_flags |= SB_I_VERSION;
|
||||
|
||||
/* HSM events are allowed by default. */
|
||||
sb->s_iflags |= SB_I_ALLOW_HSM;
|
||||
|
||||
err = ext4_check_feature_compatibility(sb, es, silent);
|
||||
if (err)
|
||||
goto failed_mount;
|
||||
|
|
|
@ -1158,10 +1158,10 @@ static int __init fcntl_init(void)
|
|||
* Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
|
||||
* is defined as O_NONBLOCK on some platforms and not on others.
|
||||
*/
|
||||
BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ !=
|
||||
BUILD_BUG_ON(20 - 1 /* for O_RDONLY being 0 */ !=
|
||||
HWEIGHT32(
|
||||
(VALID_OPEN_FLAGS & ~(O_NONBLOCK | O_NDELAY)) |
|
||||
__FMODE_EXEC | __FMODE_NONOTIFY));
|
||||
__FMODE_EXEC));
|
||||
|
||||
fasync_cache = kmem_cache_create("fasync_cache",
|
||||
sizeof(struct fasync_struct), 0,
|
||||
|
|
|
@ -223,7 +223,7 @@ static int fanotify_get_response(struct fsnotify_group *group,
|
|||
struct fanotify_perm_event *event,
|
||||
struct fsnotify_iter_info *iter_info)
|
||||
{
|
||||
int ret;
|
||||
int ret, errno;
|
||||
|
||||
pr_debug("%s: group=%p event=%p\n", __func__, group, event);
|
||||
|
||||
|
@ -262,14 +262,23 @@ static int fanotify_get_response(struct fsnotify_group *group,
|
|||
ret = 0;
|
||||
break;
|
||||
case FAN_DENY:
|
||||
/* Check custom errno from pre-content events */
|
||||
errno = fanotify_get_response_errno(event->response);
|
||||
if (errno) {
|
||||
ret = -errno;
|
||||
break;
|
||||
}
|
||||
fallthrough;
|
||||
default:
|
||||
ret = -EPERM;
|
||||
}
|
||||
|
||||
/* Check if the response should be audited */
|
||||
if (event->response & FAN_AUDIT)
|
||||
audit_fanotify(event->response & ~FAN_AUDIT,
|
||||
&event->audit_rule);
|
||||
if (event->response & FAN_AUDIT) {
|
||||
u32 response = event->response &
|
||||
(FANOTIFY_RESPONSE_ACCESS | FANOTIFY_RESPONSE_FLAGS);
|
||||
audit_fanotify(response & ~FAN_AUDIT, &event->audit_rule);
|
||||
}
|
||||
|
||||
pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__,
|
||||
group, event, ret);
|
||||
|
@ -548,9 +557,13 @@ static struct fanotify_event *fanotify_alloc_path_event(const struct path *path,
|
|||
return &pevent->fae;
|
||||
}
|
||||
|
||||
static struct fanotify_event *fanotify_alloc_perm_event(const struct path *path,
|
||||
static struct fanotify_event *fanotify_alloc_perm_event(const void *data,
|
||||
int data_type,
|
||||
gfp_t gfp)
|
||||
{
|
||||
const struct path *path = fsnotify_data_path(data, data_type);
|
||||
const struct file_range *range =
|
||||
fsnotify_data_file_range(data, data_type);
|
||||
struct fanotify_perm_event *pevent;
|
||||
|
||||
pevent = kmem_cache_alloc(fanotify_perm_event_cachep, gfp);
|
||||
|
@ -564,6 +577,9 @@ static struct fanotify_event *fanotify_alloc_perm_event(const struct path *path,
|
|||
pevent->hdr.len = 0;
|
||||
pevent->state = FAN_EVENT_INIT;
|
||||
pevent->path = *path;
|
||||
/* NULL ppos means no range info */
|
||||
pevent->ppos = range ? &range->pos : NULL;
|
||||
pevent->count = range ? range->count : 0;
|
||||
path_get(path);
|
||||
|
||||
return &pevent->fae;
|
||||
|
@ -801,7 +817,7 @@ static struct fanotify_event *fanotify_alloc_event(
|
|||
old_memcg = set_active_memcg(group->memcg);
|
||||
|
||||
if (fanotify_is_perm_event(mask)) {
|
||||
event = fanotify_alloc_perm_event(path, gfp);
|
||||
event = fanotify_alloc_perm_event(data, data_type, gfp);
|
||||
} else if (fanotify_is_error_event(mask)) {
|
||||
event = fanotify_alloc_error_event(group, fsid, data,
|
||||
data_type, &hash);
|
||||
|
@ -909,8 +925,9 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask,
|
|||
BUILD_BUG_ON(FAN_OPEN_EXEC_PERM != FS_OPEN_EXEC_PERM);
|
||||
BUILD_BUG_ON(FAN_FS_ERROR != FS_ERROR);
|
||||
BUILD_BUG_ON(FAN_RENAME != FS_RENAME);
|
||||
BUILD_BUG_ON(FAN_PRE_ACCESS != FS_PRE_ACCESS);
|
||||
|
||||
BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 21);
|
||||
BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 22);
|
||||
|
||||
mask = fanotify_group_event_mask(group, iter_info, &match_mask,
|
||||
mask, data, data_type, dir);
|
||||
|
|
|
@ -425,6 +425,8 @@ FANOTIFY_PE(struct fanotify_event *event)
|
|||
struct fanotify_perm_event {
|
||||
struct fanotify_event fae;
|
||||
struct path path;
|
||||
const loff_t *ppos; /* optional file range info */
|
||||
size_t count;
|
||||
u32 response; /* userspace answer to the event */
|
||||
unsigned short state; /* state of the event */
|
||||
int fd; /* fd we passed to userspace for this event */
|
||||
|
@ -446,6 +448,14 @@ static inline bool fanotify_is_perm_event(u32 mask)
|
|||
mask & FANOTIFY_PERM_EVENTS;
|
||||
}
|
||||
|
||||
static inline bool fanotify_event_has_access_range(struct fanotify_event *event)
|
||||
{
|
||||
if (!(event->mask & FANOTIFY_PRE_CONTENT_EVENTS))
|
||||
return false;
|
||||
|
||||
return FANOTIFY_PERM(event)->ppos;
|
||||
}
|
||||
|
||||
static inline struct fanotify_event *FANOTIFY_E(struct fsnotify_event *fse)
|
||||
{
|
||||
return container_of(fse, struct fanotify_event, fse);
|
||||
|
@ -518,3 +528,8 @@ static inline unsigned int fanotify_mark_user_flags(struct fsnotify_mark *mark)
|
|||
|
||||
return mflags;
|
||||
}
|
||||
|
||||
static inline u32 fanotify_get_response_errno(int res)
|
||||
{
|
||||
return (res >> FAN_ERRNO_SHIFT) & FAN_ERRNO_MASK;
|
||||
}
|
||||
|
|
|
@ -100,8 +100,7 @@ static void __init fanotify_sysctls_init(void)
|
|||
*
|
||||
* Internal and external open flags are stored together in field f_flags of
|
||||
* struct file. Only external open flags shall be allowed in event_f_flags.
|
||||
* Internal flags like FMODE_NONOTIFY, FMODE_EXEC, FMODE_NOCMTIME shall be
|
||||
* excluded.
|
||||
* Internal flags like FMODE_EXEC shall be excluded.
|
||||
*/
|
||||
#define FANOTIFY_INIT_ALL_EVENT_F_BITS ( \
|
||||
O_ACCMODE | O_APPEND | O_NONBLOCK | \
|
||||
|
@ -118,10 +117,12 @@ struct kmem_cache *fanotify_perm_event_cachep __ro_after_init;
|
|||
#define FANOTIFY_EVENT_ALIGN 4
|
||||
#define FANOTIFY_FID_INFO_HDR_LEN \
|
||||
(sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle))
|
||||
#define FANOTIFY_PIDFD_INFO_HDR_LEN \
|
||||
#define FANOTIFY_PIDFD_INFO_LEN \
|
||||
sizeof(struct fanotify_event_info_pidfd)
|
||||
#define FANOTIFY_ERROR_INFO_LEN \
|
||||
(sizeof(struct fanotify_event_info_error))
|
||||
#define FANOTIFY_RANGE_INFO_LEN \
|
||||
(sizeof(struct fanotify_event_info_range))
|
||||
|
||||
static int fanotify_fid_info_len(int fh_len, int name_len)
|
||||
{
|
||||
|
@ -159,9 +160,6 @@ static size_t fanotify_event_len(unsigned int info_mode,
|
|||
int fh_len;
|
||||
int dot_len = 0;
|
||||
|
||||
if (!info_mode)
|
||||
return event_len;
|
||||
|
||||
if (fanotify_is_error_event(event->mask))
|
||||
event_len += FANOTIFY_ERROR_INFO_LEN;
|
||||
|
||||
|
@ -176,14 +174,17 @@ static size_t fanotify_event_len(unsigned int info_mode,
|
|||
dot_len = 1;
|
||||
}
|
||||
|
||||
if (info_mode & FAN_REPORT_PIDFD)
|
||||
event_len += FANOTIFY_PIDFD_INFO_HDR_LEN;
|
||||
|
||||
if (fanotify_event_has_object_fh(event)) {
|
||||
fh_len = fanotify_event_object_fh_len(event);
|
||||
event_len += fanotify_fid_info_len(fh_len, dot_len);
|
||||
}
|
||||
|
||||
if (info_mode & FAN_REPORT_PIDFD)
|
||||
event_len += FANOTIFY_PIDFD_INFO_LEN;
|
||||
|
||||
if (fanotify_event_has_access_range(event))
|
||||
event_len += FANOTIFY_RANGE_INFO_LEN;
|
||||
|
||||
return event_len;
|
||||
}
|
||||
|
||||
|
@ -258,12 +259,11 @@ static int create_fd(struct fsnotify_group *group, const struct path *path,
|
|||
return client_fd;
|
||||
|
||||
/*
|
||||
* we need a new file handle for the userspace program so it can read even if it was
|
||||
* originally opened O_WRONLY.
|
||||
* We provide an fd for the userspace program, so it could access the
|
||||
* file without generating fanotify events itself.
|
||||
*/
|
||||
new_file = dentry_open(path,
|
||||
group->fanotify_data.f_flags | __FMODE_NONOTIFY,
|
||||
current_cred());
|
||||
new_file = dentry_open_nonotify(path, group->fanotify_data.f_flags,
|
||||
current_cred());
|
||||
if (IS_ERR(new_file)) {
|
||||
put_unused_fd(client_fd);
|
||||
client_fd = PTR_ERR(new_file);
|
||||
|
@ -327,11 +327,12 @@ static int process_access_response(struct fsnotify_group *group,
|
|||
struct fanotify_perm_event *event;
|
||||
int fd = response_struct->fd;
|
||||
u32 response = response_struct->response;
|
||||
int errno = fanotify_get_response_errno(response);
|
||||
int ret = info_len;
|
||||
struct fanotify_response_info_audit_rule friar;
|
||||
|
||||
pr_debug("%s: group=%p fd=%d response=%u buf=%p size=%zu\n", __func__,
|
||||
group, fd, response, info, info_len);
|
||||
pr_debug("%s: group=%p fd=%d response=%x errno=%d buf=%p size=%zu\n",
|
||||
__func__, group, fd, response, errno, info, info_len);
|
||||
/*
|
||||
* make sure the response is valid, if invalid we do nothing and either
|
||||
* userspace can send a valid response or we will clean it up after the
|
||||
|
@ -342,7 +343,31 @@ static int process_access_response(struct fsnotify_group *group,
|
|||
|
||||
switch (response & FANOTIFY_RESPONSE_ACCESS) {
|
||||
case FAN_ALLOW:
|
||||
if (errno)
|
||||
return -EINVAL;
|
||||
break;
|
||||
case FAN_DENY:
|
||||
/* Custom errno is supported only for pre-content groups */
|
||||
if (errno && group->priority != FSNOTIFY_PRIO_PRE_CONTENT)
|
||||
return -EINVAL;
|
||||
|
||||
/*
|
||||
* Limit errno to values expected on open(2)/read(2)/write(2)
|
||||
* of regular files.
|
||||
*/
|
||||
switch (errno) {
|
||||
case 0:
|
||||
case EIO:
|
||||
case EPERM:
|
||||
case EBUSY:
|
||||
case ETXTBSY:
|
||||
case EAGAIN:
|
||||
case ENOSPC:
|
||||
case EDQUOT:
|
||||
break;
|
||||
default:
|
||||
return -EINVAL;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
return -EINVAL;
|
||||
|
@ -506,7 +531,7 @@ static int copy_pidfd_info_to_user(int pidfd,
|
|||
size_t count)
|
||||
{
|
||||
struct fanotify_event_info_pidfd info = { };
|
||||
size_t info_len = FANOTIFY_PIDFD_INFO_HDR_LEN;
|
||||
size_t info_len = FANOTIFY_PIDFD_INFO_LEN;
|
||||
|
||||
if (WARN_ON_ONCE(info_len > count))
|
||||
return -EFAULT;
|
||||
|
@ -521,6 +546,30 @@ static int copy_pidfd_info_to_user(int pidfd,
|
|||
return info_len;
|
||||
}
|
||||
|
||||
static size_t copy_range_info_to_user(struct fanotify_event *event,
|
||||
char __user *buf, int count)
|
||||
{
|
||||
struct fanotify_perm_event *pevent = FANOTIFY_PERM(event);
|
||||
struct fanotify_event_info_range info = { };
|
||||
size_t info_len = FANOTIFY_RANGE_INFO_LEN;
|
||||
|
||||
if (WARN_ON_ONCE(info_len > count))
|
||||
return -EFAULT;
|
||||
|
||||
if (WARN_ON_ONCE(!pevent->ppos))
|
||||
return -EINVAL;
|
||||
|
||||
info.hdr.info_type = FAN_EVENT_INFO_TYPE_RANGE;
|
||||
info.hdr.len = info_len;
|
||||
info.offset = *(pevent->ppos);
|
||||
info.count = pevent->count;
|
||||
|
||||
if (copy_to_user(buf, &info, info_len))
|
||||
return -EFAULT;
|
||||
|
||||
return info_len;
|
||||
}
|
||||
|
||||
static int copy_info_records_to_user(struct fanotify_event *event,
|
||||
struct fanotify_info *info,
|
||||
unsigned int info_mode, int pidfd,
|
||||
|
@ -642,6 +691,15 @@ static int copy_info_records_to_user(struct fanotify_event *event,
|
|||
total_bytes += ret;
|
||||
}
|
||||
|
||||
if (fanotify_event_has_access_range(event)) {
|
||||
ret = copy_range_info_to_user(event, buf, count);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
buf += ret;
|
||||
count -= ret;
|
||||
total_bytes += ret;
|
||||
}
|
||||
|
||||
return total_bytes;
|
||||
}
|
||||
|
||||
|
@ -756,12 +814,10 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
|
|||
buf += FAN_EVENT_METADATA_LEN;
|
||||
count -= FAN_EVENT_METADATA_LEN;
|
||||
|
||||
if (info_mode) {
|
||||
ret = copy_info_records_to_user(event, info, info_mode, pidfd,
|
||||
buf, count);
|
||||
if (ret < 0)
|
||||
goto out_close_fd;
|
||||
}
|
||||
ret = copy_info_records_to_user(event, info, info_mode, pidfd,
|
||||
buf, count);
|
||||
if (ret < 0)
|
||||
goto out_close_fd;
|
||||
|
||||
if (f)
|
||||
fd_install(fd, f);
|
||||
|
@ -1294,7 +1350,7 @@ static int fanotify_group_init_error_pool(struct fsnotify_group *group)
|
|||
}
|
||||
|
||||
static int fanotify_may_update_existing_mark(struct fsnotify_mark *fsn_mark,
|
||||
unsigned int fan_flags)
|
||||
__u32 mask, unsigned int fan_flags)
|
||||
{
|
||||
/*
|
||||
* Non evictable mark cannot be downgraded to evictable mark.
|
||||
|
@ -1321,6 +1377,11 @@ static int fanotify_may_update_existing_mark(struct fsnotify_mark *fsn_mark,
|
|||
fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)
|
||||
return -EEXIST;
|
||||
|
||||
/* For now pre-content events are not generated for directories */
|
||||
mask |= fsn_mark->mask;
|
||||
if (mask & FANOTIFY_PRE_CONTENT_EVENTS && mask & FAN_ONDIR)
|
||||
return -EEXIST;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -1347,7 +1408,7 @@ static int fanotify_add_mark(struct fsnotify_group *group,
|
|||
/*
|
||||
* Check if requested mark flags conflict with an existing mark flags.
|
||||
*/
|
||||
ret = fanotify_may_update_existing_mark(fsn_mark, fan_flags);
|
||||
ret = fanotify_may_update_existing_mark(fsn_mark, mask, fan_flags);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
|
@ -1409,6 +1470,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
|
|||
unsigned int fid_mode = flags & FANOTIFY_FID_BITS;
|
||||
unsigned int class = flags & FANOTIFY_CLASS_BITS;
|
||||
unsigned int internal_flags = 0;
|
||||
struct file *file;
|
||||
|
||||
pr_debug("%s: flags=%x event_f_flags=%x\n",
|
||||
__func__, flags, event_f_flags);
|
||||
|
@ -1477,7 +1539,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
|
|||
(!(fid_mode & FAN_REPORT_NAME) || !(fid_mode & FAN_REPORT_FID)))
|
||||
return -EINVAL;
|
||||
|
||||
f_flags = O_RDWR | __FMODE_NONOTIFY;
|
||||
f_flags = O_RDWR;
|
||||
if (flags & FAN_CLOEXEC)
|
||||
f_flags |= O_CLOEXEC;
|
||||
if (flags & FAN_NONBLOCK)
|
||||
|
@ -1555,10 +1617,18 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
|
|||
goto out_destroy_group;
|
||||
}
|
||||
|
||||
fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags);
|
||||
fd = get_unused_fd_flags(f_flags);
|
||||
if (fd < 0)
|
||||
goto out_destroy_group;
|
||||
|
||||
file = anon_inode_getfile_fmode("[fanotify]", &fanotify_fops, group,
|
||||
f_flags, FMODE_NONOTIFY);
|
||||
if (IS_ERR(file)) {
|
||||
put_unused_fd(fd);
|
||||
fd = PTR_ERR(file);
|
||||
goto out_destroy_group;
|
||||
}
|
||||
fd_install(fd, file);
|
||||
return fd;
|
||||
|
||||
out_destroy_group:
|
||||
|
@ -1638,11 +1708,23 @@ static int fanotify_events_supported(struct fsnotify_group *group,
|
|||
unsigned int flags)
|
||||
{
|
||||
unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
|
||||
bool is_dir = d_is_dir(path->dentry);
|
||||
/* Strict validation of events in non-dir inode mask with v5.17+ APIs */
|
||||
bool strict_dir_events = FAN_GROUP_FLAG(group, FAN_REPORT_TARGET_FID) ||
|
||||
(mask & FAN_RENAME) ||
|
||||
(flags & FAN_MARK_IGNORE);
|
||||
|
||||
/*
|
||||
* Filesystems need to opt into pre-content events (a.k.a. HSM)
|
||||
* and they are only supported on regular files and directories.
|
||||
*/
|
||||
if (mask & FANOTIFY_PRE_CONTENT_EVENTS) {
|
||||
if (!(path->mnt->mnt_sb->s_iflags & SB_I_ALLOW_HSM))
|
||||
return -EOPNOTSUPP;
|
||||
if (!is_dir && !d_is_reg(path->dentry))
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Some filesystems such as 'proc' acquire unusual locks when opening
|
||||
* files. For them fanotify permission events have high chances of
|
||||
|
@ -1675,7 +1757,7 @@ static int fanotify_events_supported(struct fsnotify_group *group,
|
|||
* but because we always allowed it, error only when using new APIs.
|
||||
*/
|
||||
if (strict_dir_events && mark_type == FAN_MARK_INODE &&
|
||||
!d_is_dir(path->dentry) && (mask & FANOTIFY_DIRONLY_EVENT_BITS))
|
||||
!is_dir && (mask & FANOTIFY_DIRONLY_EVENT_BITS))
|
||||
return -ENOTDIR;
|
||||
|
||||
return 0;
|
||||
|
@ -1776,10 +1858,14 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
|
|||
return -EPERM;
|
||||
|
||||
/*
|
||||
* Permission events require minimum priority FAN_CLASS_CONTENT.
|
||||
* Permission events are not allowed for FAN_CLASS_NOTIF.
|
||||
* Pre-content permission events are not allowed for FAN_CLASS_CONTENT.
|
||||
*/
|
||||
if (mask & FANOTIFY_PERM_EVENTS &&
|
||||
group->priority < FSNOTIFY_PRIO_CONTENT)
|
||||
group->priority == FSNOTIFY_PRIO_NORMAL)
|
||||
return -EINVAL;
|
||||
else if (mask & FANOTIFY_PRE_CONTENT_EVENTS &&
|
||||
group->priority == FSNOTIFY_PRIO_CONTENT)
|
||||
return -EINVAL;
|
||||
|
||||
if (mask & FAN_FS_ERROR &&
|
||||
|
@ -1814,6 +1900,10 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
|
|||
if (mask & FAN_RENAME && !(fid_mode & FAN_REPORT_NAME))
|
||||
return -EINVAL;
|
||||
|
||||
/* Pre-content events are not currently generated for directories. */
|
||||
if (mask & FANOTIFY_PRE_CONTENT_EVENTS && mask & FAN_ONDIR)
|
||||
return -EINVAL;
|
||||
|
||||
if (mark_cmd == FAN_MARK_FLUSH) {
|
||||
if (mark_type == FAN_MARK_MOUNT)
|
||||
fsnotify_clear_vfsmount_marks_by_group(group);
|
||||
|
|
|
@ -193,7 +193,7 @@ static bool fsnotify_event_needs_parent(struct inode *inode, __u32 mnt_mask,
|
|||
return mask & marks_mask;
|
||||
}
|
||||
|
||||
/* Are there any inode/mount/sb objects that are interested in this event? */
|
||||
/* Are there any inode/mount/sb objects that watch for these events? */
|
||||
static inline bool fsnotify_object_watched(struct inode *inode, __u32 mnt_mask,
|
||||
__u32 mask)
|
||||
{
|
||||
|
@ -203,6 +203,24 @@ static inline bool fsnotify_object_watched(struct inode *inode, __u32 mnt_mask,
|
|||
return mask & marks_mask & ALL_FSNOTIFY_EVENTS;
|
||||
}
|
||||
|
||||
/* Report pre-content event with optional range info */
|
||||
int fsnotify_pre_content(const struct path *path, const loff_t *ppos,
|
||||
size_t count)
|
||||
{
|
||||
struct file_range range;
|
||||
|
||||
/* Report page aligned range only when pos is known */
|
||||
if (!ppos)
|
||||
return fsnotify_path(path, FS_PRE_ACCESS);
|
||||
|
||||
range.path = path;
|
||||
range.pos = PAGE_ALIGN_DOWN(*ppos);
|
||||
range.count = PAGE_ALIGN(*ppos + count) - range.pos;
|
||||
|
||||
return fsnotify_parent(path->dentry, FS_PRE_ACCESS, &range,
|
||||
FSNOTIFY_EVENT_FILE_RANGE);
|
||||
}
|
||||
|
||||
/*
|
||||
* Notify this dentry's parent about a child's events with child name info
|
||||
* if parent is watching or if inode/sb/mount are interested in events with
|
||||
|
@ -623,11 +641,72 @@ out:
|
|||
}
|
||||
EXPORT_SYMBOL_GPL(fsnotify);
|
||||
|
||||
#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
|
||||
/*
|
||||
* At open time we check fsnotify_sb_has_priority_watchers() and set the
|
||||
* FMODE_NONOTIFY_ mode bits accordignly.
|
||||
* Later, fsnotify permission hooks do not check if there are permission event
|
||||
* watches, but that there were permission event watches at open time.
|
||||
*/
|
||||
void file_set_fsnotify_mode(struct file *file)
|
||||
{
|
||||
struct dentry *dentry = file->f_path.dentry, *parent;
|
||||
struct super_block *sb = dentry->d_sb;
|
||||
__u32 mnt_mask, p_mask;
|
||||
|
||||
/* Is it a file opened by fanotify? */
|
||||
if (FMODE_FSNOTIFY_NONE(file->f_mode))
|
||||
return;
|
||||
|
||||
/*
|
||||
* Permission events is a super set of pre-content events, so if there
|
||||
* are no permission event watchers, there are also no pre-content event
|
||||
* watchers and this is implied from the single FMODE_NONOTIFY_PERM bit.
|
||||
*/
|
||||
if (likely(!fsnotify_sb_has_priority_watchers(sb,
|
||||
FSNOTIFY_PRIO_CONTENT))) {
|
||||
file->f_mode |= FMODE_NONOTIFY_PERM;
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* If there are permission event watchers but no pre-content event
|
||||
* watchers, set FMODE_NONOTIFY | FMODE_NONOTIFY_PERM to indicate that.
|
||||
*/
|
||||
if ((!d_is_dir(dentry) && !d_is_reg(dentry)) ||
|
||||
likely(!fsnotify_sb_has_priority_watchers(sb,
|
||||
FSNOTIFY_PRIO_PRE_CONTENT))) {
|
||||
file->f_mode |= FMODE_NONOTIFY | FMODE_NONOTIFY_PERM;
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* OK, there are some pre-content watchers. Check if anybody is
|
||||
* watching for pre-content events on *this* file.
|
||||
*/
|
||||
mnt_mask = READ_ONCE(real_mount(file->f_path.mnt)->mnt_fsnotify_mask);
|
||||
if (unlikely(fsnotify_object_watched(d_inode(dentry), mnt_mask,
|
||||
FSNOTIFY_PRE_CONTENT_EVENTS)))
|
||||
return;
|
||||
|
||||
/* Is parent watching for pre-content events on this file? */
|
||||
if (dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED) {
|
||||
parent = dget_parent(dentry);
|
||||
p_mask = fsnotify_inode_watches_children(d_inode(parent));
|
||||
dput(parent);
|
||||
if (p_mask & FSNOTIFY_PRE_CONTENT_EVENTS)
|
||||
return;
|
||||
}
|
||||
/* Nobody watching for pre-content events from this file */
|
||||
file->f_mode |= FMODE_NONOTIFY | FMODE_NONOTIFY_PERM;
|
||||
}
|
||||
#endif
|
||||
|
||||
static __init int fsnotify_init(void)
|
||||
{
|
||||
int ret;
|
||||
|
||||
BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 23);
|
||||
BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 24);
|
||||
|
||||
ret = init_srcu_struct(&fsnotify_mark_srcu);
|
||||
if (ret)
|
||||
|
|
62
fs/open.c
62
fs/open.c
|
@ -81,14 +81,18 @@ long vfs_truncate(const struct path *path, loff_t length)
|
|||
if (!S_ISREG(inode->i_mode))
|
||||
return -EINVAL;
|
||||
|
||||
error = mnt_want_write(path->mnt);
|
||||
if (error)
|
||||
goto out;
|
||||
|
||||
idmap = mnt_idmap(path->mnt);
|
||||
error = inode_permission(idmap, inode, MAY_WRITE);
|
||||
if (error)
|
||||
goto mnt_drop_write_and_out;
|
||||
return error;
|
||||
|
||||
error = fsnotify_truncate_perm(path, length);
|
||||
if (error)
|
||||
return error;
|
||||
|
||||
error = mnt_want_write(path->mnt);
|
||||
if (error)
|
||||
return error;
|
||||
|
||||
error = -EPERM;
|
||||
if (IS_APPEND(inode))
|
||||
|
@ -114,7 +118,7 @@ put_write_and_out:
|
|||
put_write_access(inode);
|
||||
mnt_drop_write_and_out:
|
||||
mnt_drop_write(path->mnt);
|
||||
out:
|
||||
|
||||
return error;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(vfs_truncate);
|
||||
|
@ -175,11 +179,18 @@ long do_ftruncate(struct file *file, loff_t length, int small)
|
|||
/* Check IS_APPEND on real upper inode */
|
||||
if (IS_APPEND(file_inode(file)))
|
||||
return -EPERM;
|
||||
sb_start_write(inode->i_sb);
|
||||
|
||||
error = security_file_truncate(file);
|
||||
if (!error)
|
||||
error = do_truncate(file_mnt_idmap(file), dentry, length,
|
||||
ATTR_MTIME | ATTR_CTIME, file);
|
||||
if (error)
|
||||
return error;
|
||||
|
||||
error = fsnotify_truncate_perm(&file->f_path, length);
|
||||
if (error)
|
||||
return error;
|
||||
|
||||
sb_start_write(inode->i_sb);
|
||||
error = do_truncate(file_mnt_idmap(file), dentry, length,
|
||||
ATTR_MTIME | ATTR_CTIME, file);
|
||||
sb_end_write(inode->i_sb);
|
||||
|
||||
return error;
|
||||
|
@ -894,7 +905,7 @@ static int do_dentry_open(struct file *f,
|
|||
f->f_sb_err = file_sample_sb_err(f);
|
||||
|
||||
if (unlikely(f->f_flags & O_PATH)) {
|
||||
f->f_mode = FMODE_PATH | FMODE_OPENED;
|
||||
f->f_mode = FMODE_PATH | FMODE_OPENED | FMODE_NONOTIFY;
|
||||
f->f_op = &empty_fops;
|
||||
return 0;
|
||||
}
|
||||
|
@ -922,6 +933,12 @@ static int do_dentry_open(struct file *f,
|
|||
if (error)
|
||||
goto cleanup_all;
|
||||
|
||||
/*
|
||||
* Set FMODE_NONOTIFY_* bits according to existing permission watches.
|
||||
* If FMODE_NONOTIFY was already set for an fanotify fd, this doesn't
|
||||
* change anything.
|
||||
*/
|
||||
file_set_fsnotify_mode(f);
|
||||
error = fsnotify_open_perm(f);
|
||||
if (error)
|
||||
goto cleanup_all;
|
||||
|
@ -1098,6 +1115,23 @@ struct file *dentry_open(const struct path *path, int flags,
|
|||
}
|
||||
EXPORT_SYMBOL(dentry_open);
|
||||
|
||||
struct file *dentry_open_nonotify(const struct path *path, int flags,
|
||||
const struct cred *cred)
|
||||
{
|
||||
struct file *f = alloc_empty_file(flags, cred);
|
||||
if (!IS_ERR(f)) {
|
||||
int error;
|
||||
|
||||
f->f_mode |= FMODE_NONOTIFY;
|
||||
error = vfs_open(path, f);
|
||||
if (error) {
|
||||
fput(f);
|
||||
f = ERR_PTR(error);
|
||||
}
|
||||
}
|
||||
return f;
|
||||
}
|
||||
|
||||
/**
|
||||
* dentry_create - Create and open a file
|
||||
* @path: path to create
|
||||
|
@ -1195,7 +1229,7 @@ inline struct open_how build_open_how(int flags, umode_t mode)
|
|||
inline int build_open_flags(const struct open_how *how, struct open_flags *op)
|
||||
{
|
||||
u64 flags = how->flags;
|
||||
u64 strip = __FMODE_NONOTIFY | O_CLOEXEC;
|
||||
u64 strip = O_CLOEXEC;
|
||||
int lookup_flags = 0;
|
||||
int acc_mode = ACC_MODE(flags);
|
||||
|
||||
|
@ -1203,9 +1237,7 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op)
|
|||
"struct open_flags doesn't yet handle flags > 32 bits");
|
||||
|
||||
/*
|
||||
* Strip flags that either shouldn't be set by userspace like
|
||||
* FMODE_NONOTIFY or that aren't relevant in determining struct
|
||||
* open_flags like O_CLOEXEC.
|
||||
* Strip flags that aren't relevant in determining struct open_flags.
|
||||
*/
|
||||
flags &= ~strip;
|
||||
|
||||
|
|
|
@ -1451,6 +1451,9 @@ xfs_dax_read_fault(
|
|||
|
||||
trace_xfs_read_fault(ip, order);
|
||||
|
||||
ret = filemap_fsnotify_fault(vmf);
|
||||
if (unlikely(ret))
|
||||
return ret;
|
||||
xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
|
||||
ret = xfs_dax_fault_locked(vmf, order, false);
|
||||
xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
|
||||
|
@ -1479,6 +1482,16 @@ xfs_write_fault(
|
|||
vm_fault_t ret;
|
||||
|
||||
trace_xfs_write_fault(ip, order);
|
||||
/*
|
||||
* Usually we get here from ->page_mkwrite callback but in case of DAX
|
||||
* we will get here also for ordinary write fault. Handle HSM
|
||||
* notifications for that case.
|
||||
*/
|
||||
if (IS_DAX(inode)) {
|
||||
ret = filemap_fsnotify_fault(vmf);
|
||||
if (unlikely(ret))
|
||||
return ret;
|
||||
}
|
||||
|
||||
sb_start_pagefault(inode->i_sb);
|
||||
file_update_time(vmf->vma->vm_file);
|
||||
|
|
|
@ -1756,7 +1756,7 @@ xfs_fs_fill_super(
|
|||
sb->s_time_max = XFS_LEGACY_TIME_MAX;
|
||||
}
|
||||
trace_xfs_inode_timestamp_range(mp, sb->s_time_min, sb->s_time_max);
|
||||
sb->s_iflags |= SB_I_CGROUPWB;
|
||||
sb->s_iflags |= SB_I_CGROUPWB | SB_I_ALLOW_HSM;
|
||||
|
||||
set_posix_acl_flag(sb);
|
||||
|
||||
|
|
|
@ -89,6 +89,16 @@
|
|||
#define FANOTIFY_DIRENT_EVENTS (FAN_MOVE | FAN_CREATE | FAN_DELETE | \
|
||||
FAN_RENAME)
|
||||
|
||||
/* Content events can be used to inspect file content */
|
||||
#define FANOTIFY_CONTENT_PERM_EVENTS (FAN_OPEN_PERM | FAN_OPEN_EXEC_PERM | \
|
||||
FAN_ACCESS_PERM)
|
||||
/* Pre-content events can be used to fill file content */
|
||||
#define FANOTIFY_PRE_CONTENT_EVENTS (FAN_PRE_ACCESS)
|
||||
|
||||
/* Events that require a permission response from user */
|
||||
#define FANOTIFY_PERM_EVENTS (FANOTIFY_CONTENT_PERM_EVENTS | \
|
||||
FANOTIFY_PRE_CONTENT_EVENTS)
|
||||
|
||||
/* Events that can be reported with event->fd */
|
||||
#define FANOTIFY_FD_EVENTS (FANOTIFY_PATH_EVENTS | FANOTIFY_PERM_EVENTS)
|
||||
|
||||
|
@ -104,10 +114,6 @@
|
|||
FANOTIFY_INODE_EVENTS | \
|
||||
FANOTIFY_ERROR_EVENTS)
|
||||
|
||||
/* Events that require a permission response from user */
|
||||
#define FANOTIFY_PERM_EVENTS (FAN_OPEN_PERM | FAN_ACCESS_PERM | \
|
||||
FAN_OPEN_EXEC_PERM)
|
||||
|
||||
/* Extra flags that may be reported with event or control handling of events */
|
||||
#define FANOTIFY_EVENT_FLAGS (FAN_EVENT_ON_CHILD | FAN_ONDIR)
|
||||
|
||||
|
@ -126,7 +132,9 @@
|
|||
/* These masks check for invalid bits in permission responses. */
|
||||
#define FANOTIFY_RESPONSE_ACCESS (FAN_ALLOW | FAN_DENY)
|
||||
#define FANOTIFY_RESPONSE_FLAGS (FAN_AUDIT | FAN_INFO)
|
||||
#define FANOTIFY_RESPONSE_VALID_MASK (FANOTIFY_RESPONSE_ACCESS | FANOTIFY_RESPONSE_FLAGS)
|
||||
#define FANOTIFY_RESPONSE_VALID_MASK \
|
||||
(FANOTIFY_RESPONSE_ACCESS | FANOTIFY_RESPONSE_FLAGS | \
|
||||
(FAN_ERRNO_MASK << FAN_ERRNO_SHIFT))
|
||||
|
||||
/* Do not use these old uapi constants internally */
|
||||
#undef FAN_ALL_CLASS_BITS
|
||||
|
|
|
@ -173,13 +173,20 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
|
|||
|
||||
#define FMODE_NOREUSE ((__force fmode_t)(1 << 23))
|
||||
|
||||
/* FMODE_* bit 24 */
|
||||
|
||||
/* File is embedded in backing_file object */
|
||||
#define FMODE_BACKING ((__force fmode_t)(1 << 25))
|
||||
#define FMODE_BACKING ((__force fmode_t)(1 << 24))
|
||||
|
||||
/* File was opened by fanotify and shouldn't generate fanotify events */
|
||||
#define FMODE_NONOTIFY ((__force fmode_t)(1 << 26))
|
||||
/*
|
||||
* Together with FMODE_NONOTIFY_PERM defines which fsnotify events shouldn't be
|
||||
* generated (see below)
|
||||
*/
|
||||
#define FMODE_NONOTIFY ((__force fmode_t)(1 << 25))
|
||||
|
||||
/*
|
||||
* Together with FMODE_NONOTIFY defines which fsnotify events shouldn't be
|
||||
* generated (see below)
|
||||
*/
|
||||
#define FMODE_NONOTIFY_PERM ((__force fmode_t)(1 << 26))
|
||||
|
||||
/* File is capable of returning -EAGAIN if I/O will block */
|
||||
#define FMODE_NOWAIT ((__force fmode_t)(1 << 27))
|
||||
|
@ -190,6 +197,32 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
|
|||
/* File does not contribute to nr_files count */
|
||||
#define FMODE_NOACCOUNT ((__force fmode_t)(1 << 29))
|
||||
|
||||
/*
|
||||
* The two FMODE_NONOTIFY* define which fsnotify events should not be generated
|
||||
* for a file. These are the possible values of (f->f_mode &
|
||||
* FMODE_FSNOTIFY_MASK) and their meaning:
|
||||
*
|
||||
* FMODE_NONOTIFY - suppress all (incl. non-permission) events.
|
||||
* FMODE_NONOTIFY_PERM - suppress permission (incl. pre-content) events.
|
||||
* FMODE_NONOTIFY | FMODE_NONOTIFY_PERM - suppress only pre-content events.
|
||||
*/
|
||||
/* Both f_mode bits that together select which fsnotify events are suppressed. */
#define FMODE_FSNOTIFY_MASK \
	(FMODE_NONOTIFY | FMODE_NONOTIFY_PERM)

/*
 * The macro argument is parenthesized in every expansion below so that an
 * expression argument (e.g. "a | b") is masked as a whole instead of being
 * split by the higher precedence of '&'.
 */

/* True when only FMODE_NONOTIFY is set: all events are suppressed. */
#define FMODE_FSNOTIFY_NONE(mode) \
	(((mode) & FMODE_FSNOTIFY_MASK) == FMODE_NONOTIFY)
#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
/*
 * True when permission events may be generated: either no bit is set, or
 * both bits are set (which suppresses only pre-content events).
 */
#define FMODE_FSNOTIFY_PERM(mode) \
	(((mode) & FMODE_FSNOTIFY_MASK) == 0 || \
	 ((mode) & FMODE_FSNOTIFY_MASK) == (FMODE_NONOTIFY | FMODE_NONOTIFY_PERM))
/* True when pre-content events may be generated: neither bit is set. */
#define FMODE_FSNOTIFY_HSM(mode) \
	(((mode) & FMODE_FSNOTIFY_MASK) == 0)
#else
/* Without CONFIG_FANOTIFY_ACCESS_PERMISSIONS both checks are constant false. */
#define FMODE_FSNOTIFY_PERM(mode)	0
#define FMODE_FSNOTIFY_HSM(mode)	0
#endif
|
||||
|
||||
|
||||
/*
|
||||
* Attribute flags. These should be or-ed together to figure out what
|
||||
* has been changed!
|
||||
|
@ -1246,6 +1279,7 @@ extern int send_sigurg(struct file *file);
|
|||
#define SB_I_RETIRED 0x00000800 /* superblock shouldn't be reused */
|
||||
#define SB_I_NOUMASK 0x00001000 /* VFS does not apply umask */
|
||||
#define SB_I_NOIDMAP 0x00002000 /* No idmapped mounts on this superblock */
|
||||
#define SB_I_ALLOW_HSM 0x00004000 /* Allow HSM events on this superblock */
|
||||
|
||||
/* Possible states of 'frozen' field */
|
||||
enum {
|
||||
|
@ -2767,6 +2801,8 @@ static inline struct file *file_open_root_mnt(struct vfsmount *mnt,
|
|||
}
|
||||
struct file *dentry_open(const struct path *path, int flags,
|
||||
const struct cred *creds);
|
||||
struct file *dentry_open_nonotify(const struct path *path, int flags,
|
||||
const struct cred *cred);
|
||||
struct file *dentry_create(const struct path *path, int flags, umode_t mode,
|
||||
const struct cred *cred);
|
||||
struct path *backing_file_user_path(struct file *f);
|
||||
|
@ -3075,6 +3111,28 @@ static inline void allow_write_access(struct file *file)
|
|||
if (file)
|
||||
atomic_inc(&file_inode(file)->i_writecount);
|
||||
}
|
||||
|
||||
/*
|
||||
* Do not prevent write to executable file when watched by pre-content events.
|
||||
*
|
||||
* Note that FMODE_FSNOTIFY_HSM mode is set depending on pre-content watches at
|
||||
* the time of file open and remains constant for entire lifetime of the file,
|
||||
* so if pre-content watches are added post execution or removed before the end
|
||||
* of the execution, it will not cause i_writecount reference leak.
|
||||
*/
|
||||
static inline int exe_file_deny_write_access(struct file *exe_file)
|
||||
{
|
||||
if (unlikely(FMODE_FSNOTIFY_HSM(exe_file->f_mode)))
|
||||
return 0;
|
||||
return deny_write_access(exe_file);
|
||||
}
|
||||
static inline void exe_file_allow_write_access(struct file *exe_file)
|
||||
{
|
||||
if (unlikely(!exe_file || FMODE_FSNOTIFY_HSM(exe_file->f_mode)))
|
||||
return;
|
||||
allow_write_access(exe_file);
|
||||
}
|
||||
|
||||
static inline bool inode_is_open_for_write(const struct inode *inode)
|
||||
{
|
||||
return atomic_read(&inode->i_writecount) > 0;
|
||||
|
@ -3730,11 +3788,9 @@ struct ctl_table;
|
|||
int __init list_bdev_fs_names(char *buf, size_t size);
|
||||
|
||||
#define __FMODE_EXEC ((__force int) FMODE_EXEC)
|
||||
#define __FMODE_NONOTIFY ((__force int) FMODE_NONOTIFY)
|
||||
|
||||
#define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE])
|
||||
#define OPEN_FMODE(flag) ((__force fmode_t)(((flag + 1) & O_ACCMODE) | \
|
||||
(flag & __FMODE_NONOTIFY)))
|
||||
#define OPEN_FMODE(flag) ((__force fmode_t)((flag + 1) & O_ACCMODE))
|
||||
|
||||
static inline bool is_sxid(umode_t mode)
|
||||
{
|
||||
|
|
|
@ -108,38 +108,35 @@ static inline void fsnotify_dentry(struct dentry *dentry, __u32 mask)
|
|||
fsnotify_parent(dentry, mask, dentry, FSNOTIFY_EVENT_DENTRY);
|
||||
}
|
||||
|
||||
/*
 * Report @mask on @path's dentry through fsnotify_parent(), handing over the
 * path itself as FSNOTIFY_EVENT_PATH data. Returns fsnotify_parent()'s result.
 */
static inline int fsnotify_path(const struct path *path, __u32 mask)
{
	struct dentry *dentry = path->dentry;

	return fsnotify_parent(dentry, mask, path, FSNOTIFY_EVENT_PATH);
}
|
||||
|
||||
static inline int fsnotify_file(struct file *file, __u32 mask)
|
||||
{
|
||||
const struct path *path;
|
||||
|
||||
/*
|
||||
* FMODE_NONOTIFY are fds generated by fanotify itself which should not
|
||||
* generate new events. We also don't want to generate events for
|
||||
* FMODE_PATH fds (involves open & close events) as they are just
|
||||
* handle creation / destruction events and not "real" file events.
|
||||
*/
|
||||
if (file->f_mode & (FMODE_NONOTIFY | FMODE_PATH))
|
||||
if (FMODE_FSNOTIFY_NONE(file->f_mode))
|
||||
return 0;
|
||||
|
||||
path = &file->f_path;
|
||||
/* Permission events require group prio >= FSNOTIFY_PRIO_CONTENT */
|
||||
if (mask & ALL_FSNOTIFY_PERM_EVENTS &&
|
||||
!fsnotify_sb_has_priority_watchers(path->dentry->d_sb,
|
||||
FSNOTIFY_PRIO_CONTENT))
|
||||
return 0;
|
||||
|
||||
return fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH);
|
||||
return fsnotify_path(&file->f_path, mask);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
|
||||
|
||||
void file_set_fsnotify_mode(struct file *file);
|
||||
|
||||
/*
|
||||
* fsnotify_file_area_perm - permission hook before access to file range
|
||||
*/
|
||||
static inline int fsnotify_file_area_perm(struct file *file, int perm_mask,
|
||||
const loff_t *ppos, size_t count)
|
||||
{
|
||||
__u32 fsnotify_mask = FS_ACCESS_PERM;
|
||||
|
||||
/*
|
||||
* filesystem may be modified in the context of permission events
|
||||
* (e.g. by HSM filling a file on access), so sb freeze protection
|
||||
|
@ -147,14 +144,49 @@ static inline int fsnotify_file_area_perm(struct file *file, int perm_mask,
|
|||
*/
|
||||
lockdep_assert_once(file_write_not_started(file));
|
||||
|
||||
if (!(perm_mask & (MAY_READ | MAY_WRITE | MAY_ACCESS)))
|
||||
return 0;
|
||||
|
||||
if (likely(!FMODE_FSNOTIFY_PERM(file->f_mode)))
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* read()/write() and other types of access generate pre-content events.
|
||||
*/
|
||||
if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) {
|
||||
int ret = fsnotify_pre_content(&file->f_path, ppos, count);
|
||||
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (!(perm_mask & MAY_READ))
|
||||
return 0;
|
||||
|
||||
return fsnotify_file(file, fsnotify_mask);
|
||||
/*
|
||||
* read() also generates the legacy FS_ACCESS_PERM event, so content
|
||||
* scanners can inspect the content filled by pre-content event.
|
||||
*/
|
||||
return fsnotify_path(&file->f_path, FS_ACCESS_PERM);
|
||||
}
|
||||
|
||||
/*
|
||||
* fsnotify_file_perm - permission hook before file access
|
||||
* fsnotify_truncate_perm - permission hook before file truncate
|
||||
*/
|
||||
static inline int fsnotify_truncate_perm(const struct path *path, loff_t length)
{
	struct super_block *sb = d_inode(path->dentry)->i_sb;

	/* The superblock does not allow HSM events: nothing to report. */
	if (!(sb->s_iflags & SB_I_ALLOW_HSM))
		return 0;

	/* No watcher of pre-content priority on this superblock. */
	if (!fsnotify_sb_has_priority_watchers(sb, FSNOTIFY_PRIO_PRE_CONTENT))
		return 0;

	/* The new length is reported as the position, with a count of 0. */
	return fsnotify_pre_content(path, &length, 0);
}
|
||||
|
||||
/*
|
||||
* fsnotify_file_perm - permission hook before file access (unknown range)
|
||||
*/
|
||||
static inline int fsnotify_file_perm(struct file *file, int perm_mask)
|
||||
{
|
||||
|
@ -168,22 +200,34 @@ static inline int fsnotify_open_perm(struct file *file)
|
|||
{
|
||||
int ret;
|
||||
|
||||
if (likely(!FMODE_FSNOTIFY_PERM(file->f_mode)))
|
||||
return 0;
|
||||
|
||||
if (file->f_flags & __FMODE_EXEC) {
|
||||
ret = fsnotify_file(file, FS_OPEN_EXEC_PERM);
|
||||
ret = fsnotify_path(&file->f_path, FS_OPEN_EXEC_PERM);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
return fsnotify_file(file, FS_OPEN_PERM);
|
||||
return fsnotify_path(&file->f_path, FS_OPEN_PERM);
|
||||
}
|
||||
|
||||
#else
|
||||
/* Empty variant from the #else branch: leaves @file untouched. */
static inline void file_set_fsnotify_mode(struct file *file)
{
}
|
||||
|
||||
static inline int fsnotify_file_area_perm(struct file *file, int perm_mask,
|
||||
const loff_t *ppos, size_t count)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Stub variant from the #else branch: ignores its arguments and returns 0. */
static inline int fsnotify_truncate_perm(const struct path *path, loff_t length)
{
	return 0;
}
|
||||
|
||||
static inline int fsnotify_file_perm(struct file *file, int perm_mask)
|
||||
{
|
||||
return 0;
|
||||
|
|
|
@ -55,6 +55,9 @@
|
|||
#define FS_OPEN_PERM 0x00010000 /* open event in an permission hook */
|
||||
#define FS_ACCESS_PERM 0x00020000 /* access event in a permissions hook */
|
||||
#define FS_OPEN_EXEC_PERM 0x00040000 /* open/exec event in a permission hook */
|
||||
/* #define FS_DIR_MODIFY 0x00080000 */ /* Deprecated (reserved) */
|
||||
|
||||
#define FS_PRE_ACCESS 0x00100000 /* Pre-content access hook */
|
||||
|
||||
/*
|
||||
* Set on inode mark that cares about things that happen to its children.
|
||||
|
@ -77,8 +80,14 @@
|
|||
*/
|
||||
#define ALL_FSNOTIFY_DIRENT_EVENTS (FS_CREATE | FS_DELETE | FS_MOVE | FS_RENAME)
|
||||
|
||||
#define ALL_FSNOTIFY_PERM_EVENTS (FS_OPEN_PERM | FS_ACCESS_PERM | \
|
||||
FS_OPEN_EXEC_PERM)
|
||||
/* Content events can be used to inspect file content */
|
||||
#define FSNOTIFY_CONTENT_PERM_EVENTS (FS_OPEN_PERM | FS_OPEN_EXEC_PERM | \
|
||||
FS_ACCESS_PERM)
|
||||
/* Pre-content events can be used to fill file content */
|
||||
#define FSNOTIFY_PRE_CONTENT_EVENTS (FS_PRE_ACCESS)
|
||||
|
||||
#define ALL_FSNOTIFY_PERM_EVENTS (FSNOTIFY_CONTENT_PERM_EVENTS | \
|
||||
FSNOTIFY_PRE_CONTENT_EVENTS)
|
||||
|
||||
/*
|
||||
* This is a list of all events that may get sent to a parent that is watching
|
||||
|
@ -285,6 +294,7 @@ static inline void fsnotify_group_assert_locked(struct fsnotify_group *group)
|
|||
/* When calling fsnotify tell it if the data is a path or inode */
|
||||
enum fsnotify_data_type {
|
||||
FSNOTIFY_EVENT_NONE,
|
||||
FSNOTIFY_EVENT_FILE_RANGE,
|
||||
FSNOTIFY_EVENT_PATH,
|
||||
FSNOTIFY_EVENT_INODE,
|
||||
FSNOTIFY_EVENT_DENTRY,
|
||||
|
@ -297,6 +307,17 @@ struct fs_error_report {
|
|||
struct super_block *sb;
|
||||
};
|
||||
|
||||
struct file_range {
|
||||
const struct path *path;
|
||||
loff_t pos;
|
||||
size_t count;
|
||||
};
|
||||
|
||||
static inline const struct path *file_range_path(const struct file_range *range)
|
||||
{
|
||||
return range->path;
|
||||
}
|
||||
|
||||
static inline struct inode *fsnotify_data_inode(const void *data, int data_type)
|
||||
{
|
||||
switch (data_type) {
|
||||
|
@ -306,6 +327,8 @@ static inline struct inode *fsnotify_data_inode(const void *data, int data_type)
|
|||
return d_inode(data);
|
||||
case FSNOTIFY_EVENT_PATH:
|
||||
return d_inode(((const struct path *)data)->dentry);
|
||||
case FSNOTIFY_EVENT_FILE_RANGE:
|
||||
return d_inode(file_range_path(data)->dentry);
|
||||
case FSNOTIFY_EVENT_ERROR:
|
||||
return ((struct fs_error_report *)data)->inode;
|
||||
default:
|
||||
|
@ -321,6 +344,8 @@ static inline struct dentry *fsnotify_data_dentry(const void *data, int data_typ
|
|||
return (struct dentry *)data;
|
||||
case FSNOTIFY_EVENT_PATH:
|
||||
return ((const struct path *)data)->dentry;
|
||||
case FSNOTIFY_EVENT_FILE_RANGE:
|
||||
return file_range_path(data)->dentry;
|
||||
default:
|
||||
return NULL;
|
||||
}
|
||||
|
@ -332,6 +357,8 @@ static inline const struct path *fsnotify_data_path(const void *data,
|
|||
switch (data_type) {
|
||||
case FSNOTIFY_EVENT_PATH:
|
||||
return data;
|
||||
case FSNOTIFY_EVENT_FILE_RANGE:
|
||||
return file_range_path(data);
|
||||
default:
|
||||
return NULL;
|
||||
}
|
||||
|
@ -347,6 +374,8 @@ static inline struct super_block *fsnotify_data_sb(const void *data,
|
|||
return ((struct dentry *)data)->d_sb;
|
||||
case FSNOTIFY_EVENT_PATH:
|
||||
return ((const struct path *)data)->dentry->d_sb;
|
||||
case FSNOTIFY_EVENT_FILE_RANGE:
|
||||
return file_range_path(data)->dentry->d_sb;
|
||||
case FSNOTIFY_EVENT_ERROR:
|
||||
return ((struct fs_error_report *) data)->sb;
|
||||
default:
|
||||
|
@ -366,6 +395,18 @@ static inline struct fs_error_report *fsnotify_data_error_report(
|
|||
}
|
||||
}
|
||||
|
||||
static inline const struct file_range *fsnotify_data_file_range(
|
||||
const void *data,
|
||||
int data_type)
|
||||
{
|
||||
switch (data_type) {
|
||||
case FSNOTIFY_EVENT_FILE_RANGE:
|
||||
return (struct file_range *)data;
|
||||
default:
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Index to merged marks iterator array that correlates to a type of watch.
|
||||
* The type of watched object can be deduced from the iterator type, but not
|
||||
|
@ -854,9 +895,17 @@ static inline void fsnotify_init_event(struct fsnotify_event *event)
|
|||
{
|
||||
INIT_LIST_HEAD(&event->list);
|
||||
}
|
||||
int fsnotify_pre_content(const struct path *path, const loff_t *ppos,
|
||||
size_t count);
|
||||
|
||||
#else
|
||||
|
||||
static inline int fsnotify_pre_content(const struct path *path,
|
||||
const loff_t *ppos, size_t count)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline int fsnotify(__u32 mask, const void *data, int data_type,
|
||||
struct inode *dir, const struct qstr *name,
|
||||
struct inode *inode, u32 cookie)
|
||||
|
|
|
@ -3431,6 +3431,7 @@ extern vm_fault_t filemap_fault(struct vm_fault *vmf);
|
|||
extern vm_fault_t filemap_map_pages(struct vm_fault *vmf,
|
||||
pgoff_t start_pgoff, pgoff_t end_pgoff);
|
||||
extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf);
|
||||
extern vm_fault_t filemap_fsnotify_fault(struct vm_fault *vmf);
|
||||
|
||||
extern unsigned long stack_guard_gap;
|
||||
/* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */
|
||||
|
|
|
@ -6,7 +6,6 @@
|
|||
|
||||
/*
|
||||
* FMODE_EXEC is 0x20
|
||||
* FMODE_NONOTIFY is 0x4000000
|
||||
* These cannot be used by userspace O_* until internal and external open
|
||||
* flags are split.
|
||||
* -Eric Paris
|
||||
|
|
|
@ -25,6 +25,9 @@
|
|||
#define FAN_OPEN_PERM 0x00010000 /* File open in perm check */
|
||||
#define FAN_ACCESS_PERM 0x00020000 /* File accessed in perm check */
|
||||
#define FAN_OPEN_EXEC_PERM 0x00040000 /* File open/exec in perm check */
|
||||
/* #define FAN_DIR_MODIFY 0x00080000 */ /* Deprecated (reserved) */
|
||||
|
||||
#define FAN_PRE_ACCESS 0x00100000 /* Pre-content access hook */
|
||||
|
||||
#define FAN_EVENT_ON_CHILD 0x08000000 /* Interested in child events */
|
||||
|
||||
|
@ -143,6 +146,7 @@ struct fanotify_event_metadata {
|
|||
#define FAN_EVENT_INFO_TYPE_DFID 3
|
||||
#define FAN_EVENT_INFO_TYPE_PIDFD 4
|
||||
#define FAN_EVENT_INFO_TYPE_ERROR 5
|
||||
#define FAN_EVENT_INFO_TYPE_RANGE 6
|
||||
|
||||
/* Special info types for FAN_RENAME */
|
||||
#define FAN_EVENT_INFO_TYPE_OLD_DFID_NAME 10
|
||||
|
@ -189,6 +193,13 @@ struct fanotify_event_info_error {
|
|||
__u32 error_count;
|
||||
};
|
||||
|
||||
struct fanotify_event_info_range {
|
||||
struct fanotify_event_info_header hdr;
|
||||
__u32 pad;
|
||||
__u64 offset;
|
||||
__u64 count;
|
||||
};
|
||||
|
||||
/*
|
||||
* User space may need to record additional information about its decision.
|
||||
* The extra information type records what kind of information is included.
|
||||
|
@ -224,6 +235,13 @@ struct fanotify_response_info_audit_rule {
|
|||
/* Legit userspace responses to a _PERM event */
|
||||
#define FAN_ALLOW 0x01
|
||||
#define FAN_DENY 0x02
|
||||
/* An errno other than EPERM can be specified in the upper byte of a deny response */
|
||||
#define FAN_ERRNO_BITS 8
|
||||
#define FAN_ERRNO_SHIFT (32 - FAN_ERRNO_BITS)
|
||||
#define FAN_ERRNO_MASK ((1 << FAN_ERRNO_BITS) - 1)
|
||||
#define FAN_DENY_ERRNO(err) \
|
||||
(FAN_DENY | ((((__u32)(err)) & FAN_ERRNO_MASK) << FAN_ERRNO_SHIFT))
|
||||
|
||||
#define FAN_AUDIT 0x10 /* Bitmask to create audit record for result */
|
||||
#define FAN_INFO 0x20 /* Bitmask to indicate additional information */
|
||||
|
||||
|
|
|
@ -625,8 +625,8 @@ static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
|
|||
* We depend on the oldmm having properly denied write access to the
|
||||
* exe_file already.
|
||||
*/
|
||||
if (exe_file && deny_write_access(exe_file))
|
||||
pr_warn_once("deny_write_access() failed in %s\n", __func__);
|
||||
if (exe_file && exe_file_deny_write_access(exe_file))
|
||||
pr_warn_once("exe_file_deny_write_access() failed in %s\n", __func__);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MMU
|
||||
|
@ -1416,13 +1416,13 @@ int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
|
|||
* We expect the caller (i.e., sys_execve) to have already denied
|
||||
* write access, so this is unlikely to fail.
|
||||
*/
|
||||
if (unlikely(deny_write_access(new_exe_file)))
|
||||
if (unlikely(exe_file_deny_write_access(new_exe_file)))
|
||||
return -EACCES;
|
||||
get_file(new_exe_file);
|
||||
}
|
||||
rcu_assign_pointer(mm->exe_file, new_exe_file);
|
||||
if (old_exe_file) {
|
||||
allow_write_access(old_exe_file);
|
||||
exe_file_allow_write_access(old_exe_file);
|
||||
fput(old_exe_file);
|
||||
}
|
||||
return 0;
|
||||
|
@ -1463,7 +1463,7 @@ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
|
|||
return ret;
|
||||
}
|
||||
|
||||
ret = deny_write_access(new_exe_file);
|
||||
ret = exe_file_deny_write_access(new_exe_file);
|
||||
if (ret)
|
||||
return -EACCES;
|
||||
get_file(new_exe_file);
|
||||
|
@ -1475,7 +1475,7 @@ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
|
|||
mmap_write_unlock(mm);
|
||||
|
||||
if (old_exe_file) {
|
||||
allow_write_access(old_exe_file);
|
||||
exe_file_allow_write_access(old_exe_file);
|
||||
fput(old_exe_file);
|
||||
}
|
||||
return 0;
|
||||
|
|
86
mm/filemap.c
86
mm/filemap.c
|
@ -47,6 +47,7 @@
|
|||
#include <linux/splice.h>
|
||||
#include <linux/rcupdate_wait.h>
|
||||
#include <linux/sched/mm.h>
|
||||
#include <linux/fsnotify.h>
|
||||
#include <asm/pgalloc.h>
|
||||
#include <asm/tlbflush.h>
|
||||
#include "internal.h"
|
||||
|
@ -3141,6 +3142,14 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
|
|||
unsigned long vm_flags = vmf->vma->vm_flags;
|
||||
unsigned int mmap_miss;
|
||||
|
||||
/*
|
||||
* If we have pre-content watches we need to disable readahead to make
|
||||
* sure that we don't populate our mapping with 0 filled pages that we
|
||||
* never emitted an event for.
|
||||
*/
|
||||
if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode)))
|
||||
return fpin;
|
||||
|
||||
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
||||
/* Use the readahead code, even if readahead is disabled */
|
||||
if ((vm_flags & VM_HUGEPAGE) && HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER) {
|
||||
|
@ -3209,6 +3218,10 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
|
|||
struct file *fpin = NULL;
|
||||
unsigned int mmap_miss;
|
||||
|
||||
/* See comment in do_sync_mmap_readahead. */
|
||||
if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode)))
|
||||
return fpin;
|
||||
|
||||
/* If we don't want any read-ahead, don't bother */
|
||||
if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages)
|
||||
return fpin;
|
||||
|
@ -3267,6 +3280,48 @@ static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf)
|
|||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* filemap_fsnotify_fault - maybe emit a pre-content event.
|
||||
* @vmf: struct vm_fault containing details of the fault.
|
||||
*
|
||||
* If we have a pre-content watch on this file we will emit an event for this
|
||||
* range. If we return anything the fault caller should return immediately, we
|
||||
* will return VM_FAULT_RETRY if we had to emit an event, which will trigger the
|
||||
* fault again and then the fault handler will run the second time through.
|
||||
*
|
||||
* Return: a bitwise-OR of %VM_FAULT_ codes, 0 if nothing happened.
|
||||
*/
|
||||
vm_fault_t filemap_fsnotify_fault(struct vm_fault *vmf)
|
||||
{
|
||||
struct file *fpin = NULL;
|
||||
int mask = (vmf->flags & FAULT_FLAG_WRITE) ? MAY_WRITE : MAY_ACCESS;
|
||||
loff_t pos = vmf->pgoff >> PAGE_SHIFT;
|
||||
size_t count = PAGE_SIZE;
|
||||
int err;
|
||||
|
||||
/*
|
||||
* We already did this and now we're retrying with everything locked,
|
||||
* don't emit the event and continue.
|
||||
*/
|
||||
if (vmf->flags & FAULT_FLAG_TRIED)
|
||||
return 0;
|
||||
|
||||
/* No watches, we're done. */
|
||||
if (likely(!FMODE_FSNOTIFY_HSM(vmf->vma->vm_file->f_mode)))
|
||||
return 0;
|
||||
|
||||
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
|
||||
if (!fpin)
|
||||
return VM_FAULT_SIGBUS;
|
||||
|
||||
err = fsnotify_file_area_perm(fpin, mask, &pos, count);
|
||||
fput(fpin);
|
||||
if (err)
|
||||
return VM_FAULT_SIGBUS;
|
||||
return VM_FAULT_RETRY;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(filemap_fsnotify_fault);
|
||||
|
||||
/**
|
||||
* filemap_fault - read in file data for page fault handling
|
||||
* @vmf: struct vm_fault containing details of the fault
|
||||
|
@ -3370,6 +3425,37 @@ retry_find:
|
|||
* or because readahead was otherwise unable to retrieve it.
|
||||
*/
|
||||
if (unlikely(!folio_test_uptodate(folio))) {
|
||||
/*
|
||||
* If this is a pre-content file we can now emit an event to
|
||||
* try and populate the folio.
|
||||
*/
|
||||
if (!(vmf->flags & FAULT_FLAG_TRIED) &&
|
||||
unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) {
|
||||
loff_t pos = folio_pos(folio);
|
||||
size_t count = folio_size(folio);
|
||||
|
||||
/* We're NOWAIT, we have to retry. */
|
||||
if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) {
|
||||
folio_unlock(folio);
|
||||
goto out_retry;
|
||||
}
|
||||
|
||||
if (mapping_locked)
|
||||
filemap_invalidate_unlock_shared(mapping);
|
||||
mapping_locked = false;
|
||||
|
||||
folio_unlock(folio);
|
||||
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
|
||||
if (!fpin)
|
||||
goto out_retry;
|
||||
|
||||
error = fsnotify_file_area_perm(fpin, MAY_ACCESS, &pos,
|
||||
count);
|
||||
if (error)
|
||||
ret = VM_FAULT_SIGBUS;
|
||||
goto out_retry;
|
||||
}
|
||||
|
||||
/*
|
||||
* If the invalidate lock is not held, the folio was in cache
|
||||
* and uptodate and now it is not. Strange but possible since we
|
||||
|
|
19
mm/memory.c
19
mm/memory.c
|
@ -76,6 +76,7 @@
|
|||
#include <linux/ptrace.h>
|
||||
#include <linux/vmalloc.h>
|
||||
#include <linux/sched/sysctl.h>
|
||||
#include <linux/fsnotify.h>
|
||||
|
||||
#include <trace/events/kmem.h>
|
||||
|
||||
|
@ -5662,8 +5663,17 @@ out_map:
|
|||
static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
|
||||
{
|
||||
struct vm_area_struct *vma = vmf->vma;
|
||||
|
||||
if (vma_is_anonymous(vma))
|
||||
return do_huge_pmd_anonymous_page(vmf);
|
||||
/*
|
||||
* Currently we just emit PAGE_SIZE for our fault events, so don't allow
|
||||
* a huge fault if we have a pre content watch on this file. This would
|
||||
* be trivial to support, but there would need to be tests to ensure
|
||||
* this works properly and those don't exist currently.
|
||||
*/
|
||||
if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode)))
|
||||
return VM_FAULT_FALLBACK;
|
||||
if (vma->vm_ops->huge_fault)
|
||||
return vma->vm_ops->huge_fault(vmf, PMD_ORDER);
|
||||
return VM_FAULT_FALLBACK;
|
||||
|
@ -5687,6 +5697,9 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
|
|||
}
|
||||
|
||||
if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
|
||||
/* See comment in create_huge_pmd. */
|
||||
if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode)))
|
||||
goto split;
|
||||
if (vma->vm_ops->huge_fault) {
|
||||
ret = vma->vm_ops->huge_fault(vmf, PMD_ORDER);
|
||||
if (!(ret & VM_FAULT_FALLBACK))
|
||||
|
@ -5709,6 +5722,9 @@ static vm_fault_t create_huge_pud(struct vm_fault *vmf)
|
|||
/* No support for anonymous transparent PUD pages yet */
|
||||
if (vma_is_anonymous(vma))
|
||||
return VM_FAULT_FALLBACK;
|
||||
/* See comment in create_huge_pmd. */
|
||||
if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode)))
|
||||
return VM_FAULT_FALLBACK;
|
||||
if (vma->vm_ops->huge_fault)
|
||||
return vma->vm_ops->huge_fault(vmf, PUD_ORDER);
|
||||
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
|
||||
|
@ -5726,6 +5742,9 @@ static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
|
|||
if (vma_is_anonymous(vma))
|
||||
goto split;
|
||||
if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
|
||||
/* See comment in create_huge_pmd. */
|
||||
if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode)))
|
||||
goto split;
|
||||
if (vma->vm_ops->huge_fault) {
|
||||
ret = vma->vm_ops->huge_fault(vmf, PUD_ORDER);
|
||||
if (!(ret & VM_FAULT_FALLBACK))
|
||||
|
|
|
@ -1613,6 +1613,13 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
|
|||
}
|
||||
EXPORT_SYMBOL(remap_vmalloc_range);
|
||||
|
||||
/*
 * Variant that is not expected to be called: it hits BUG() unconditionally.
 * The return statement only satisfies the function's return type.
 */
vm_fault_t filemap_fsnotify_fault(struct vm_fault *vmf)
{
	BUG();
	return 0;
}
|
||||
EXPORT_SYMBOL_GPL(filemap_fsnotify_fault);
|
||||
|
||||
vm_fault_t filemap_fault(struct vm_fault *vmf)
|
||||
{
|
||||
BUG();
|
||||
|
|
|
@ -128,6 +128,7 @@
|
|||
#include <linux/blk-cgroup.h>
|
||||
#include <linux/fadvise.h>
|
||||
#include <linux/sched/mm.h>
|
||||
#include <linux/fsnotify.h>
|
||||
|
||||
#include "internal.h"
|
||||
|
||||
|
@ -548,6 +549,15 @@ void page_cache_sync_ra(struct readahead_control *ractl,
|
|||
unsigned long max_pages, contig_count;
|
||||
pgoff_t prev_index, miss;
|
||||
|
||||
/*
|
||||
* If we have pre-content watches we need to disable readahead to make
|
||||
* sure that we don't find 0 filled pages in cache that we never emitted
|
||||
* events for. Filesystems supporting HSM must make sure to not call
|
||||
* this function with ractl->file unset for files handled by HSM.
|
||||
*/
|
||||
if (ractl->file && unlikely(FMODE_FSNOTIFY_HSM(ractl->file->f_mode)))
|
||||
return;
|
||||
|
||||
/*
|
||||
* Even if readahead is disabled, issue this request as readahead
|
||||
* as we'll need it to satisfy the requested range. The forced
|
||||
|
@ -626,6 +636,10 @@ void page_cache_async_ra(struct readahead_control *ractl,
|
|||
if (!ra->ra_pages)
|
||||
return;
|
||||
|
||||
/* See the comment in page_cache_sync_ra. */
|
||||
if (ractl->file && unlikely(FMODE_FSNOTIFY_HSM(ractl->file->f_mode)))
|
||||
return;
|
||||
|
||||
/*
|
||||
* Same bit is used for PG_readahead and PG_reclaim.
|
||||
*/
|
||||
|
|
|
@ -3404,7 +3404,8 @@ static int selinux_path_notify(const struct path *path, u64 mask,
|
|||
perm |= FILE__WATCH_WITH_PERM;
|
||||
|
||||
/* watches on read-like events need the file:watch_reads permission */
|
||||
if (mask & (FS_ACCESS | FS_ACCESS_PERM | FS_CLOSE_NOWRITE))
|
||||
if (mask & (FS_ACCESS | FS_ACCESS_PERM | FS_PRE_ACCESS |
|
||||
FS_CLOSE_NOWRITE))
|
||||
perm |= FILE__WATCH_READS;
|
||||
|
||||
return path_has_perm(current_cred(), path, perm);
|
||||
|
|
Loading…
Add table
Reference in a new issue