int (*atomic_open)(struct inode *, struct dentry *, struct file *,
unsigned open_flag, umode_t create_mode, int *opened);
int (*tmpfile) (struct inode *, struct dentry *, umode_t);
- int (*dentry_open)(struct dentry *, struct file *, const struct cred *);
};
Again, all methods are called without any locks being held, unless
writing out the whole address_space.
The Writeback tag is used by filemap*wait* and sync_page* functions,
-via filemap_fdatawait_range, to wait for all writeback to
-complete. While waiting ->sync_page (if defined) will be called on
-each page that is found to require writeback.
+via filemap_fdatawait_range, to wait for all writeback to complete.
An address_space handler may attach extra information to a page,
typically using the 'private' field in the 'struct page'. If such
The read process essentially only requires 'readpage'. The write
process is more complicated and uses write_begin/write_end or
-set_page_dirty to write data into the address_space, and writepage,
-sync_page, and writepages to writeback data to storage.
+set_page_dirty to write data into the address_space, and writepage
+and writepages to writeback data to storage.
Adding and removing pages to/from an address_space is protected by the
inode's i_mutex.
int (*releasepage) (struct page *, int);
void (*freepage)(struct page *);
ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter);
+ /* isolate a page for migration */
+ bool (*isolate_page) (struct page *, isolate_mode_t);
/* migrate the contents of a page to the specified target */
int (*migratepage) (struct page *, struct page *);
+ /* put migration-failed page back to right list */
+ void (*putback_page) (struct page *);
int (*launder_page) (struct page *);
+
int (*is_partially_uptodate) (struct page *, unsigned long,
unsigned long);
void (*is_dirty_writeback) (struct page *, bool *, bool *);
but instead uses bmap to find out where the blocks in the file
are and uses those addresses directly.
- dentry_open: *WARNING: probably going away soon, do not use!* This is an
- alternative to f_op->open(), the difference is that this method may open
- a file not necessarily originating from the same filesystem as the one
- i_op->open() was called on. It may be useful for stacking filesystems
- which want to allow native I/O directly on underlying files.
-
-
invalidatepage: If a page has PagePrivate set, then invalidatepage
will be called when part or all of the page is to be removed
from the address space. This generally corresponds to either a
The second case is when a request has been made to invalidate
some or all pages in an address_space. This can happen
- through the fadvice(POSIX_FADV_DONTNEED) system call or by the
+ through the fadvise(POSIX_FADV_DONTNEED) system call or by the
filesystem explicitly requesting it as nfs and 9fs do (when
they believe the cache may be out of date with storage) by
calling invalidate_inode_pages2().
and transfer data directly between the storage and the
application's address space.
+ isolate_page: Called by the VM when isolating a movable non-lru page.
+ If page is successfully isolated, VM marks the page as PG_isolated
+ via __SetPageIsolated.
+
migrate_page: This is used to compact the physical memory usage.
If the VM wants to relocate a page (maybe off a memory card
that is signalling imminent failure) it will pass a new page
transfer any private data across and update any references
that it has to the page.
+ putback_page: Called by the VM when isolated page's migration fails.
+
launder_page: Called before freeing a page - it writes back the dirty page. To
prevent redirtying the page, it is kept locked during the whole
operation.
int (*d_revalidate)(struct dentry *, unsigned int);
int (*d_weak_revalidate)(struct dentry *, unsigned int);
int (*d_hash)(const struct dentry *, struct qstr *);
- int (*d_compare)(const struct dentry *, const struct dentry *,
+ int (*d_compare)(const struct dentry *,
unsigned int, const char *, const struct qstr *);
int (*d_delete)(const struct dentry *);
+ int (*d_init)(struct dentry *);
void (*d_release)(struct dentry *);
void (*d_iput)(struct dentry *, struct inode *);
char *(*d_dname)(struct dentry *, char *, int);
struct vfsmount *(*d_automount)(struct path *);
int (*d_manage)(struct dentry *, bool);
+ struct dentry *(*d_real)(struct dentry *, const struct inode *,
+ unsigned int);
};
d_revalidate: called when the VFS needs to revalidate a dentry. This
always cache a reachable dentry. d_delete must be constant and
idempotent.
+ d_init: called when a dentry is allocated
+
d_release: called when a dentry is really deallocated
d_iput: called when a dentry loses its inode (just prior to its
at the end of the buffer, and returns a pointer to the first char.
dynamic_dname() helper function is provided to take care of this.
+ Example :
+
+ static char *pipefs_dname(struct dentry *dent, char *buffer, int buflen)
+ {
+ return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]",
+ dentry->d_inode->i_ino);
+ }
+
d_automount: called when an automount dentry is to be traversed (optional).
This should create a new VFS mount record and return the record to the
caller. The caller is supplied with a path parameter giving the
This function is only used if DCACHE_MANAGE_TRANSIT is set on the
dentry being transited from.
-Example :
+ d_real: overlay/union type filesystems implement this method to return one of
+ the underlying dentries hidden by the overlay. It is used in three
+ different modes:
-static char *pipefs_dname(struct dentry *dent, char *buffer, int buflen)
-{
- return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]",
- dentry->d_inode->i_ino);
-}
+ Called from open it may need to copy-up the file depending on the
+ supplied open flags. This mode is selected with a non-zero flags
+ argument. In this mode the d_real method can return an error.
+
+ Called from file_dentry() it returns the real dentry matching the inode
+ argument. The real dentry may be from a lower layer already copied up,
+ but still referenced from the file. This mode is selected with a
+ non-NULL inode argument. This will always succeed.
+
+ With NULL inode and zero flags the topmost real underlying dentry is
+ returned. This will always succeed.
+
+ This method is never called with both non-NULL inode and non-zero flags.
Each dentry has a pointer to its parent dentry, as well as a hash list
of child dentries. Child dentries are basically like files in a
* If we need to do entry work or if we guess we'll need to do
* exit work, go straight to the slow path.
*/
- testl $_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+ movq PER_CPU_VAR(current_task), %r11
+ testl $_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
jnz entry_SYSCALL64_slow_path
entry_SYSCALL_64_fastpath:
*/
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+ movq PER_CPU_VAR(current_task), %r11
+ testl $_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
jnz 1f
LOCKDEP_SYS_EXIT
jne opportunistic_sysret_failed
/*
- * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET,
- * restoring TF results in a trap from userspace immediately after
- * SYSRET. This would cause an infinite loop whenever #DB happens
- * with register state that satisfies the opportunistic SYSRET
- * conditions. For example, single-stepping this user code:
+ * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot
+ * restore RF properly. If the slowpath sets it for whatever reason, we
+ * need to restore it correctly.
+ *
+ * SYSRET can restore TF, but unlike IRET, restoring TF results in a
+ * trap from userspace immediately after SYSRET. This would cause an
+ * infinite loop whenever #DB happens with register state that satisfies
+ * the opportunistic SYSRET conditions. For example, single-stepping
+ * this user code:
*
* movq $stuck_here, %rcx
* pushfq
jmp entry_SYSCALL64_slow_path
1:
- /* Called from C */
- jmp *%rax /* called from C */
+ jmp *%rax /* Called from C */
END(stub_ptregs_64)
.macro ptregs_stub func
#define __SYSCALL_64(nr, sym, qual) __SYSCALL_64_QUAL_##qual(sym)
#include <asm/syscalls_64.h>
+/*
+ * %rdi: prev task
+ * %rsi: next task
+ */
+ENTRY(__switch_to_asm)
+ /*
+ * Save callee-saved registers
+ * This must match the order in inactive_task_frame
+ */
+ pushq %rbp
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+ /* switch stack */
+ movq %rsp, TASK_threadsp(%rdi)
+ movq TASK_threadsp(%rsi), %rsp
+
+#ifdef CONFIG_CC_STACKPROTECTOR
+ movq TASK_stack_canary(%rsi), %rbx
+ movq %rbx, PER_CPU_VAR(irq_stack_union)+stack_canary_offset
+#endif
+
+ /* restore callee-saved registers */
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbx
+ popq %rbp
+
+ jmp __switch_to
+END(__switch_to_asm)
+
/*
* A newly forked process directly context switches into this address.
*
- * rdi: prev task we switched from
+ * rax: prev task we switched from
+ * rbx: kernel thread func (NULL for user thread)
+ * r12: kernel thread arg
*/
ENTRY(ret_from_fork)
- LOCK ; btr $TIF_FORK, TI_flags(%r8)
-
+ movq %rax, %rdi
call schedule_tail /* rdi: 'prev' task parameter */
- testb $3, CS(%rsp) /* from kernel_thread? */
- jnz 1f
+ testq %rbx, %rbx /* from kernel_thread? */
+ jnz 1f /* kernel threads are uncommon */
- /*
- * We came from kernel_thread. This code path is quite twisted, and
- * someone should clean it up.
- *
- * copy_thread_tls stashes the function pointer in RBX and the
- * parameter to be passed in RBP. The called function is permitted
- * to call do_execve and thereby jump to user mode.
- */
- movq RBP(%rsp), %rdi
- call *RBX(%rsp)
- movl $0, RAX(%rsp)
-
- /*
- * Fall through as though we're exiting a syscall. This makes a
- * twisted sort of sense if we just called do_execve.
- */
-
-1:
+2:
movq %rsp, %rdi
call syscall_return_slowpath /* returns with IRQs disabled */
TRACE_IRQS_ON /* user mode is traced as IRQS on */
SWAPGS
jmp restore_regs_and_iret
+
+1:
+ /* kernel thread */
+ movq %r12, %rdi
+ call *%rbx
+ /*
+ * A kernel thread is allowed to return here after successfully
+ * calling do_execve(). Exit to userspace to complete the execve()
+ * syscall.
+ */
+ movq $0, RAX(%rsp)
+ jmp 2b
END(ret_from_fork)
/*
#ifdef CONFIG_X86_ESPFIX64
native_irq_return_ldt:
- pushq %rax
- pushq %rdi
+ /*
+ * We are running with user GSBASE. All GPRs contain their user
+ * values. We have a percpu ESPFIX stack that is eight slots
+ * long (see ESPFIX_STACK_SIZE). espfix_waddr points to the bottom
+ * of the ESPFIX stack.
+ *
+ * We clobber RAX and RDI in this code. We stash RDI on the
+ * normal stack and RAX on the ESPFIX stack.
+ *
+ * The ESPFIX stack layout we set up looks like this:
+ *
+ * --- top of ESPFIX stack ---
+ * SS
+ * RSP
+ * RFLAGS
+ * CS
+ * RIP <-- RSP points here when we're done
+ * RAX <-- espfix_waddr points here
+ * --- bottom of ESPFIX stack ---
+ */
+
+ pushq %rdi /* Stash user RDI */
SWAPGS
movq PER_CPU_VAR(espfix_waddr), %rdi
- movq %rax, (0*8)(%rdi) /* RAX */
- movq (2*8)(%rsp), %rax /* RIP */
+ movq %rax, (0*8)(%rdi) /* user RAX */
+ movq (1*8)(%rsp), %rax /* user RIP */
movq %rax, (1*8)(%rdi)
- movq (3*8)(%rsp), %rax /* CS */
+ movq (2*8)(%rsp), %rax /* user CS */
movq %rax, (2*8)(%rdi)
- movq (4*8)(%rsp), %rax /* RFLAGS */
+ movq (3*8)(%rsp), %rax /* user RFLAGS */
movq %rax, (3*8)(%rdi)
- movq (6*8)(%rsp), %rax /* SS */
+ movq (5*8)(%rsp), %rax /* user SS */
movq %rax, (5*8)(%rdi)
- movq (5*8)(%rsp), %rax /* RSP */
+ movq (4*8)(%rsp), %rax /* user RSP */
movq %rax, (4*8)(%rdi)
- andl $0xffff0000, %eax
- popq %rdi
+ /* Now RAX == RSP. */
+
+ andl $0xffff0000, %eax /* RAX = (RSP & 0xffff0000) */
+ popq %rdi /* Restore user RDI */
+
+ /*
+ * espfix_stack[31:16] == 0. The page tables are set up such that
+ * (espfix_stack | (X & 0xffff0000)) points to a read-only alias of
+ * espfix_waddr for any X. That is, there are 65536 RO aliases of
+ * the same page. Set up RSP so that RSP[31:16] contains the
+ * respective 16 bits of the /userspace/ RSP and RSP nonetheless
+ * still points to an RO alias of the ESPFIX stack.
+ */
orq PER_CPU_VAR(espfix_stack), %rax
SWAPGS
movq %rax, %rsp
- popq %rax
+
+ /*
+ * At this point, we cannot write to the stack any more, but we can
+ * still read.
+ */
+ popq %rax /* Restore user RAX */
+
+ /*
+ * RSP now points to an ordinary IRET frame, except that the page
+ * is read-only and RSP[31:16] are preloaded with the userspace
+ * values. We can now IRET back to userspace.
+ */
jmp native_irq_return_iret
#endif
END(common_interrupt)
.endm
#endif
+/* Make sure APIC interrupt handlers end up in the irqentry section: */
+#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
+# define PUSH_SECTION_IRQENTRY .pushsection .irqentry.text, "ax"
+# define POP_SECTION_IRQENTRY .popsection
+#else
+# define PUSH_SECTION_IRQENTRY
+# define POP_SECTION_IRQENTRY
+#endif
+
.macro apicinterrupt num sym do_sym
+PUSH_SECTION_IRQENTRY
apicinterrupt3 \num \sym \do_sym
trace_apicinterrupt \num \sym
+POP_SECTION_IRQENTRY
.endm
#ifdef CONFIG_SMP
testb $3, CS+8(%rsp)
jz .Lerror_kernelspace
-.Lerror_entry_from_usermode_swapgs:
/*
* We entered from user mode or we're pretending to have entered
* from user mode due to an IRET fault.
* gsbase and proceed. We'll fix up the exception and land in
* .Lgs_change's error handler with kernel gsbase.
*/
- jmp .Lerror_entry_from_usermode_swapgs
+ SWAPGS
+ jmp .Lerror_entry_done
.Lbstep_iret:
/* Fix truncated RIP */
/*
- * On entry, EBS is a "return to kernel mode" flag:
+ * On entry, EBX is a "return to kernel mode" flag:
* 1: already in kernel mode, don't need SWAPGS
* 0: user gsbase is loaded, we need SWAPGS and standard preparation for return to usermode
*/
mov $-ENOSYS, %eax
sysret
END(ignore_sysret)
+
+ENTRY(rewind_stack_do_exit)
+ /* Prevent any naive code from trying to unwind to our caller. */
+ xorl %ebp, %ebp
+
+ movq PER_CPU_VAR(cpu_current_top_of_stack), %rax
+ leaq -TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%rax), %rsp
+
+ call do_exit
+1: jmp 1b
+END(rewind_stack_do_exit)
depends on DEBUG_KERNEL && STACKTRACE_SUPPORT
select DEBUG_FS
select STACKTRACE
+ select STACKDEPOT
select PAGE_EXTENSION
help
This keeps track of what call chain is the owner of a page, may
config DEBUG_FS
bool "Debug Filesystem"
+ select SRCU
help
debugfs is a virtual file system that kernel developers use to put
debugging files into. Enable this option to be able to read and
a larger kernel).
- Run the section mismatch analysis for each module/built-in.o file.
When we run the section mismatch analysis on vmlinux.o, we
- lose valueble information about where the mismatch was
+ lose valuable information about where the mismatch was
introduced.
Running the analysis for each module/built-in.o file
tells where the mismatch happens much closer to the
bool "Code coverage for fuzzing"
depends on ARCH_HAS_KCOV
select DEBUG_FS
+ select GCC_PLUGINS if !COMPILE_TEST
+ select GCC_PLUGIN_SANCOV if !COMPILE_TEST
help
KCOV exposes kernel code coverage information in a form suitable
for coverage-guided fuzzing (randomized testing).
For more details, see Documentation/kcov.txt.
+config KCOV_INSTRUMENT_ALL
+ bool "Instrument all code by default"
+ depends on KCOV
+ default y if KCOV
+ help
+ If you are doing generic system call fuzzing (like e.g. syzkaller),
+ then you will want to instrument the whole kernel and you should
+ say y here. If you are doing more targeted fuzzing (like e.g.
+ filesystem fuzzing with AFL) then you will want to enable coverage
+ for more specific subsets of files, and should say n here.
+
config DEBUG_SHIRQ
bool "Debug shared IRQ handlers"
depends on DEBUG_KERNEL
help
Say Y here to enable the kernel to detect "hung tasks",
which are bugs that cause the task to be stuck in
- uninterruptible "D" state indefinitiley.
+ uninterruptible "D" state indefinitely.
When a hung task is detected, the kernel will print the
current stack trace (which you should report), but the
Say M if you want the RCU performance tests to build as a module.
Say N if you are unsure.
-config RCU_PERF_TEST_RUNNABLE
- bool "performance tests for RCU runnable by default"
- depends on RCU_PERF_TEST = y
- default n
- help
- This option provides a way to build the RCU performance tests
- directly into the kernel without them starting up at boot time.
- You can use /sys/module to manually override this setting.
- This /proc file is available only when the RCU performance
- tests have been built into the kernel.
-
- Say Y here if you want the RCU performance tests to start during
- boot (you probably don't).
- Say N here if you want the RCU performance tests to start only
- after being manually enabled via /sys/module.
-
config RCU_TORTURE_TEST
tristate "torture tests for RCU"
depends on DEBUG_KERNEL
Say M if you want the RCU torture tests to build as a module.
Say N if you are unsure.
-config RCU_TORTURE_TEST_RUNNABLE
- bool "torture tests for RCU runnable by default"
- depends on RCU_TORTURE_TEST = y
- default n
- help
- This option provides a way to build the RCU torture tests
- directly into the kernel without them starting up at boot
- time. You can use /proc/sys/kernel/rcutorture_runnable
- to manually override this setting. This /proc file is
- available only when the RCU torture tests have been built
- into the kernel.
-
- Say Y here if you want the RCU torture tests to start during
- boot (you probably don't).
- Say N here if you want the RCU torture tests to start only
- after being manually enabled via /proc.
-
config RCU_TORTURE_TEST_SLOW_PREINIT
bool "Slow down RCU grace-period pre-initialization to expose races"
depends on RCU_TORTURE_TEST
Enable this option if you want to use the LatencyTOP tool
to find out which userspace is blocking on what kernel operations.
-config ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS
- bool
-
-config DEBUG_STRICT_USER_COPY_CHECKS
- bool "Strict user copy size checks"
- depends on ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS
- depends on DEBUG_KERNEL && !TRACE_BRANCH_PROFILING
- help
- Enabling this option turns a certain set of sanity checks for user
- copy operations into compile time failures.
-
- The copy_from_user() etc checks are there to help test if there
- are sufficient security checks on the length argument of
- the copy operation, by having gcc prove that the argument is
- within bounds.
-
- If unsure, say N.
-
source kernel/trace/Kconfig
menu "Runtime Testing"
If unsure, say N.
+config TEST_UUID
+ tristate "Test functions located in the uuid module at runtime"
+
config TEST_RHASHTABLE
tristate "Perform selftest on resizable hash table"
default n
If unsure, say N.
+config TEST_HASH
+ tristate "Perform selftest on hash functions"
+ default n
+ help
+ Enable this option to test the kernel's integer (<linux/hash,h>)
+ and string (<linux/stringhash.h>) hash functions on boot
+ (or module load).
+
+ This is intended to help people writing architecture-specific
+ optimized versions. If unsure, say N.
+
endmenu # runtime tests
config PROVIDE_OHCI1394_DMA_INIT
* This function returns a pointer to a dentry if it succeeds. This
* pointer must be passed to the securityfs_remove() function when the file is
* to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here). If an error occurs, %NULL will be returned.
+ * you are responsible here). If an error occurs, the function will return
+ * the error value (via ERR_PTR).
*
* If securityfs is not enabled in the kernel, the value %-ENODEV is
- * returned. It is not wise to check for this value, but rather, check for
- * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
- * code.
+ * returned.
*/
struct dentry *securityfs_create_dir(const char *name, struct dentry *parent)
{
*/
void securityfs_remove(struct dentry *dentry)
{
- struct dentry *parent;
+ struct inode *dir;
if (!dentry || IS_ERR(dentry))
return;
- parent = dentry->d_parent;
- if (!parent || d_really_is_negative(parent))
- return;
-
- inode_lock(d_inode(parent));
+ dir = d_inode(dentry->d_parent);
+ inode_lock(dir);
if (simple_positive(dentry)) {
if (d_is_dir(dentry))
- simple_rmdir(d_inode(parent), dentry);
+ simple_rmdir(dir, dentry);
else
- simple_unlink(d_inode(parent), dentry);
+ simple_unlink(dir, dentry);
dput(dentry);
}
- inode_unlock(d_inode(parent));
+ inode_unlock(dir);
simple_release_fs(&mount, &mount_count);
}
EXPORT_SYMBOL_GPL(securityfs_remove);