The following commit has been merged in the master branch: commit f62988488c64c44e8808735cb1d0347861866cd5 Merge: 3c359c787c6c456bbcc33c48bd7a5e505ab4239e 3e6743e28b9b43d37ced234bdf8e19955d0216f8 Author: Stephen Rothwell sfr@canb.auug.org.au Date: Tue Dec 13 14:33:00 2022 +1100
next-20221208/random
diff --combined Documentation/admin-guide/kernel-parameters.txt index 8656fbb45a39,78493797460f..6cfa6e3996cf --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@@ -703,17 -703,6 +703,17 @@@ condev= [HW,S390] console device conmode=
+ con3215_drop= [S390] 3215 console drop mode. + Format: y|n|Y|N|1|0 + When set to true, drop data on the 3215 console when + the console buffer is full. In this case the + operator using a 3270 terminal emulator (for example + x3270) does not have to enter the clear key for the + console output to advance and the kernel to continue. + This leads to a much faster boot time when a 3270 + terminal emulator is active. If no 3270 terminal + emulator is used, this parameter has no effect. + console= [KNL] Output console device and options.
tty<n> Use the virtual console device <n>. @@@ -842,7 -831,7 +842,7 @@@ memory region [offset, offset + size] for that kernel image. If '@offset' is omitted, then a suitable offset is selected automatically. - [KNL, X86-64] Select a region under 4G first, and + [KNL, X86-64, ARM64] Select a region under 4G first, and fall back to reserve region above 4G when '@offset' hasn't been specified. See Documentation/admin-guide/kdump/kdump.rst for further details. @@@ -862,23 -851,26 +862,23 @@@ available. It will be ignored if crashkernel=X is specified. crashkernel=size[KMG],low - [KNL, X86-64] range under 4G. When crashkernel=X,high + [KNL, X86-64, ARM64] range under 4G. When crashkernel=X,high is passed, kernel could allocate physical memory region above 4G, that cause second kernel crash on system that require some amount of low memory, e.g. swiotlb requires at least 64M+32K low memory, also enough extra low memory is needed to make sure DMA buffers for 32-bit devices won't run out. Kernel would try to allocate - at least 256M below 4G automatically. + default size of memory below 4G automatically. The default + size is platform dependent. + --> x86: max(swiotlb_size_or_default() + 8MiB, 256MiB) + --> arm64: 128MiB This one lets the user specify own low range under 4G for second kernel instead. 0: to disable low allocation. It will be ignored when crashkernel=X,high is not used or memory reserved is below 4G.
- [KNL, ARM64] range in low memory. - This one lets the user specify a low range in the - DMA zone for the crash dump kernel. - It will be ignored when crashkernel=X,high is not used - or memory reserved is located in the DMA zones. - cryptomgr.notests [KNL] Disable crypto self-tests
@@@ -1050,11 -1042,6 +1050,11 @@@ them frequently to increase the rate of SLB faults on kernel addresses.
+ stress_hpt [PPC] + Limits the number of kernel HPT entries in the hash + page table to increase the rate of hash page table + faults on kernel addresses. + disable= [IPV6] See Documentation/networking/ipv6.rst.
@@@ -2313,13 -2300,7 +2313,13 @@@ Provide an override to the IOAPIC-ID<->DEVICE-ID mapping provided in the IVRS ACPI table. By default, PCI segment is 0, and can be omitted. - For example: + + For example, to map IOAPIC-ID decimal 10 to + PCI segment 0x1 and PCI device 00:14.0, + write the parameter as: + ivrs_ioapic=10@0001:00:14.0 + + Deprecated formats: * To map IOAPIC-ID decimal 10 to PCI device 00:14.0 write the parameter as: ivrs_ioapic[10]=00:14.0 @@@ -2331,13 -2312,7 +2331,13 @@@ Provide an override to the HPET-ID<->DEVICE-ID mapping provided in the IVRS ACPI table. By default, PCI segment is 0, and can be omitted. - For example: + + For example, to map HPET-ID decimal 10 to + PCI segment 0x1 and PCI device 00:14.0, + write the parameter as: + ivrs_hpet=10@0001:00:14.0 + + Deprecated formats: * To map HPET-ID decimal 0 to PCI device 00:14.0 write the parameter as: ivrs_hpet[0]=00:14.0 @@@ -2348,20 -2323,15 +2348,20 @@@ ivrs_acpihid [HW,X86-64] Provide an override to the ACPI-HID:UID<->DEVICE-ID mapping provided in the IVRS ACPI table. + By default, PCI segment is 0, and can be omitted.
For example, to map UART-HID:UID AMD0020:0 to PCI segment 0x1 and PCI device ID 00:14.5, write the parameter as: - ivrs_acpihid[0001:00:14.5]=AMD0020:0 + ivrs_acpihid=AMD0020:0@0001:00:14.5
- By default, PCI segment is 0, and can be omitted. - For example, PCI device 00:14.5 write the parameter as: + Deprecated formats: + * To map UART-HID:UID AMD0020:0 to PCI segment is 0, + PCI device ID 00:14.5, write the parameter as: ivrs_acpihid[00:14.5]=AMD0020:0 + * To map UART-HID:UID AMD0020:0 to PCI segment 0x1 and + PCI device ID 00:14.5, write the parameter as: + ivrs_acpihid[0001:00:14.5]=AMD0020:0
js= [HW,JOY] Analog joystick See Documentation/input/joydev/joystick.rst. @@@ -3807,15 -3777,12 +3807,15 @@@ shutdown the other cpus. Instead use the REBOOT_VECTOR irq.
- nomodeset Disable kernel modesetting. DRM drivers will not perform - display-mode changes or accelerated rendering. Only the - system framebuffer will be available for use if this was - set-up by the firmware or boot loader. + nomodeset Disable kernel modesetting. Most systems' firmware + sets up a display mode and provides framebuffer memory + for output. With nomodeset, DRM and fbdev drivers will + not load if they could possibly displace the pre- + initialized output. Only the system framebuffer will + be available for use. The respective drivers will not + perform display-mode changes or accelerated rendering.
- Useful as fallback, or for testing and debugging. + Useful as error fallback, or for testing and debugging.
nomodule Disable module load
@@@ -4599,17 -4566,15 +4599,15 @@@
ramdisk_start= [RAM] RAM disk image start address
- random.trust_cpu={on,off} - [KNL] Enable or disable trusting the use of the - CPU's random number generator (if available) to - fully seed the kernel's CRNG. Default is controlled - by CONFIG_RANDOM_TRUST_CPU. - - random.trust_bootloader={on,off} - [KNL] Enable or disable trusting the use of a - seed passed by the bootloader (if available) to - fully seed the kernel's CRNG. Default is controlled - by CONFIG_RANDOM_TRUST_BOOTLOADER. + random.trust_cpu=off + [KNL] Disable trusting the use of the CPU's + random number generator (if available) to + initialize the kernel's RNG. + + random.trust_bootloader=off + [KNL] Disable trusting the use of the a seed + passed by the bootloader (if available) to + initialize the kernel's RNG.
randomize_kstack_offset= [KNL] Enable or disable kernel stack offset @@@ -6290,25 -6255,6 +6288,25 @@@ See also Documentation/trace/ftrace.rst "trace options" section.
+ trace_trigger=[trigger-list] + [FTRACE] Add a event trigger on specific events. + Set a trigger on top of a specific event, with an optional + filter. + + The format is is "trace_trigger=<event>.<trigger>[ if <filter>],..." + Where more than one trigger may be specified that are comma deliminated. + + For example: + + trace_trigger="sched_switch.stacktrace if prev_state == 2" + + The above will enable the "stacktrace" trigger on the "sched_switch" + event but only trigger it if the "prev_state" of the "sched_switch" + event is "2" (TASK_UNINTERUPTIBLE). + + See also "Event triggers" in Documentation/trace/events.rst + + traceoff_on_warning [FTRACE] enable this option to disable tracing when a warning is hit. This turns off "tracing_on". Tracing can @@@ -7011,14 -6957,3 +7009,14 @@@ memory, and other data can't be written using xmon commands. off xmon is disabled. + + amd_pstate= [X86] + disable + Do not enable amd_pstate as the default + scaling driver for the supported processors + passive + Use amd_pstate as a scaling driver, driver requests a + desired performance on this abstract scale and the power + management firmware translates the requests into actual + hardware states (core frequency, data fabric and memory + clocks etc.) diff --combined arch/arm64/kernel/process.c index 19cd05eea3f0,1395a1638427..269ac1c25ae2 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@@ -331,8 -331,6 +331,8 @@@ int arch_dup_task_struct(struct task_st clear_tsk_thread_flag(dst, TIF_SME); }
+ dst->thread.fp_type = FP_STATE_FPSIMD; + /* clear any pending asynchronous tag fault raised by the parent */ clear_tsk_thread_flag(dst, TIF_MTE_ASYNC_FAULT);
@@@ -593,7 -591,7 +593,7 @@@ unsigned long __get_wchan(struct task_s unsigned long arch_align_stack(unsigned long sp) { if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) - sp -= prandom_u32_max(PAGE_SIZE); + sp -= get_random_u32_below(PAGE_SIZE); return sp & ~0xf; }
diff --combined arch/loongarch/kernel/process.c index 790cc14c5f06,77952d814bad..c583b1ef1f44 --- a/arch/loongarch/kernel/process.c +++ b/arch/loongarch/kernel/process.c @@@ -47,12 -47,6 +47,12 @@@ #include <asm/unwind.h> #include <asm/vdso.h>
+#ifdef CONFIG_STACKPROTECTOR +#include <linux/stackprotector.h> +unsigned long __stack_chk_guard __read_mostly; +EXPORT_SYMBOL(__stack_chk_guard); +#endif + /* * Idle related variables and functions */ @@@ -158,7 -152,7 +158,7 @@@ int copy_thread(struct task_struct *p, childregs->csr_crmd = p->thread.csr_crmd; childregs->csr_prmd = p->thread.csr_prmd; childregs->csr_ecfg = p->thread.csr_ecfg; - return 0; + goto out; }
/* user thread */ @@@ -177,15 -171,14 +177,15 @@@ */ childregs->csr_euen = 0;
+ if (clone_flags & CLONE_SETTLS) + childregs->regs[2] = tls; + +out: clear_tsk_thread_flag(p, TIF_USEDFPU); clear_tsk_thread_flag(p, TIF_USEDSIMD); clear_tsk_thread_flag(p, TIF_LSX_CTX_LIVE); clear_tsk_thread_flag(p, TIF_LASX_CTX_LIVE);
- if (clone_flags & CLONE_SETTLS) - childregs->regs[2] = tls; - return 0; }
@@@ -300,7 -293,7 +300,7 @@@ unsigned long stack_top(void unsigned long arch_align_stack(unsigned long sp) { if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) - sp -= prandom_u32_max(PAGE_SIZE); + sp -= get_random_u32_below(PAGE_SIZE);
return sp & STACK_ALIGN; } diff --combined arch/powerpc/kernel/process.c index edb46d0806ef,fcf604370c66..c22cc234672f --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@@ -862,8 -862,10 +862,8 @@@ static inline int set_breakpoint_8xx(st return 0; }
-void __set_breakpoint(int nr, struct arch_hw_breakpoint *brk) +static void set_hw_breakpoint(int nr, struct arch_hw_breakpoint *brk) { - memcpy(this_cpu_ptr(¤t_brk[nr]), brk, sizeof(*brk)); - if (dawr_enabled()) // Power8 or later set_dawr(nr, brk); @@@ -877,12 -879,6 +877,12 @@@ WARN_ON_ONCE(1); }
+void __set_breakpoint(int nr, struct arch_hw_breakpoint *brk) +{ + memcpy(this_cpu_ptr(¤t_brk[nr]), brk, sizeof(*brk)); + set_hw_breakpoint(nr, brk); +} + /* Check if we have DAWR or DABR hardware */ bool ppc_breakpoint_available(void) { @@@ -895,34 -891,6 +895,34 @@@ } EXPORT_SYMBOL_GPL(ppc_breakpoint_available);
+/* Disable the breakpoint in hardware without touching current_brk[] */ +void suspend_breakpoints(void) +{ + struct arch_hw_breakpoint brk = {0}; + int i; + + if (!ppc_breakpoint_available()) + return; + + for (i = 0; i < nr_wp_slots(); i++) + set_hw_breakpoint(i, &brk); +} + +/* + * Re-enable breakpoints suspended by suspend_breakpoints() in hardware + * from current_brk[] + */ +void restore_breakpoints(void) +{ + int i; + + if (!ppc_breakpoint_available()) + return; + + for (i = 0; i < nr_wp_slots(); i++) + set_hw_breakpoint(i, this_cpu_ptr(¤t_brk[i])); +} + #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
static inline bool tm_enabled(struct task_struct *tsk) @@@ -1391,7 -1359,7 +1391,7 @@@ static void show_instructions(struct pt unsigned long nip = regs->nip; unsigned long pc = regs->nip - (NR_INSN_TO_PRINT * 3 / 4 * sizeof(int));
- printk("Instruction dump:"); + printk("Code: ");
/* * If we were executing with the MMU off for instructions, adjust pc @@@ -1405,6 -1373,9 +1405,6 @@@ for (i = 0; i < NR_INSN_TO_PRINT; i++) { int instr;
- if (!(i % 8)) - pr_cont("\n"); - if (!__kernel_text_address(pc) || get_kernel_nofault(instr, (const void *)pc)) { pr_cont("XXXXXXXX "); @@@ -1755,17 -1726,13 +1755,17 @@@ int copy_thread(struct task_struct *p,
klp_init_thread_info(p);
+ /* Create initial stack frame. */ + sp -= STACK_USER_INT_FRAME_SIZE; + *(unsigned long *)(sp + STACK_INT_FRAME_MARKER) = STACK_FRAME_REGS_MARKER; + /* Copy registers */ - sp -= sizeof(struct pt_regs); - childregs = (struct pt_regs *) sp; + childregs = (struct pt_regs *)(sp + STACK_INT_FRAME_REGS); if (unlikely(args->fn)) { /* kernel thread */ + ((unsigned long *)sp)[0] = 0; memset(childregs, 0, sizeof(struct pt_regs)); - childregs->gpr[1] = sp + sizeof(struct pt_regs); + childregs->gpr[1] = sp + STACK_USER_INT_FRAME_SIZE; /* function */ if (args->fn) childregs->gpr[14] = ppc_function_entry((void *)args->fn); @@@ -1783,7 -1750,6 +1783,7 @@@ *childregs = *regs; if (usp) childregs->gpr[1] = usp; + ((unsigned long *)sp)[0] = childregs->gpr[1]; p->thread.regs = childregs; /* 64s sets this in ret_from_fork */ if (!IS_ENABLED(CONFIG_PPC_BOOK3S_64)) @@@ -1801,6 -1767,7 +1801,6 @@@ f = ret_from_fork; } childregs->msr &= ~(MSR_FP|MSR_VEC|MSR_VSX); - sp -= STACK_FRAME_OVERHEAD;
/* * The way this works is that at some point in the future @@@ -1810,12 -1777,11 +1810,12 @@@ * do some house keeping and then return from the fork or clone * system call, using the stack frame created above. */ - ((unsigned long *)sp)[0] = 0; - sp -= sizeof(struct pt_regs); - kregs = (struct pt_regs *) sp; - sp -= STACK_FRAME_OVERHEAD; + ((unsigned long *)sp)[STACK_FRAME_LR_SAVE] = (unsigned long)f; + sp -= STACK_SWITCH_FRAME_SIZE; + ((unsigned long *)sp)[0] = sp + STACK_SWITCH_FRAME_SIZE; + kregs = (struct pt_regs *)(sp + STACK_SWITCH_FRAME_REGS); p->thread.ksp = sp; + #ifdef CONFIG_HAVE_HW_BREAKPOINT for (i = 0; i < nr_wp_slots(); i++) p->thread.ptrace_bps[i] = NULL; @@@ -2157,12 -2123,9 +2157,12 @@@ static inline int valid_emergency_stack return 0; }
- -int validate_sp(unsigned long sp, struct task_struct *p, - unsigned long nbytes) +/* + * validate the stack frame of a particular minimum size, used for when we are + * looking at a certain object in the stack beyond the minimum. + */ +int validate_sp_size(unsigned long sp, struct task_struct *p, + unsigned long nbytes) { unsigned long stack_page = (unsigned long)task_stack_page(p);
@@@ -2178,10 -2141,7 +2178,10 @@@ return valid_emergency_stack(sp, p, nbytes); }
-EXPORT_SYMBOL(validate_sp); +int validate_sp(unsigned long sp, struct task_struct *p) +{ + return validate_sp_size(sp, p, STACK_FRAME_MIN_SIZE); +}
static unsigned long ___get_wchan(struct task_struct *p) { @@@ -2189,12 -2149,13 +2189,12 @@@ int count = 0;
sp = p->thread.ksp; - if (!validate_sp(sp, p, STACK_FRAME_OVERHEAD)) + if (!validate_sp(sp, p)) return 0;
do { sp = READ_ONCE_NOCHECK(*(unsigned long *)sp); - if (!validate_sp(sp, p, STACK_FRAME_OVERHEAD) || - task_is_running(p)) + if (!validate_sp(sp, p) || task_is_running(p)) return 0; if (count > 0) { ip = READ_ONCE_NOCHECK(((unsigned long *)sp)[STACK_FRAME_LR_SAVE]); @@@ -2248,7 -2209,7 +2248,7 @@@ void __no_sanitize_address show_stack(s lr = 0; printk("%sCall Trace:\n", loglvl); do { - if (!validate_sp(sp, tsk, STACK_FRAME_OVERHEAD)) + if (!validate_sp(sp, tsk)) break;
stack = (unsigned long *) sp; @@@ -2269,16 -2230,12 +2269,16 @@@
/* * See if this is an exception frame. - * We look for the "regshere" marker in the current frame. + * We look for the "regs" marker in the current frame. + * + * STACK_SWITCH_FRAME_SIZE being the smallest frame that + * could hold a pt_regs, if that does not fit then it can't + * have regs. */ - if (validate_sp(sp, tsk, STACK_FRAME_WITH_PT_REGS) - && stack[STACK_FRAME_MARKER] == STACK_FRAME_REGS_MARKER) { + if (validate_sp_size(sp, tsk, STACK_SWITCH_FRAME_SIZE) + && stack[STACK_INT_FRAME_MARKER_LONGS] == STACK_FRAME_REGS_MARKER) { struct pt_regs *regs = (struct pt_regs *) - (sp + STACK_FRAME_OVERHEAD); + (sp + STACK_INT_FRAME_REGS);
lr = regs->link; printk("%s--- interrupt: %lx at %pS\n", @@@ -2346,6 -2303,6 +2346,6 @@@ void notrace __ppc64_runlatch_off(void unsigned long arch_align_stack(unsigned long sp) { if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) - sp -= prandom_u32_max(PAGE_SIZE); + sp -= get_random_u32_below(PAGE_SIZE); return sp & ~0xf; } diff --combined arch/s390/kernel/vdso.c index d6df7169c01f,119328e1e2b3..ff7bf4432229 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@@ -44,6 -44,21 +44,6 @@@ struct vdso_data *arch_get_vdso_data(vo return (struct vdso_data *)(vvar_page); }
-static struct page *find_timens_vvar_page(struct vm_area_struct *vma) -{ - if (likely(vma->vm_mm == current->mm)) - return current->nsproxy->time_ns->vvar_page; - /* - * VM_PFNMAP | VM_IO protect .fault() handler from being called - * through interfaces like /proc/$pid/mem or - * process_vm_{readv,writev}() as long as there's no .access() - * in special_mapping_vmops(). - * For more details check_vma_flags() and __access_remote_vm() - */ - WARN(1, "vvar_page accessed remotely"); - return NULL; -} - /* * The VVAR page layout depends on whether a task belongs to the root or * non-root time namespace. Whenever a task changes its namespace, the VVAR @@@ -69,6 -84,11 +69,6 @@@ int vdso_join_timens(struct task_struc mmap_read_unlock(mm); return 0; } -#else -static inline struct page *find_timens_vvar_page(struct vm_area_struct *vma) -{ - return NULL; -} #endif
static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, @@@ -207,7 -227,7 +207,7 @@@ static unsigned long vdso_addr(unsigne end -= len;
if (end > start) { - offset = prandom_u32_max(((end - start) >> PAGE_SHIFT) + 1); + offset = get_random_u32_below(((end - start) >> PAGE_SHIFT) + 1); addr = start + (offset << PAGE_SHIFT); } else { addr = start; diff --combined arch/um/kernel/process.c index dc1c1aeade90,e38f41444721..47830ade35ed --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@@ -33,7 -33,6 +33,7 @@@ #include <skas.h> #include <registers.h> #include <linux/time-internal.h> +#include <linux/elfcore.h>
/* * This is a per-cpu array. A processor only modifies its entry and it only @@@ -357,7 -356,7 +357,7 @@@ int singlestepping(void * t unsigned long arch_align_stack(unsigned long sp) { if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) - sp -= prandom_u32_max(8192); + sp -= get_random_u32_below(8192); return sp & ~0xf; } #endif @@@ -394,7 -393,7 +394,7 @@@ unsigned long __get_wchan(struct task_s return 0; }
-int elf_core_copy_fpregs(struct task_struct *t, elf_fpregset_t *fpu) +int elf_core_copy_task_fpregs(struct task_struct *t, elf_fpregset_t *fpu) { int cpu = current_thread_info()->cpu;
diff --combined arch/x86/entry/vdso/vma.c index 3c6b488b2f11,d45c5fcfeac2..b8f3f9b9e53c --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@@ -98,6 -98,24 +98,6 @@@ static int vdso_mremap(const struct vm_ }
#ifdef CONFIG_TIME_NS -static struct page *find_timens_vvar_page(struct vm_area_struct *vma) -{ - if (likely(vma->vm_mm == current->mm)) - return current->nsproxy->time_ns->vvar_page; - - /* - * VM_PFNMAP | VM_IO protect .fault() handler from being called - * through interfaces like /proc/$pid/mem or - * process_vm_{readv,writev}() as long as there's no .access() - * in special_mapping_vmops(). - * For more details check_vma_flags() and __access_remote_vm() - */ - - WARN(1, "vvar_page accessed remotely"); - - return NULL; -} - /* * The vvar page layout depends on whether a task belongs to the root or * non-root time namespace. Whenever a task changes its namespace, the VVAR @@@ -122,6 -140,11 +122,6 @@@ int vdso_join_timens(struct task_struc
return 0; } -#else -static inline struct page *find_timens_vvar_page(struct vm_area_struct *vma) -{ - return NULL; -} #endif
static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, @@@ -187,10 -210,11 +187,10 @@@ pgprot_decrypted(vma->vm_page_prot)); } } else if (sym_offset == image->sym_hvclock_page) { - struct ms_hyperv_tsc_page *tsc_pg = hv_get_tsc_page(); + pfn = hv_get_tsc_pfn();
- if (tsc_pg && vclock_was_used(VDSO_CLOCKMODE_HVCLOCK)) - return vmf_insert_pfn(vma, vmf->address, - virt_to_phys(tsc_pg) >> PAGE_SHIFT); + if (pfn && vclock_was_used(VDSO_CLOCKMODE_HVCLOCK)) + return vmf_insert_pfn(vma, vmf->address, pfn); } else if (sym_offset == image->sym_timens_page) { struct page *timens_page = find_timens_vvar_page(vma);
@@@ -303,7 -327,7 +303,7 @@@ static unsigned long vdso_addr(unsigne end -= len;
if (end > start) { - offset = prandom_u32_max(((end - start) >> PAGE_SHIFT) + 1); + offset = get_random_u32_below(((end - start) >> PAGE_SHIFT) + 1); addr = start + (offset << PAGE_SHIFT); } else { addr = start; diff --combined arch/x86/kernel/cpu/common.c index 73cc546e024d,3f66dd03c091..9cfca3d7d0e2 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@@ -22,9 -22,9 +22,9 @@@ #include <linux/io.h> #include <linux/syscore_ops.h> #include <linux/pgtable.h> + #include <linux/stackprotector.h>
#include <asm/cmdline.h> - #include <asm/stackprotector.h> #include <asm/perf_event.h> #include <asm/mmu_context.h> #include <asm/doublefault.h> @@@ -52,7 -52,6 +52,7 @@@ #include <asm/cpu.h> #include <asm/mce.h> #include <asm/msr.h> +#include <asm/cacheinfo.h> #include <asm/memtype.h> #include <asm/microcode.h> #include <asm/microcode_intel.h> @@@ -610,7 -609,6 +610,7 @@@ static __always_inline void setup_cet(s
if (!ibt_selftest()) { pr_err("IBT selftest: Failed!\n"); + wrmsrl(MSR_IA32_S_CET, 0); setup_clear_cpu_cap(X86_FEATURE_IBT); return; } @@@ -703,6 -701,16 +703,6 @@@ static const char *table_lookup_model(s __u32 cpu_caps_cleared[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long)); __u32 cpu_caps_set[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long));
-void load_percpu_segment(int cpu) -{ -#ifdef CONFIG_X86_32 - loadsegment(fs, __KERNEL_PERCPU); -#else - __loadsegment_simple(gs, 0); - wrmsrl(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu)); -#endif -} - #ifdef CONFIG_X86_32 /* The 32-bit entry code needs to find cpu_entry_area. */ DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); @@@ -730,45 -738,16 +730,45 @@@ void load_fixmap_gdt(int cpu } EXPORT_SYMBOL_GPL(load_fixmap_gdt);
-/* - * Current gdt points %fs at the "master" per-cpu area: after this, - * it's on the real one. +/** + * switch_gdt_and_percpu_base - Switch to direct GDT and runtime per CPU base + * @cpu: The CPU number for which this is invoked + * + * Invoked during early boot to switch from early GDT and early per CPU to + * the direct GDT and the runtime per CPU area. On 32-bit the percpu base + * switch is implicit by loading the direct GDT. On 64bit this requires + * to update GSBASE. */ -void switch_to_new_gdt(int cpu) +void __init switch_gdt_and_percpu_base(int cpu) { - /* Load the original GDT */ load_direct_gdt(cpu); - /* Reload the per-cpu base */ - load_percpu_segment(cpu); + +#ifdef CONFIG_X86_64 + /* + * No need to load %gs. It is already correct. + * + * Writing %gs on 64bit would zero GSBASE which would make any per + * CPU operation up to the point of the wrmsrl() fault. + * + * Set GSBASE to the new offset. Until the wrmsrl() happens the + * early mapping is still valid. That means the GSBASE update will + * lose any prior per CPU data which was not copied over in + * setup_per_cpu_areas(). + * + * This works even with stackprotector enabled because the + * per CPU stack canary is 0 in both per CPU areas. + */ + wrmsrl(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu)); +#else + /* + * %fs is already set to __KERNEL_PERCPU, but after switching GDT + * it is required to load FS again so that the 'hidden' part is + * updated from the new GDT. Up to this point the early per CPU + * translation is active. Any content of the early per CPU data + * which was not copied over in setup_per_cpu_areas() is lost. + */ + loadsegment(fs, __KERNEL_PERCPU); +#endif }
static const struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; @@@ -1969,6 -1948,7 +1969,6 @@@ void identify_secondary_cpu(struct cpui #ifdef CONFIG_X86_32 enable_sep_cpu(); #endif - mtrr_ap_init(); validate_apic_and_package_id(c); x86_spec_ctrl_setup_ap(); update_srbds_msr(); @@@ -2013,18 -1993,27 +2013,18 @@@ static __init int setup_clearcpuid(cha } __setup("clearcpuid=", setup_clearcpuid);
+DEFINE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot) = { + .current_task = &init_task, + .preempt_count = INIT_PREEMPT_COUNT, + .top_of_stack = TOP_OF_INIT_STACK, +}; +EXPORT_PER_CPU_SYMBOL(pcpu_hot); + #ifdef CONFIG_X86_64 DEFINE_PER_CPU_FIRST(struct fixed_percpu_data, fixed_percpu_data) __aligned(PAGE_SIZE) __visible; EXPORT_PER_CPU_SYMBOL_GPL(fixed_percpu_data);
-/* - * The following percpu variables are hot. Align current_task to - * cacheline size such that they fall in the same cacheline. - */ -DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = - &init_task; -EXPORT_PER_CPU_SYMBOL(current_task); - -DEFINE_PER_CPU(void *, hardirq_stack_ptr); -DEFINE_PER_CPU(bool, hardirq_stack_inuse); - -DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; -EXPORT_PER_CPU_SYMBOL(__preempt_count); - -DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_STACK; - static void wrmsrl_cstar(unsigned long val) { /* @@@ -2075,6 -2064,20 +2075,6 @@@ void syscall_init(void
#else /* CONFIG_X86_64 */
-DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; -EXPORT_PER_CPU_SYMBOL(current_task); -DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; -EXPORT_PER_CPU_SYMBOL(__preempt_count); - -/* - * On x86_32, vm86 modifies tss.sp0, so sp0 isn't a reliable way to find - * the top of the kernel stack. Use an extra percpu variable to track the - * top of the kernel stack directly. - */ -DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = - (unsigned long)&init_thread_union + THREAD_SIZE; -EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack); - #ifdef CONFIG_STACKPROTECTOR DEFINE_PER_CPU(unsigned long, __stack_chk_guard); EXPORT_PER_CPU_SYMBOL(__stack_chk_guard); @@@ -2245,6 -2248,12 +2245,6 @@@ void cpu_init(void boot_cpu_has(X86_FEATURE_TSC) || boot_cpu_has(X86_FEATURE_DE)) cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
- /* - * Initialize the per-CPU GDT with the boot GDT, - * and set up the GDT descriptor: - */ - switch_to_new_gdt(cpu); - if (IS_ENABLED(CONFIG_X86_64)) { loadsegment(fs, 0); memset(cur->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); diff --combined arch/x86/kernel/module.c index 9f9626af2175,a98687642dd0..705fb2a41d7d --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@@ -53,7 -53,7 +53,7 @@@ static unsigned long int get_module_loa */ if (module_load_offset == 0) module_load_offset = - (prandom_u32_max(1024) + 1) * PAGE_SIZE; + get_random_u32_inclusive(1, 1024) * PAGE_SIZE; mutex_unlock(&module_kaslr_mutex); } return module_load_offset; @@@ -74,11 -74,10 +74,11 @@@ void *module_alloc(unsigned long size return NULL;
p = __vmalloc_node_range(size, MODULE_ALIGN, - MODULES_VADDR + get_module_load_offset(), - MODULES_END, gfp_mask, - PAGE_KERNEL, VM_DEFER_KMEMLEAK, NUMA_NO_NODE, - __builtin_return_address(0)); + MODULES_VADDR + get_module_load_offset(), + MODULES_END, gfp_mask, PAGE_KERNEL, + VM_FLUSH_RESET_PERMS | VM_DEFER_KMEMLEAK, + NUMA_NO_NODE, __builtin_return_address(0)); + if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) { vfree(p); return NULL; @@@ -252,13 -251,14 +252,13 @@@ int module_finalize(const Elf_Ehdr *hdr const Elf_Shdr *sechdrs, struct module *me) { - const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, + const Elf_Shdr *s, *alt = NULL, *locks = NULL, *para = NULL, *orc = NULL, *orc_ip = NULL, - *retpolines = NULL, *returns = NULL, *ibt_endbr = NULL; + *retpolines = NULL, *returns = NULL, *ibt_endbr = NULL, + *calls = NULL, *cfi = NULL; char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { - if (!strcmp(".text", secstrings + s->sh_name)) - text = s; if (!strcmp(".altinstructions", secstrings + s->sh_name)) alt = s; if (!strcmp(".smp_locks", secstrings + s->sh_name)) @@@ -273,10 -273,6 +273,10 @@@ retpolines = s; if (!strcmp(".return_sites", secstrings + s->sh_name)) returns = s; + if (!strcmp(".call_sites", secstrings + s->sh_name)) + calls = s; + if (!strcmp(".cfi_sites", secstrings + s->sh_name)) + cfi = s; if (!strcmp(".ibt_endbr_seal", secstrings + s->sh_name)) ibt_endbr = s; } @@@ -289,22 -285,6 +289,22 @@@ void *pseg = (void *)para->sh_addr; apply_paravirt(pseg, pseg + para->sh_size); } + if (retpolines || cfi) { + void *rseg = NULL, *cseg = NULL; + unsigned int rsize = 0, csize = 0; + + if (retpolines) { + rseg = (void *)retpolines->sh_addr; + rsize = retpolines->sh_size; + } + + if (cfi) { + cseg = (void *)cfi->sh_addr; + csize = cfi->sh_size; + } + + apply_fineibt(rseg, rseg + rsize, cseg, cseg + csize); + } if (retpolines) { void *rseg = (void *)retpolines->sh_addr; apply_retpolines(rseg, rseg + retpolines->sh_size); @@@ -318,32 -298,16 +318,32 @@@ void *aseg = (void *)alt->sh_addr; apply_alternatives(aseg, aseg + alt->sh_size); } + if (calls || para) { + struct callthunk_sites cs = {}; + + if (calls) { + cs.call_start = (void *)calls->sh_addr; + cs.call_end = (void *)calls->sh_addr + calls->sh_size; + } + + if (para) { + cs.pv_start = (void *)para->sh_addr; + cs.pv_end = (void *)para->sh_addr + para->sh_size; + } + + callthunks_patch_module_calls(&cs, me); + } if (ibt_endbr) { void *iseg = (void *)ibt_endbr->sh_addr; apply_ibt_endbr(iseg, iseg + ibt_endbr->sh_size); } - if (locks && text) { + if (locks) { void *lseg = (void *)locks->sh_addr; - void *tseg = (void *)text->sh_addr; + void *text = me->core_layout.base; + void *text_end = text + me->core_layout.text_size; alternatives_smp_module_add(me, me->name, lseg, lseg + locks->sh_size, - tseg, tseg + text->sh_size); + text, text_end); }
if (orc && orc_ip) diff --combined arch/x86/kernel/process.c index 46f06ff6c7c6,62671ccf0404..ef6bde1d40d8 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@@ -47,7 -47,6 +47,7 @@@ #include <asm/frame.h> #include <asm/unwind.h> #include <asm/tdx.h> +#include <asm/mmu_context.h>
#include "process.h"
@@@ -368,8 -367,6 +368,8 @@@ void arch_setup_new_exec(void task_clear_spec_ssb_noexec(current); speculation_ctrl_update(read_thread_flags()); } + + mm_reset_untag_mask(current->mm); }
#ifdef CONFIG_X86_IOPL_IOPERM @@@ -603,7 -600,7 +603,7 @@@ static __always_inline void __speculati }
if (updmsr) - write_spec_ctrl_current(msr, false); + update_spec_ctrl_cond(msr); }
static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk) @@@ -968,7 -965,7 +968,7 @@@ early_param("idle", idle_setup) unsigned long arch_align_stack(unsigned long sp) { if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) - sp -= prandom_u32_max(8192); + sp -= get_random_u32_below(8192); return sp & ~0xf; }
diff --combined arch/x86/kernel/setup_percpu.c index c2fc4c41c164,b26123c90b4f..c242dc47e9cb --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@@ -11,6 -11,7 +11,7 @@@ #include <linux/smp.h> #include <linux/topology.h> #include <linux/pfn.h> + #include <linux/stackprotector.h> #include <asm/sections.h> #include <asm/processor.h> #include <asm/desc.h> @@@ -21,8 -22,10 +22,7 @@@ #include <asm/proto.h> #include <asm/cpumask.h> #include <asm/cpu.h> - #include <asm/stackprotector.h>
-DEFINE_PER_CPU_READ_MOSTLY(int, cpu_number); -EXPORT_PER_CPU_SYMBOL(cpu_number); - #ifdef CONFIG_X86_64 #define BOOT_PERCPU_OFFSET ((unsigned long)__per_cpu_load) #else @@@ -169,7 -172,7 +169,7 @@@ void __init setup_per_cpu_areas(void for_each_possible_cpu(cpu) { per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu]; per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); - per_cpu(cpu_number, cpu) = cpu; + per_cpu(pcpu_hot.cpu_number, cpu) = cpu; setup_percpu_segment(cpu); /* * Copy data used in early init routines from the @@@ -208,7 -211,7 +208,7 @@@ * area. Reload any changed state for the boot CPU. */ if (!cpu) - switch_to_new_gdt(cpu); + switch_gdt_and_percpu_base(cpu); }
/* indicate the early static arrays will soon be gone */ diff --combined arch/x86/kernel/smpboot.c index 26937f28000b,5a742b6ec46d..55cad72715d9 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@@ -56,9 -56,9 +56,10 @@@ #include <linux/numa.h> #include <linux/pgtable.h> #include <linux/overflow.h> + #include <linux/stackprotector.h>
#include <asm/acpi.h> +#include <asm/cacheinfo.h> #include <asm/desc.h> #include <asm/nmi.h> #include <asm/irq.h> @@@ -1047,7 -1047,7 +1048,7 @@@ int common_cpu_up(unsigned int cpu, str /* Just in case we booted with a single CPU. */ alternatives_enable_smp();
- per_cpu(current_task, cpu) = idle; + per_cpu(pcpu_hot.current_task, cpu) = idle; cpu_init_stack_canary(cpu, idle);
/* Initialize the interrupt stack(s) */ @@@ -1057,7 -1057,7 +1058,7 @@@
#ifdef CONFIG_X86_32 /* Stack for startup_32 can be just as for start_secondary onwards */ - per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle); + per_cpu(pcpu_hot.top_of_stack, cpu) = task_top_of_stack(idle); #else initial_gs = per_cpu_offset(cpu); #endif @@@ -1429,6 -1429,8 +1430,6 @@@ void __init native_smp_prepare_cpus(uns
uv_system_init();
- set_mtrr_aps_delayed_init(); - smp_quirk_init_udelay();
speculative_store_bypass_ht_init(); @@@ -1438,12 -1440,12 +1439,12 @@@
void arch_thaw_secondary_cpus_begin(void) { - set_mtrr_aps_delayed_init(); + set_cache_aps_delayed_init(true); }
void arch_thaw_secondary_cpus_end(void) { - mtrr_aps_init(); + cache_aps_init(); }
/* @@@ -1452,11 -1454,7 +1453,11 @@@ void __init native_smp_prepare_boot_cpu(void) { int me = smp_processor_id(); - switch_to_new_gdt(me); + + /* SMP handles this from setup_per_cpu_areas() */ + if (!IS_ENABLED(CONFIG_SMP)) + switch_gdt_and_percpu_base(me); + /* already set me in cpu_online_mask in boot_cpu_init() */ cpumask_set_cpu(me, cpu_callout_mask); cpu_set_state_online(me); @@@ -1490,7 -1488,7 +1491,7 @@@ void __init native_smp_cpus_done(unsign
nmi_selftest(); impress_friends(); - mtrr_aps_init(); + cache_aps_init(); }
static int __initdata setup_possible_cpus = -1; diff --combined arch/x86/xen/enlighten_pv.c index a7d83c7800e4,745420853a7c..5b1379662877 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@@ -23,7 -23,6 +23,7 @@@ #include <linux/start_kernel.h> #include <linux/sched.h> #include <linux/kprobes.h> +#include <linux/kstrtox.h> #include <linux/memblock.h> #include <linux/export.h> #include <linux/mm.h> @@@ -33,6 -32,7 +33,7 @@@ #include <linux/edd.h> #include <linux/reboot.h> #include <linux/virtio_anchor.h> + #include <linux/stackprotector.h>
#include <xen/xen.h> #include <xen/events.h> @@@ -65,7 -65,6 +66,6 @@@ #include <asm/pgalloc.h> #include <asm/tlbflush.h> #include <asm/reboot.h> - #include <asm/stackprotector.h> #include <asm/hypervisor.h> #include <asm/mach_traps.h> #include <asm/mwait.h> @@@ -114,7 -113,7 +114,7 @@@ static __read_mostly bool xen_msr_safe static int __init parse_xen_msr_safe(char *str) { if (str) - return strtobool(str, &xen_msr_safe); + return kstrtobool(str, &xen_msr_safe); return -EINVAL; } early_param("xen_msr_safe", parse_xen_msr_safe); @@@ -1210,7 -1209,7 +1210,7 @@@ static void __init xen_setup_gdt(int cp pv_ops.cpu.write_gdt_entry = xen_write_gdt_entry_boot; pv_ops.cpu.load_gdt = xen_load_gdt_boot;
- switch_to_new_gdt(cpu); + switch_gdt_and_percpu_base(cpu);
pv_ops.cpu.write_gdt_entry = xen_write_gdt_entry; pv_ops.cpu.load_gdt = xen_load_gdt; @@@ -1266,8 -1265,6 +1266,8 @@@ asmlinkage __visible void __init xen_st xen_vcpu_info_reset(0);
x86_platform.get_nmi_reason = xen_get_nmi_reason; + x86_platform.realmode_reserve = x86_init_noop; + x86_platform.realmode_init = x86_init_noop;
x86_init.resources.memory_setup = xen_memory_setup; x86_init.irqs.intr_mode_select = x86_init_noop; diff --combined crypto/rsa-pkcs1pad.c index 3bc76edb3f8a,e75728f87ce5..6ee5b8a060c0 --- a/crypto/rsa-pkcs1pad.c +++ b/crypto/rsa-pkcs1pad.c @@@ -253,7 -253,7 +253,7 @@@ static int pkcs1pad_encrypt(struct akci ps_end = ctx->key_size - req->src_len - 2; req_ctx->in_buf[0] = 0x02; for (i = 1; i < ps_end; i++) - req_ctx->in_buf[i] = 1 + prandom_u32_max(255); + req_ctx->in_buf[i] = get_random_u32_inclusive(1, 255); req_ctx->in_buf[ps_end] = 0x00;
pkcs1pad_sg_set_buf(req_ctx->in_sg, req_ctx->in_buf, @@@ -579,10 -579,6 +579,10 @@@ static int pkcs1pad_init_tfm(struct cry return PTR_ERR(child_tfm);
ctx->child = child_tfm; + + akcipher_set_reqsize(tfm, sizeof(struct pkcs1pad_request) + + crypto_akcipher_reqsize(child_tfm)); + return 0; }
@@@ -678,6 -674,7 +678,6 @@@ static int pkcs1pad_create(struct crypt inst->alg.set_pub_key = pkcs1pad_set_pub_key; inst->alg.set_priv_key = pkcs1pad_set_priv_key; inst->alg.max_size = pkcs1pad_get_max_size; - inst->alg.reqsize = sizeof(struct pkcs1pad_request) + rsa_alg->reqsize;
inst->free = pkcs1pad_free;
diff --combined crypto/testmgr.c index 279aa609b7e6,e669acd2ebdd..4476ac97baa5 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@@ -766,7 -766,7 +766,7 @@@ static int build_cipher_test_sglists(st struct iov_iter input; int err;
- iov_iter_kvec(&input, WRITE, inputs, nr_inputs, src_total_len); + iov_iter_kvec(&input, ITER_SOURCE, inputs, nr_inputs, src_total_len); err = build_test_sglist(&tsgls->src, cfg->src_divs, alignmask, cfg->inplace_mode != OUT_OF_PLACE ? max(dst_total_len, src_total_len) : @@@ -855,9 -855,9 +855,9 @@@ static int prepare_keybuf(const u8 *key /* Generate a random length in range [0, max_len], but prefer smaller values */ static unsigned int generate_random_length(unsigned int max_len) { - unsigned int len = prandom_u32_max(max_len + 1); + unsigned int len = get_random_u32_below(max_len + 1);
- switch (prandom_u32_max(4)) { + switch (get_random_u32_below(4)) { case 0: return len % 64; case 1: @@@ -874,14 -874,14 +874,14 @@@ static void flip_random_bit(u8 *buf, si { size_t bitpos;
- bitpos = prandom_u32_max(size * 8); + bitpos = get_random_u32_below(size * 8); buf[bitpos / 8] ^= 1 << (bitpos % 8); }
/* Flip a random byte in the given nonempty data buffer */ static void flip_random_byte(u8 *buf, size_t size) { - buf[prandom_u32_max(size)] ^= 0xff; + buf[get_random_u32_below(size)] ^= 0xff; }
/* Sometimes make some random changes to the given nonempty data buffer */ @@@ -891,15 -891,15 +891,15 @@@ static void mutate_buffer(u8 *buf, size size_t i;
/* Sometimes flip some bits */ - if (prandom_u32_max(4) == 0) { - num_flips = min_t(size_t, 1 << prandom_u32_max(8), size * 8); + if (get_random_u32_below(4) == 0) { + num_flips = min_t(size_t, 1 << get_random_u32_below(8), size * 8); for (i = 0; i < num_flips; i++) flip_random_bit(buf, size); }
/* Sometimes flip some bytes */ - if (prandom_u32_max(4) == 0) { - num_flips = min_t(size_t, 1 << prandom_u32_max(8), size); + if (get_random_u32_below(4) == 0) { + num_flips = min_t(size_t, 1 << get_random_u32_below(8), size); for (i = 0; i < num_flips; i++) flip_random_byte(buf, size); } @@@ -915,11 -915,11 +915,11 @@@ static void generate_random_bytes(u8 *b if (count == 0) return;
- switch (prandom_u32_max(8)) { /* Choose a generation strategy */ + switch (get_random_u32_below(8)) { /* Choose a generation strategy */ case 0: case 1: /* All the same byte, plus optional mutations */ - switch (prandom_u32_max(4)) { + switch (get_random_u32_below(4)) { case 0: b = 0x00; break; @@@ -959,24 -959,24 +959,24 @@@ static char *generate_random_sgl_divisi unsigned int this_len; const char *flushtype_str;
- if (div == &divs[max_divs - 1] || prandom_u32_max(2) == 0) + if (div == &divs[max_divs - 1] || get_random_u32_below(2) == 0) this_len = remaining; else - this_len = 1 + prandom_u32_max(remaining); + this_len = get_random_u32_inclusive(1, remaining); div->proportion_of_total = this_len;
- if (prandom_u32_max(4) == 0) - div->offset = (PAGE_SIZE - 128) + prandom_u32_max(128); - else if (prandom_u32_max(2) == 0) - div->offset = prandom_u32_max(32); + if (get_random_u32_below(4) == 0) + div->offset = get_random_u32_inclusive(PAGE_SIZE - 128, PAGE_SIZE - 1); + else if (get_random_u32_below(2) == 0) + div->offset = get_random_u32_below(32); else - div->offset = prandom_u32_max(PAGE_SIZE); - if (prandom_u32_max(8) == 0) + div->offset = get_random_u32_below(PAGE_SIZE); + if (get_random_u32_below(8) == 0) div->offset_relative_to_alignmask = true;
div->flush_type = FLUSH_TYPE_NONE; if (gen_flushes) { - switch (prandom_u32_max(4)) { + switch (get_random_u32_below(4)) { case 0: div->flush_type = FLUSH_TYPE_REIMPORT; break; @@@ -988,7 -988,7 +988,7 @@@
if (div->flush_type != FLUSH_TYPE_NONE && !(req_flags & CRYPTO_TFM_REQ_MAY_SLEEP) && - prandom_u32_max(2) == 0) + get_random_u32_below(2) == 0) div->nosimd = true;
switch (div->flush_type) { @@@ -1035,7 -1035,7 +1035,7 @@@ static void generate_random_testvec_con
p += scnprintf(p, end - p, "random:");
- switch (prandom_u32_max(4)) { + switch (get_random_u32_below(4)) { case 0: case 1: cfg->inplace_mode = OUT_OF_PLACE; @@@ -1050,12 -1050,12 +1050,12 @@@ break; }
- if (prandom_u32_max(2) == 0) { + if (get_random_u32_below(2) == 0) { cfg->req_flags |= CRYPTO_TFM_REQ_MAY_SLEEP; p += scnprintf(p, end - p, " may_sleep"); }
- switch (prandom_u32_max(4)) { + switch (get_random_u32_below(4)) { case 0: cfg->finalization_type = FINALIZATION_TYPE_FINAL; p += scnprintf(p, end - p, " use_final"); @@@ -1071,7 -1071,7 +1071,7 @@@ }
if (!(cfg->req_flags & CRYPTO_TFM_REQ_MAY_SLEEP) && - prandom_u32_max(2) == 0) { + get_random_u32_below(2) == 0) { cfg->nosimd = true; p += scnprintf(p, end - p, " nosimd"); } @@@ -1084,7 -1084,7 +1084,7 @@@ cfg->req_flags); p += scnprintf(p, end - p, "]");
- if (cfg->inplace_mode == OUT_OF_PLACE && prandom_u32_max(2) == 0) { + if (cfg->inplace_mode == OUT_OF_PLACE && get_random_u32_below(2) == 0) { p += scnprintf(p, end - p, " dst_divs=["); p = generate_random_sgl_divisions(cfg->dst_divs, ARRAY_SIZE(cfg->dst_divs), @@@ -1093,13 -1093,13 +1093,13 @@@ p += scnprintf(p, end - p, "]"); }
- if (prandom_u32_max(2) == 0) { - cfg->iv_offset = 1 + prandom_u32_max(MAX_ALGAPI_ALIGNMASK); + if (get_random_u32_below(2) == 0) { + cfg->iv_offset = get_random_u32_inclusive(1, MAX_ALGAPI_ALIGNMASK); p += scnprintf(p, end - p, " iv_offset=%u", cfg->iv_offset); }
- if (prandom_u32_max(2) == 0) { - cfg->key_offset = 1 + prandom_u32_max(MAX_ALGAPI_ALIGNMASK); + if (get_random_u32_below(2) == 0) { + cfg->key_offset = get_random_u32_inclusive(1, MAX_ALGAPI_ALIGNMASK); p += scnprintf(p, end - p, " key_offset=%u", cfg->key_offset); }
@@@ -1180,7 -1180,7 +1180,7 @@@ static int build_hash_sglist(struct tes
kv.iov_base = (void *)vec->plaintext; kv.iov_len = vec->psize; - iov_iter_kvec(&input, WRITE, &kv, 1, vec->psize); + iov_iter_kvec(&input, ITER_SOURCE, &kv, 1, vec->psize); return build_test_sglist(tsgl, cfg->src_divs, alignmask, vec->psize, &input, divs); } @@@ -1652,8 -1652,8 +1652,8 @@@ static void generate_random_hash_testve vec->ksize = 0; if (maxkeysize) { vec->ksize = maxkeysize; - if (prandom_u32_max(4) == 0) - vec->ksize = 1 + prandom_u32_max(maxkeysize); + if (get_random_u32_below(4) == 0) + vec->ksize = get_random_u32_inclusive(1, maxkeysize); generate_random_bytes((u8 *)vec->key, vec->ksize);
vec->setkey_error = crypto_shash_setkey(desc->tfm, vec->key, @@@ -2218,13 -2218,13 +2218,13 @@@ static void mutate_aead_message(struct const unsigned int aad_tail_size = aad_iv ? ivsize : 0; const unsigned int authsize = vec->clen - vec->plen;
- if (prandom_u32_max(2) == 0 && vec->alen > aad_tail_size) { + if (get_random_u32_below(2) == 0 && vec->alen > aad_tail_size) { /* Mutate the AAD */ flip_random_bit((u8 *)vec->assoc, vec->alen - aad_tail_size); - if (prandom_u32_max(2) == 0) + if (get_random_u32_below(2) == 0) return; } - if (prandom_u32_max(2) == 0) { + if (get_random_u32_below(2) == 0) { /* Mutate auth tag (assuming it's at the end of ciphertext) */ flip_random_bit((u8 *)vec->ctext + vec->plen, authsize); } else { @@@ -2249,7 -2249,7 +2249,7 @@@ static void generate_aead_message(struc const unsigned int ivsize = crypto_aead_ivsize(tfm); const unsigned int authsize = vec->clen - vec->plen; const bool inauthentic = (authsize >= MIN_COLLISION_FREE_AUTHSIZE) && - (prefer_inauthentic || prandom_u32_max(4) == 0); + (prefer_inauthentic || get_random_u32_below(4) == 0);
/* Generate the AAD. */ generate_random_bytes((u8 *)vec->assoc, vec->alen); @@@ -2257,7 -2257,7 +2257,7 @@@ /* Avoid implementation-defined behavior. */ memcpy((u8 *)vec->assoc + vec->alen - ivsize, vec->iv, ivsize);
- if (inauthentic && prandom_u32_max(2) == 0) { + if (inauthentic && get_random_u32_below(2) == 0) { /* Generate a random ciphertext. */ generate_random_bytes((u8 *)vec->ctext, vec->clen); } else { @@@ -2321,8 -2321,8 +2321,8 @@@ static void generate_random_aead_testve
/* Key: length in [0, maxkeysize], but usually choose maxkeysize */ vec->klen = maxkeysize; - if (prandom_u32_max(4) == 0) - vec->klen = prandom_u32_max(maxkeysize + 1); + if (get_random_u32_below(4) == 0) + vec->klen = get_random_u32_below(maxkeysize + 1); generate_random_bytes((u8 *)vec->key, vec->klen); vec->setkey_error = crypto_aead_setkey(tfm, vec->key, vec->klen);
@@@ -2331,8 -2331,8 +2331,8 @@@
/* Tag length: in [0, maxauthsize], but usually choose maxauthsize */ authsize = maxauthsize; - if (prandom_u32_max(4) == 0) - authsize = prandom_u32_max(maxauthsize + 1); + if (get_random_u32_below(4) == 0) + authsize = get_random_u32_below(maxauthsize + 1); if (prefer_inauthentic && authsize < MIN_COLLISION_FREE_AUTHSIZE) authsize = MIN_COLLISION_FREE_AUTHSIZE; if (WARN_ON(authsize > maxdatasize)) @@@ -2342,7 -2342,7 +2342,7 @@@
/* AAD, plaintext, and ciphertext lengths */ total_len = generate_random_length(maxdatasize); - if (prandom_u32_max(4) == 0) + if (get_random_u32_below(4) == 0) vec->alen = 0; else vec->alen = generate_random_length(total_len); @@@ -2958,8 -2958,8 +2958,8 @@@ static void generate_random_cipher_test
/* Key: length in [0, maxkeysize], but usually choose maxkeysize */ vec->klen = maxkeysize; - if (prandom_u32_max(4) == 0) - vec->klen = prandom_u32_max(maxkeysize + 1); + if (get_random_u32_below(4) == 0) + vec->klen = get_random_u32_below(maxkeysize + 1); generate_random_bytes((u8 *)vec->key, vec->klen); vec->setkey_error = crypto_skcipher_setkey(tfm, vec->key, vec->klen);
@@@ -4712,12 -4712,6 +4712,12 @@@ static const struct alg_test_desc alg_t .alg = "cts(cbc(paes))", .test = alg_test_null, .fips_allowed = 1, + }, { + .alg = "cts(cbc(sm4))", + .test = alg_test_skcipher, + .suite = { + .cipher = __VECS(sm4_cts_tv_template) + } }, { .alg = "curve25519", .test = alg_test_kpp, @@@ -5592,12 -5586,6 +5592,12 @@@ .suite = { .hash = __VECS(aes_xcbc128_tv_template) } + }, { + .alg = "xcbc(sm4)", + .test = alg_test_hash, + .suite = { + .hash = __VECS(sm4_xcbc128_tv_template) + } }, { .alg = "xchacha12", .test = alg_test_skcipher, @@@ -5652,13 -5640,6 +5652,13 @@@ .suite = { .cipher = __VECS(serpent_xts_tv_template) } + }, { + .alg = "xts(sm4)", + .generic_driver = "xts(ecb(sm4-generic))", + .test = alg_test_skcipher, + .suite = { + .cipher = __VECS(sm4_xts_tv_template) + } }, { .alg = "xts(twofish)", .generic_driver = "xts(ecb(twofish-generic))", diff --combined drivers/block/drbd/drbd_receiver.c index d402ba27574f,3eccc6cd5004..0e58a3187345 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@@ -1,4 -1,4 +1,4 @@@ -// SPDX-License-Identifier: GPL-2.0-or-later +// SPDX-License-Identifier: GPL-2.0-only /* drbd_receiver.c
@@@ -413,7 -413,7 +413,7 @@@ void __drbd_free_peer_req(struct drbd_d drbd_free_pages(device, peer_req->pages, is_net); D_ASSERT(device, atomic_read(&peer_req->pending_bios) == 0); D_ASSERT(device, drbd_interval_empty(&peer_req->i)); - if (!expect(!(peer_req->flags & EE_CALL_AL_COMPLETE_IO))) { + if (!expect(device, !(peer_req->flags & EE_CALL_AL_COMPLETE_IO))) { peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO; drbd_al_complete_io(device, &peer_req->i); } @@@ -507,7 -507,7 +507,7 @@@ static int drbd_recv_short(struct socke struct msghdr msg = { .msg_flags = (flags ? flags : MSG_WAITALL | MSG_NOSIGNAL) }; - iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, size); + iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, size); return sock_recvmsg(sock, &msg, msg.msg_flags); }
@@@ -781,7 -781,7 +781,7 @@@ static struct socket *drbd_wait_for_con
timeo = connect_int * HZ; /* 28.5% random jitter */ - timeo += prandom_u32_max(2) ? timeo / 7 : -timeo / 7; + timeo += get_random_u32_below(2) ? timeo / 7 : -timeo / 7;
err = wait_for_completion_interruptible_timeout(&ad->door_bell, timeo); if (err <= 0) @@@ -1004,7 -1004,7 +1004,7 @@@ retry drbd_warn(connection, "Error receiving initial packet\n"); sock_release(s); randomize: - if (prandom_u32_max(2)) + if (get_random_u32_below(2)) goto retry; } } @@@ -1603,19 -1603,9 +1603,19 @@@ static void drbd_issue_peer_discard_or_ drbd_endio_write_sec_final(peer_req); }
+static int peer_request_fault_type(struct drbd_peer_request *peer_req) +{ + if (peer_req_op(peer_req) == REQ_OP_READ) { + return peer_req->flags & EE_APPLICATION ? + DRBD_FAULT_DT_RD : DRBD_FAULT_RS_RD; + } else { + return peer_req->flags & EE_APPLICATION ? + DRBD_FAULT_DT_WR : DRBD_FAULT_RS_WR; + } +} + /** * drbd_submit_peer_request() - * @device: DRBD device. * @peer_req: peer request * * May spread the pages to multiple bios, @@@ -1629,9 -1619,10 +1629,9 @@@ * on certain Xen deployments. */ /* TODO allocate from our own bio_set. */ -int drbd_submit_peer_request(struct drbd_device *device, - struct drbd_peer_request *peer_req, - const blk_opf_t opf, const int fault_type) +int drbd_submit_peer_request(struct drbd_peer_request *peer_req) { + struct drbd_device *device = peer_req->peer_device->device; struct bio *bios = NULL; struct bio *bio; struct page *page = peer_req->pages; @@@ -1676,18 -1667,7 +1676,18 @@@ * generated bio, but a bio allocated on behalf of the peer. */ next_bio: - bio = bio_alloc(device->ldev->backing_bdev, nr_pages, opf, GFP_NOIO); + /* _DISCARD, _WRITE_ZEROES handled above. + * REQ_OP_FLUSH (empty flush) not expected, + * should have been mapped to a "drbd protocol barrier". + * REQ_OP_SECURE_ERASE: I don't see how we could ever support that. + */ + if (!(peer_req_op(peer_req) == REQ_OP_WRITE || + peer_req_op(peer_req) == REQ_OP_READ)) { + drbd_err(device, "Invalid bio op received: 0x%x\n", peer_req->opf); + return -EINVAL; + } + + bio = bio_alloc(device->ldev->backing_bdev, nr_pages, peer_req->opf, GFP_NOIO); /* > peer_req->i.sector, unless this is the first bio */ bio->bi_iter.bi_sector = sector; bio->bi_private = peer_req; @@@ -1717,7 -1697,7 +1717,7 @@@ bios = bios->bi_next; bio->bi_next = NULL;
- drbd_submit_bio_noacct(device, fault_type, bio); + drbd_submit_bio_noacct(device, peer_request_fault_type(peer_req), bio); } while (bios); return 0; } @@@ -1873,21 -1853,21 +1873,21 @@@ read_in_block(struct drbd_peer_device * /* assume request_size == data_size, but special case trim. */ ds = data_size; if (trim) { - if (!expect(data_size == 0)) + if (!expect(peer_device, data_size == 0)) return NULL; ds = be32_to_cpu(trim->size); } else if (zeroes) { - if (!expect(data_size == 0)) + if (!expect(peer_device, data_size == 0)) return NULL; ds = be32_to_cpu(zeroes->size); }
- if (!expect(IS_ALIGNED(ds, 512))) + if (!expect(peer_device, IS_ALIGNED(ds, 512))) return NULL; if (trim || zeroes) { - if (!expect(ds <= (DRBD_MAX_BBIO_SECTORS << 9))) + if (!expect(peer_device, ds <= (DRBD_MAX_BBIO_SECTORS << 9))) return NULL; - } else if (!expect(ds <= DRBD_MAX_BIO_SIZE)) + } else if (!expect(peer_device, ds <= DRBD_MAX_BIO_SIZE)) return NULL;
/* even though we trust out peer, @@@ -2071,7 -2051,6 +2071,7 @@@ static int recv_resync_read(struct drbd * respective _drbd_clear_done_ee */
peer_req->w.cb = e_end_resync_block; + peer_req->opf = REQ_OP_WRITE; peer_req->submit_jif = jiffies;
spin_lock_irq(&device->resource->req_lock); @@@ -2079,7 -2058,8 +2079,7 @@@ spin_unlock_irq(&device->resource->req_lock);
atomic_add(pi->size >> 9, &device->rs_sect_ev); - if (drbd_submit_peer_request(device, peer_req, REQ_OP_WRITE, - DRBD_FAULT_RS_WR) == 0) + if (drbd_submit_peer_request(peer_req) == 0) return 0;
/* don't care for the reason here */ @@@ -2165,7 -2145,7 +2165,7 @@@ static int receive_RSDataReply(struct d * or in drbd_peer_request_endio. */ err = recv_resync_read(peer_device, sector, pi); } else { - if (__ratelimit(&drbd_ratelimit_state)) + if (drbd_ratelimit()) drbd_err(device, "Can not write resync data to local disk.\n");
err = drbd_drain_block(peer_device, pi->size); @@@ -2395,6 -2375,16 +2395,6 @@@ static int wait_for_and_update_peer_seq return ret; }
-/* see also bio_flags_to_wire() - * DRBD_REQ_*, because we need to semantically map the flags to data packet - * flags and back. We may replicate to other kernel versions. */ -static blk_opf_t wire_flags_to_bio_flags(u32 dpf) -{ - return (dpf & DP_RW_SYNC ? REQ_SYNC : 0) | - (dpf & DP_FUA ? REQ_FUA : 0) | - (dpf & DP_FLUSH ? REQ_PREFLUSH : 0); -} - static enum req_op wire_flags_to_bio_op(u32 dpf) { if (dpf & DP_ZEROES) @@@ -2405,15 -2395,6 +2405,15 @@@ return REQ_OP_WRITE; }
+/* see also bio_flags_to_wire() */ +static blk_opf_t wire_flags_to_bio(struct drbd_connection *connection, u32 dpf) +{ + return wire_flags_to_bio_op(dpf) | + (dpf & DP_RW_SYNC ? REQ_SYNC : 0) | + (dpf & DP_FUA ? REQ_FUA : 0) | + (dpf & DP_FLUSH ? REQ_PREFLUSH : 0); +} + static void fail_postponed_requests(struct drbd_device *device, sector_t sector, unsigned int size) { @@@ -2557,6 -2538,8 +2557,6 @@@ static int receive_Data(struct drbd_con struct drbd_peer_request *peer_req; struct p_data *p = pi->data; u32 peer_seq = be32_to_cpu(p->seq_num); - enum req_op op; - blk_opf_t op_flags; u32 dp_flags; int err, tp;
@@@ -2595,10 -2578,11 +2595,10 @@@ peer_req->flags |= EE_APPLICATION;
dp_flags = be32_to_cpu(p->dp_flags); - op = wire_flags_to_bio_op(dp_flags); - op_flags = wire_flags_to_bio_flags(dp_flags); + peer_req->opf = wire_flags_to_bio(connection, dp_flags); if (pi->cmd == P_TRIM) { D_ASSERT(peer_device, peer_req->i.size > 0); - D_ASSERT(peer_device, op == REQ_OP_DISCARD); + D_ASSERT(peer_device, peer_req_op(peer_req) == REQ_OP_DISCARD); D_ASSERT(peer_device, peer_req->pages == NULL); /* need to play safe: an older DRBD sender * may mean zero-out while sending P_TRIM. */ @@@ -2606,7 -2590,7 +2606,7 @@@ peer_req->flags |= EE_ZEROOUT; } else if (pi->cmd == P_ZEROES) { D_ASSERT(peer_device, peer_req->i.size > 0); - D_ASSERT(peer_device, op == REQ_OP_WRITE_ZEROES); + D_ASSERT(peer_device, peer_req_op(peer_req) == REQ_OP_WRITE_ZEROES); D_ASSERT(peer_device, peer_req->pages == NULL); /* Do (not) pass down BLKDEV_ZERO_NOUNMAP? */ if (dp_flags & DP_DISCARD) @@@ -2693,7 -2677,8 +2693,7 @@@ peer_req->flags |= EE_CALL_AL_COMPLETE_IO; }
- err = drbd_submit_peer_request(device, peer_req, op | op_flags, - DRBD_FAULT_DT_WR); + err = drbd_submit_peer_request(peer_req); if (!err) return 0;
@@@ -2804,6 -2789,7 +2804,6 @@@ static int receive_DataRequest(struct d struct drbd_peer_request *peer_req; struct digest_info *di = NULL; int size, verb; - unsigned int fault_type; struct p_block_req *p = pi->data;
peer_device = conn_peer_device(connection, pi->vnr); @@@ -2846,7 -2832,7 +2846,7 @@@ default: BUG(); } - if (verb && __ratelimit(&drbd_ratelimit_state)) + if (verb && drbd_ratelimit()) drbd_err(device, "Can not satisfy peer's read request, " "no local data.\n");
@@@ -2863,11 -2849,11 +2863,11 @@@ put_ldev(device); return -ENOMEM; } + peer_req->opf = REQ_OP_READ;
switch (pi->cmd) { case P_DATA_REQUEST: peer_req->w.cb = w_e_end_data_req; - fault_type = DRBD_FAULT_DT_RD; /* application IO, don't drbd_rs_begin_io */ peer_req->flags |= EE_APPLICATION; goto submit; @@@ -2881,12 -2867,14 +2881,12 @@@ fallthrough; case P_RS_DATA_REQUEST: peer_req->w.cb = w_e_end_rsdata_req; - fault_type = DRBD_FAULT_RS_RD; /* used in the sector offset progress display */ device->bm_resync_fo = BM_SECT_TO_BIT(sector); break;
case P_OV_REPLY: case P_CSUM_RS_REQUEST: - fault_type = DRBD_FAULT_RS_RD; di = kmalloc(sizeof(*di) + pi->size, GFP_NOIO); if (!di) goto out_free_e; @@@ -2935,6 -2923,7 +2935,6 @@@ (unsigned long long)sector); } peer_req->w.cb = w_e_end_ov_req; - fault_type = DRBD_FAULT_RS_RD; break;
default: @@@ -2986,7 -2975,8 +2986,7 @@@ submit_for_resync submit: update_receiver_timing_details(connection, drbd_submit_peer_request); inc_unacked(device); - if (drbd_submit_peer_request(device, peer_req, REQ_OP_READ, - fault_type) == 0) + if (drbd_submit_peer_request(peer_req) == 0) return 0;
/* don't care for the reason here */ @@@ -4957,6 -4947,7 +4957,6 @@@ static int receive_rs_deallocated(struc
if (get_ldev(device)) { struct drbd_peer_request *peer_req; - const enum req_op op = REQ_OP_WRITE_ZEROES;
peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER, sector, size, 0, GFP_NOIO); @@@ -4966,7 -4957,6 +4966,7 @@@ }
peer_req->w.cb = e_end_resync_block; + peer_req->opf = REQ_OP_DISCARD; peer_req->submit_jif = jiffies; peer_req->flags |= EE_TRIM;
@@@ -4975,7 -4965,8 +4975,7 @@@ spin_unlock_irq(&device->resource->req_lock);
atomic_add(pi->size >> 9, &device->rs_sect_ev); - err = drbd_submit_peer_request(device, peer_req, op, - DRBD_FAULT_RS_WR); + err = drbd_submit_peer_request(peer_req);
if (err) { spin_lock_irq(&device->resource->req_lock); diff --combined drivers/char/hw_random/core.c index afde685f5e0a,63a0a8e4505d..f34d356fe2c0 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@@ -41,14 -41,14 +41,14 @@@ static DEFINE_MUTEX(reading_mutex) static int data_avail; static u8 *rng_buffer, *rng_fillbuf; static unsigned short current_quality; -static unsigned short default_quality; /* = 0; default to "off" */ +static unsigned short default_quality = 1024; /* default to maximum */
module_param(current_quality, ushort, 0644); MODULE_PARM_DESC(current_quality, "current hwrng entropy estimation per 1024 bits of input -- obsolete, use rng_quality instead"); module_param(default_quality, ushort, 0644); MODULE_PARM_DESC(default_quality, - "default entropy content of hwrng per 1024 bits of input"); + "default maximum entropy content of hwrng per 1024 bits of input");
static void drop_current_rng(void); static int hwrng_init(struct hwrng *rng); @@@ -69,8 -69,10 +69,10 @@@ static void add_early_randomness(struc mutex_lock(&reading_mutex); bytes_read = rng_get_data(rng, rng_fillbuf, 32, 0); mutex_unlock(&reading_mutex); - if (bytes_read > 0) - add_device_randomness(rng_fillbuf, bytes_read); + if (bytes_read > 0) { + size_t entropy = bytes_read * 8 * rng->quality / 1024; + add_hwgenerator_randomness(rng_fillbuf, bytes_read, entropy, false); + } }
static inline void cleanup_rng(struct kref *kref) @@@ -170,7 -172,10 +172,7 @@@ static int hwrng_init(struct hwrng *rng reinit_completion(&rng->cleanup_done);
skip_init: - if (!rng->quality) - rng->quality = default_quality; - if (rng->quality > 1024) - rng->quality = 1024; + rng->quality = min_t(u16, min_t(u16, default_quality, 1024), rng->quality ?: 1024); current_quality = rng->quality; /* obsolete */
return 0; @@@ -525,7 -530,7 +527,7 @@@ static int hwrng_fillfn(void *unused
/* Outside lock, sure, but y'know: randomness. */ add_hwgenerator_randomness((void *)rng_fillbuf, rc, - entropy >> 10); + entropy >> 10, true); } hwrng_fill = NULL; return 0; diff --combined drivers/char/random.c index b96751b63b58,e872acc1238f..5885ed574c6a --- a/drivers/char/random.c +++ b/drivers/char/random.c @@@ -53,6 -53,7 +53,7 @@@ #include <linux/uaccess.h> #include <linux/suspend.h> #include <linux/siphash.h> + #include <linux/sched/isolation.h> #include <crypto/chacha.h> #include <crypto/blake2s.h> #include <asm/processor.h> @@@ -84,6 -85,7 +85,7 @@@ static DEFINE_STATIC_KEY_FALSE(crng_is_ /* Various types of waiters for crng_init->CRNG_READY transition. */ static DECLARE_WAIT_QUEUE_HEAD(crng_init_wait); static struct fasync_struct *fasync; + static ATOMIC_NOTIFIER_HEAD(random_ready_notifier);
/* Control how we warn userspace. */ static struct ratelimit_state urandom_warning = @@@ -120,7 -122,7 +122,7 @@@ static void try_to_generate_entropy(voi * Wait for the input pool to be seeded and thus guaranteed to supply * cryptographically secure random numbers. This applies to: the /dev/urandom * device, the get_random_bytes function, and the get_random_{u8,u16,u32,u64, - * int,long} family of functions. Using any of these functions without first + * long} family of functions. Using any of these functions without first * calling this function forfeits the guarantee of security. * * Returns: 0 if the input pool has been seeded. @@@ -140,6 -142,26 +142,26 @@@ int wait_for_random_bytes(void } EXPORT_SYMBOL(wait_for_random_bytes);
+ /* + * Add a callback function that will be invoked when the crng is initialised, + * or immediately if it already has been. Only use this is you are absolutely + * sure it is required. Most users should instead be able to test + * `rng_is_initialized()` on demand, or make use of `get_random_bytes_wait()`. + */ + int __cold execute_with_initialized_rng(struct notifier_block *nb) + { + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&random_ready_notifier.lock, flags); + if (crng_ready()) + nb->notifier_call(nb, 0, NULL); + else + ret = raw_notifier_chain_register((struct raw_notifier_head *)&random_ready_notifier.head, nb); + spin_unlock_irqrestore(&random_ready_notifier.lock, flags); + return ret; + } + #define warn_unseeded_randomness() \ if (IS_ENABLED(CONFIG_WARN_ALL_UNSEEDED_RANDOM) && !crng_ready()) \ printk_deferred(KERN_NOTICE "random: %s called from %pS with crng_init=%d\n", \ @@@ -160,6 -182,9 +182,9 @@@ * u8 get_random_u8() * u16 get_random_u16() * u32 get_random_u32() + * u32 get_random_u32_below(u32 ceil) + * u32 get_random_u32_above(u32 floor) + * u32 get_random_u32_inclusive(u32 floor, u32 ceil) * u64 get_random_u64() * unsigned long get_random_long() * @@@ -179,7 -204,6 +204,6 @@@ enum
static struct { u8 key[CHACHA_KEY_SIZE] __aligned(__alignof__(long)); - unsigned long birth; unsigned long generation; spinlock_t lock; } base_crng = { @@@ -197,16 -221,41 +221,41 @@@ static DEFINE_PER_CPU(struct crng, crng .lock = INIT_LOCAL_LOCK(crngs.lock), };
+ /* + * Return the interval until the next reseeding, which is normally + * CRNG_RESEED_INTERVAL, but during early boot, it is at an interval + * proportional to the uptime. + */ + static unsigned int crng_reseed_interval(void) + { + static bool early_boot = true; + + if (unlikely(READ_ONCE(early_boot))) { + time64_t uptime = ktime_get_seconds(); + if (uptime >= CRNG_RESEED_INTERVAL / HZ * 2) + WRITE_ONCE(early_boot, false); + else + return max_t(unsigned int, CRNG_RESEED_START_INTERVAL, + (unsigned int)uptime / 2 * HZ); + } + return CRNG_RESEED_INTERVAL; + } + /* Used by crng_reseed() and crng_make_state() to extract a new seed from the input pool. */ static void extract_entropy(void *buf, size_t len);
/* This extracts a new crng key from the input pool. */ - static void crng_reseed(void) + static void crng_reseed(struct work_struct *work) { + static DECLARE_DELAYED_WORK(next_reseed, crng_reseed); unsigned long flags; unsigned long next_gen; u8 key[CHACHA_KEY_SIZE];
+ /* Immediately schedule the next reseeding, so that it fires sooner rather than later. */ + if (likely(system_unbound_wq)) + queue_delayed_work(system_unbound_wq, &next_reseed, crng_reseed_interval()); + extract_entropy(key, sizeof(key));
/* @@@ -221,7 -270,6 +270,6 @@@ if (next_gen == ULONG_MAX) ++next_gen; WRITE_ONCE(base_crng.generation, next_gen); - WRITE_ONCE(base_crng.birth, jiffies); if (!static_branch_likely(&crng_is_ready)) crng_init = CRNG_READY; spin_unlock_irqrestore(&base_crng.lock, flags); @@@ -260,26 -308,6 +308,6 @@@ static void crng_fast_key_erasure(u8 ke memzero_explicit(first_block, sizeof(first_block)); }
- /* - * Return the interval until the next reseeding, which is normally - * CRNG_RESEED_INTERVAL, but during early boot, it is at an interval - * proportional to the uptime. - */ - static unsigned int crng_reseed_interval(void) - { - static bool early_boot = true; - - if (unlikely(READ_ONCE(early_boot))) { - time64_t uptime = ktime_get_seconds(); - if (uptime >= CRNG_RESEED_INTERVAL / HZ * 2) - WRITE_ONCE(early_boot, false); - else - return max_t(unsigned int, CRNG_RESEED_START_INTERVAL, - (unsigned int)uptime / 2 * HZ); - } - return CRNG_RESEED_INTERVAL; - } - /* * This function returns a ChaCha state that you may use for generating * random data. It also returns up to 32 bytes on its own of random data @@@ -315,13 -343,6 +343,6 @@@ static void crng_make_state(u32 chacha_ return; }
- /* - * If the base_crng is old enough, we reseed, which in turn bumps the - * generation counter that we check below. - */ - if (unlikely(time_is_before_jiffies(READ_ONCE(base_crng.birth) + crng_reseed_interval()))) - crng_reseed(); - local_lock_irqsave(&crngs.lock, flags); crng = raw_cpu_ptr(&crngs);
@@@ -383,11 -404,11 +404,11 @@@ static void _get_random_bytes(void *buf }
/* - * This function is the exported kernel interface. It returns some number of - * good random numbers, suitable for key generation, seeding TCP sequence - * numbers, etc. In order to ensure that the randomness returned by this - * function is okay, the function wait_for_random_bytes() should be called and - * return 0 at least once at any point prior. + * This returns random bytes in arbitrary quantities. The quality of the + * random bytes is good as /dev/urandom. In order to ensure that the + * randomness provided by this function is okay, the function + * wait_for_random_bytes() should be called and return 0 at least once + * at any point prior. */ void get_random_bytes(void *buf, size_t len) { @@@ -510,6 -531,41 +531,41 @@@ DEFINE_BATCHED_ENTROPY(u16 DEFINE_BATCHED_ENTROPY(u32) DEFINE_BATCHED_ENTROPY(u64)
+ u32 __get_random_u32_below(u32 ceil) + { + /* + * This is the slow path for variable ceil. It is still fast, most of + * the time, by doing traditional reciprocal multiplication and + * opportunistically comparing the lower half to ceil itself, before + * falling back to computing a larger bound, and then rejecting samples + * whose lower half would indicate a range indivisible by ceil. The use + * of `-ceil % ceil` is analogous to `2^32 % ceil`, but is computable + * in 32-bits. + */ + u32 rand = get_random_u32(); + u64 mult; + + /* + * This function is technically undefined for ceil == 0, and in fact + * for the non-underscored constant version in the header, we build bug + * on that. But for the non-constant case, it's convenient to have that + * evaluate to being a straight call to get_random_u32(), so that + * get_random_u32_inclusive() can work over its whole range without + * undefined behavior. + */ + if (unlikely(!ceil)) + return rand; + + mult = (u64)ceil * rand; + if (unlikely((u32)mult < ceil)) { + u32 bound = -ceil % ceil; + while (unlikely((u32)mult < bound)) + mult = (u64)ceil * get_random_u32(); + } + return mult >> 32; + } + EXPORT_SYMBOL(__get_random_u32_below); + #ifdef CONFIG_SMP /* * This function is called when the CPU is coming up, with entry @@@ -660,9 -716,10 +716,10 @@@ static void __cold _credit_init_bits(si } while (!try_cmpxchg(&input_pool.init_bits, &orig, new));
if (orig < POOL_READY_BITS && new >= POOL_READY_BITS) { - crng_reseed(); /* Sets crng_init to CRNG_READY under base_crng.lock. */ + crng_reseed(NULL); /* Sets crng_init to CRNG_READY under base_crng.lock. */ if (static_key_initialized) execute_in_process_context(crng_set_ready, &set_ready); + atomic_notifier_call_chain(&random_ready_notifier, 0, NULL); wake_up_interruptible(&crng_init_wait); kill_fasync(&fasync, SIGIO, POLL_IN); pr_notice("crng init done\n"); @@@ -689,7 -746,7 +746,7 @@@ * the above entropy accumulation routines: * * void add_device_randomness(const void *buf, size_t len); - * void add_hwgenerator_randomness(const void *buf, size_t len, size_t entropy); + * void add_hwgenerator_randomness(const void *buf, size_t len, size_t entropy, bool sleep_after); * void add_bootloader_randomness(const void *buf, size_t len); * void add_vmfork_randomness(const void *unique_vm_id, size_t len); * void add_interrupt_randomness(int irq); @@@ -710,7 -767,7 +767,7 @@@ * * add_bootloader_randomness() is called by bootloader drivers, such as EFI * and device tree, and credits its input depending on whether or not the - * configuration option CONFIG_RANDOM_TRUST_BOOTLOADER is set. + * command line option 'random.trust_bootloader'. * * add_vmfork_randomness() adds a unique (but not necessarily secret) ID * representing the current instance of a VM to the pool, without crediting, @@@ -736,8 -793,8 +793,8 @@@ * **********************************************************************/
- static bool trust_cpu __initdata = IS_ENABLED(CONFIG_RANDOM_TRUST_CPU); - static bool trust_bootloader __initdata = IS_ENABLED(CONFIG_RANDOM_TRUST_BOOTLOADER); + static bool trust_cpu __initdata = true; + static bool trust_bootloader __initdata = true; static int __init parse_trust_cpu(char *arg) { return kstrtobool(arg, &trust_cpu); @@@ -768,7 -825,7 +825,7 @@@ static int random_pm_notification(struc if (crng_ready() && (action == PM_RESTORE_PREPARE || (action == PM_POST_SUSPEND && !IS_ENABLED(CONFIG_PM_AUTOSLEEP) && !IS_ENABLED(CONFIG_PM_USERSPACE_AUTOSLEEP)))) { - crng_reseed(); + crng_reseed(NULL); pr_notice("crng reseeded on system resumption\n"); } return 0; @@@ -791,13 -848,13 +848,13 @@@ void __init random_init_early(const cha #endif
for (i = 0, arch_bits = sizeof(entropy) * 8; i < ARRAY_SIZE(entropy);) { - longs = arch_get_random_seed_longs_early(entropy, ARRAY_SIZE(entropy) - i); + longs = arch_get_random_seed_longs(entropy, ARRAY_SIZE(entropy) - i); if (longs) { _mix_pool_bytes(entropy, sizeof(*entropy) * longs); i += longs; continue; } - longs = arch_get_random_longs_early(entropy, ARRAY_SIZE(entropy) - i); + longs = arch_get_random_longs(entropy, ARRAY_SIZE(entropy) - i); if (longs) { _mix_pool_bytes(entropy, sizeof(*entropy) * longs); i += longs; @@@ -812,7 -869,7 +869,7 @@@
/* Reseed if already seeded by earlier phases. */ if (crng_ready()) - crng_reseed(); + crng_reseed(NULL); else if (trust_cpu) _credit_init_bits(arch_bits); } @@@ -840,7 -897,7 +897,7 @@@ void __init random_init(void
/* Reseed if already seeded by earlier phases. */ if (crng_ready()) - crng_reseed(); + crng_reseed(NULL);
WARN_ON(register_pm_notifier(&pm_notifier));
@@@ -869,11 -926,11 +926,11 @@@ void add_device_randomness(const void * EXPORT_SYMBOL(add_device_randomness);
/* - * Interface for in-kernel drivers of true hardware RNGs. - * Those devices may produce endless random bits and will be throttled - * when our pool is full. + * Interface for in-kernel drivers of true hardware RNGs. Those devices + * may produce endless random bits, so this function will sleep for + * some amount of time after, if the sleep_after parameter is true. */ - void add_hwgenerator_randomness(const void *buf, size_t len, size_t entropy) + void add_hwgenerator_randomness(const void *buf, size_t len, size_t entropy, bool sleep_after) { mix_pool_bytes(buf, len); credit_init_bits(entropy); @@@ -882,14 -939,14 +939,14 @@@ * Throttle writing to once every reseed interval, unless we're not yet * initialized or no entropy is credited. */ - if (!kthread_should_stop() && (crng_ready() || !entropy)) + if (sleep_after && !kthread_should_stop() && (crng_ready() || !entropy)) schedule_timeout_interruptible(crng_reseed_interval()); } EXPORT_SYMBOL_GPL(add_hwgenerator_randomness);
/* - * Handle random seed passed by bootloader, and credit it if - * CONFIG_RANDOM_TRUST_BOOTLOADER is set. + * Handle random seed passed by bootloader, and credit it depending + * on the command line option 'random.trust_bootloader'. */ void __init add_bootloader_randomness(const void *buf, size_t len) { @@@ -910,7 -967,7 +967,7 @@@ void __cold add_vmfork_randomness(cons { add_device_randomness(unique_vm_id, len); if (crng_ready()) { - crng_reseed(); + crng_reseed(NULL); pr_notice("crng reseeded due to virtual machine fork\n"); } blocking_notifier_call_chain(&vmfork_chain, 0, NULL); @@@ -1176,66 -1233,102 +1233,102 @@@ void __cold rand_initialize_disk(struc struct entropy_timer_state { unsigned long entropy; struct timer_list timer; - unsigned int samples, samples_per_bit; + atomic_t samples; + unsigned int samples_per_bit; };
/* - * Each time the timer fires, we expect that we got an unpredictable - * jump in the cycle counter. Even if the timer is running on another - * CPU, the timer activity will be touching the stack of the CPU that is - * generating entropy.. + * Each time the timer fires, we expect that we got an unpredictable jump in + * the cycle counter. Even if the timer is running on another CPU, the timer + * activity will be touching the stack of the CPU that is generating entropy. * - * Note that we don't re-arm the timer in the timer itself - we are - * happy to be scheduled away, since that just makes the load more - * complex, but we do not want the timer to keep ticking unless the - * entropy loop is running. + * Note that we don't re-arm the timer in the timer itself - we are happy to be + * scheduled away, since that just makes the load more complex, but we do not + * want the timer to keep ticking unless the entropy loop is running. * * So the re-arming always happens in the entropy loop itself. */ static void __cold entropy_timer(struct timer_list *timer) { struct entropy_timer_state *state = container_of(timer, struct entropy_timer_state, timer); + unsigned long entropy = random_get_entropy();
- if (++state->samples == state->samples_per_bit) { + mix_pool_bytes(&entropy, sizeof(entropy)); + if (atomic_inc_return(&state->samples) % state->samples_per_bit == 0) credit_init_bits(1); - state->samples = 0; - } }
/* - * If we have an actual cycle counter, see if we can - * generate enough entropy with timing noise + * If we have an actual cycle counter, see if we can generate enough entropy + * with timing noise. */ static void __cold try_to_generate_entropy(void) { enum { NUM_TRIAL_SAMPLES = 8192, MAX_SAMPLES_PER_BIT = HZ / 15 }; - struct entropy_timer_state stack; + u8 stack_bytes[sizeof(struct entropy_timer_state) + SMP_CACHE_BYTES - 1]; + struct entropy_timer_state *stack = PTR_ALIGN((void *)stack_bytes, SMP_CACHE_BYTES); unsigned int i, num_different = 0; unsigned long last = random_get_entropy(); + int cpu = -1;
for (i = 0; i < NUM_TRIAL_SAMPLES - 1; ++i) { - stack.entropy = random_get_entropy(); - if (stack.entropy != last) + stack->entropy = random_get_entropy(); + if (stack->entropy != last) ++num_different; - last = stack.entropy; + last = stack->entropy; } - stack.samples_per_bit = DIV_ROUND_UP(NUM_TRIAL_SAMPLES, num_different + 1); - if (stack.samples_per_bit > MAX_SAMPLES_PER_BIT) + stack->samples_per_bit = DIV_ROUND_UP(NUM_TRIAL_SAMPLES, num_different + 1); + if (stack->samples_per_bit > MAX_SAMPLES_PER_BIT) return;
- stack.samples = 0; - timer_setup_on_stack(&stack.timer, entropy_timer, 0); + atomic_set(&stack->samples, 0); + timer_setup_on_stack(&stack->timer, entropy_timer, 0); while (!crng_ready() && !signal_pending(current)) { - if (!timer_pending(&stack.timer)) - mod_timer(&stack.timer, jiffies); - mix_pool_bytes(&stack.entropy, sizeof(stack.entropy)); + /* + * Check !timer_pending() and then ensure that any previous callback has finished + * executing by checking try_to_del_timer_sync(), before queueing the next one. + */ + if (!timer_pending(&stack->timer) && try_to_del_timer_sync(&stack->timer) >= 0) { + struct cpumask timer_cpus; + unsigned int num_cpus; + + /* + * Preemption must be disabled here, both to read the current CPU number + * and to avoid scheduling a timer on a dead CPU. + */ + preempt_disable(); + + /* Only schedule callbacks on timer CPUs that are online. */ + cpumask_and(&timer_cpus, housekeeping_cpumask(HK_TYPE_TIMER), cpu_online_mask); + num_cpus = cpumask_weight(&timer_cpus); + /* In very bizarre case of misconfiguration, fallback to all online. */ + if (unlikely(num_cpus == 0)) { + timer_cpus = *cpu_online_mask; + num_cpus = cpumask_weight(&timer_cpus); + } + + /* Basic CPU round-robin, which avoids the current CPU. */ + do { + cpu = cpumask_next(cpu, &timer_cpus); + if (cpu == nr_cpumask_bits) + cpu = cpumask_first(&timer_cpus); + } while (cpu == smp_processor_id() && num_cpus > 1); + + /* Expiring the timer at `jiffies` means it's the next tick. */ + stack->timer.expires = jiffies; + + add_timer_on(&stack->timer, cpu); + + preempt_enable(); + } + mix_pool_bytes(&stack->entropy, sizeof(stack->entropy)); schedule(); - stack.entropy = random_get_entropy(); + stack->entropy = random_get_entropy(); } + mix_pool_bytes(&stack->entropy, sizeof(stack->entropy));
- del_timer_sync(&stack.timer); - destroy_timer_on_stack(&stack.timer); - mix_pool_bytes(&stack.entropy, sizeof(stack.entropy)); + del_timer_sync(&stack->timer); + destroy_timer_on_stack(&stack->timer); }
@@@ -1291,7 -1384,7 +1384,7 @@@ SYSCALL_DEFINE3(getrandom, char __user return ret; }
- ret = import_single_range(READ, ubuf, len, &iov, &iter); + ret = import_single_range(ITER_DEST, ubuf, len, &iov, &iter); if (unlikely(ret)) return ret; return get_random_bytes_user(&iter); @@@ -1409,7 -1502,7 +1502,7 @@@ static long random_ioctl(struct file *f return -EINVAL; if (get_user(len, p++)) return -EFAULT; - ret = import_single_range(WRITE, p, len, &iov, &iter); + ret = import_single_range(ITER_SOURCE, p, len, &iov, &iter); if (unlikely(ret)) return ret; ret = write_pool_user(&iter); @@@ -1432,7 -1525,7 +1525,7 @@@ return -EPERM; if (!crng_ready()) return -ENODATA; - crng_reseed(); + crng_reseed(NULL); return 0; default: return -EINVAL; diff --combined drivers/firmware/efi/efi.c index 297bf3fd9acc,16dae588f0e3..31a4090c66b3 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@@ -58,8 -58,6 +58,8 @@@ static unsigned long __initdata mem_res static unsigned long __initdata rt_prop = EFI_INVALID_TABLE_ADDR; static unsigned long __initdata initrd = EFI_INVALID_TABLE_ADDR;
+extern unsigned long screen_info_table; + struct mm_struct efi_mm = { .mm_mt = MTREE_INIT_EXT(mm_mt, MM_MT_FLAGS, efi_mm.mmap_lock), .mm_users = ATOMIC_INIT(2), @@@ -339,6 -337,24 +339,24 @@@ static void __init efi_debugfs_init(voi static inline void efi_debugfs_init(void) {} #endif
+ static void refresh_nv_rng_seed(struct work_struct *work) + { + u8 seed[EFI_RANDOM_SEED_SIZE]; + + get_random_bytes(seed, sizeof(seed)); + efi.set_variable(L"RandomSeed", &LINUX_EFI_RANDOM_SEED_TABLE_GUID, + EFI_VARIABLE_NON_VOLATILE | EFI_VARIABLE_BOOTSERVICE_ACCESS | + EFI_VARIABLE_RUNTIME_ACCESS, sizeof(seed), seed); + memzero_explicit(seed, sizeof(seed)); + } + static int refresh_nv_rng_seed_notification(struct notifier_block *nb, unsigned long action, void *data) + { + static DECLARE_WORK(work, refresh_nv_rng_seed); + schedule_work(&work); + return NOTIFY_DONE; + } + static struct notifier_block refresh_nv_rng_seed_nb = { .notifier_call = refresh_nv_rng_seed_notification }; + /* * We register the efi subsystem with the firmware subsystem and the * efivars subsystem with the efi subsystem, if the system was booted with @@@ -396,6 -412,10 +414,6 @@@ static int __init efisubsys_init(void goto err_unregister; }
- error = efi_runtime_map_init(efi_kobj); - if (error) - goto err_remove_group; - /* and the standard mountpoint for efivarfs */ error = sysfs_create_mount_point(efi_kobj, "efivars"); if (error) { @@@ -411,6 -431,7 +429,7 @@@ platform_device_register_simple("efi_secret", 0, NULL, 0); #endif
+ execute_with_initialized_rng(&refresh_nv_rng_seed_nb); return 0;
err_remove_group: @@@ -421,7 -442,6 +440,7 @@@ err_unregister generic_ops_unregister(); err_put: kobject_put(efi_kobj); + efi_kobj = NULL; destroy_workqueue(efi_rts_wq); return error; } @@@ -545,9 -565,6 +564,9 @@@ static const efi_config_table_type_t co #endif #ifdef CONFIG_EFI_COCO_SECRET {LINUX_EFI_COCO_SECRET_AREA_GUID, &efi.coco_secret, "CocoSecret" }, +#endif +#ifdef CONFIG_EFI_GENERIC_STUB + {LINUX_EFI_SCREEN_INFO_TABLE_GUID, &screen_info_table }, #endif {}, }; @@@ -613,7 -630,7 +632,7 @@@ int __init efi_config_parse_tables(cons
seed = early_memremap(efi_rng_seed, sizeof(*seed)); if (seed != NULL) { - size = min(seed->size, EFI_RANDOM_SEED_SIZE); + size = min_t(u32, seed->size, SZ_1K); // sanity check early_memunmap(seed, sizeof(*seed)); } else { pr_err("Could not map UEFI random seed!\n"); @@@ -622,8 -639,8 +641,8 @@@ seed = early_memremap(efi_rng_seed, sizeof(*seed) + size); if (seed != NULL) { - pr_notice("seeding entropy pool\n"); add_bootloader_randomness(seed->bits, size); + memzero_explicit(seed->bits, size); early_memunmap(seed, sizeof(*seed) + size); } else { pr_err("Could not map UEFI random seed!\n"); diff --combined drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c index 29e9e8d5b6fe,29d2459bcc90..da09767fda07 --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c @@@ -30,7 -30,6 +30,7 @@@ #include "i915_gem_context.h" #include "i915_gem_evict.h" #include "i915_gem_ioctls.h" +#include "i915_reg.h" #include "i915_trace.h" #include "i915_user_extensions.h"
@@@ -54,13 -53,13 +54,13 @@@ enum #define DBG_FORCE_RELOC 0 /* choose one of the above! */ };
-/* __EXEC_OBJECT_NO_RESERVE is BIT(31), defined in i915_vma.h */ -#define __EXEC_OBJECT_HAS_PIN BIT(30) -#define __EXEC_OBJECT_HAS_FENCE BIT(29) -#define __EXEC_OBJECT_USERPTR_INIT BIT(28) -#define __EXEC_OBJECT_NEEDS_MAP BIT(27) -#define __EXEC_OBJECT_NEEDS_BIAS BIT(26) -#define __EXEC_OBJECT_INTERNAL_FLAGS (~0u << 26) /* all of the above + */ +/* __EXEC_OBJECT_ flags > BIT(29) defined in i915_vma.h */ +#define __EXEC_OBJECT_HAS_PIN BIT(29) +#define __EXEC_OBJECT_HAS_FENCE BIT(28) +#define __EXEC_OBJECT_USERPTR_INIT BIT(27) +#define __EXEC_OBJECT_NEEDS_MAP BIT(26) +#define __EXEC_OBJECT_NEEDS_BIAS BIT(25) +#define __EXEC_OBJECT_INTERNAL_FLAGS (~0u << 25) /* all of the above + */ #define __EXEC_OBJECT_RESERVED (__EXEC_OBJECT_HAS_PIN | __EXEC_OBJECT_HAS_FENCE)
#define __EXEC_HAS_RELOC BIT(31) @@@ -2102,8 -2101,7 +2102,8 @@@ static int eb_move_to_gpu(struct i915_e eb->composite_fence ? eb->composite_fence : &eb->requests[j]->fence, - flags | __EXEC_OBJECT_NO_RESERVE); + flags | __EXEC_OBJECT_NO_RESERVE | + __EXEC_OBJECT_NO_REQUEST_AWAIT); } }
@@@ -2150,8 -2148,7 +2150,8 @@@ err_skip return err; }
-static int i915_gem_check_execbuffer(struct drm_i915_gem_execbuffer2 *exec) +static int i915_gem_check_execbuffer(struct drm_i915_private *i915, + struct drm_i915_gem_execbuffer2 *exec) { if (exec->flags & __I915_EXEC_ILLEGAL_FLAGS) return -EINVAL; @@@ -2164,7 -2161,7 +2164,7 @@@ }
if (exec->DR4 == 0xffffffff) { - DRM_DEBUG("UXA submitting garbage DR4, fixing up\n"); + drm_dbg(&i915->drm, "UXA submitting garbage DR4, fixing up\n"); exec->DR4 = 0; } if (exec->DR1 || exec->DR4) @@@ -2427,7 -2424,7 +2427,7 @@@ gen8_dispatch_bsd_engine(struct drm_i91 /* Check whether the file_priv has already selected one ring. */ if ((int)file_priv->bsd_engine < 0) file_priv->bsd_engine = - prandom_u32_max(num_vcs_engines(dev_priv)); + get_random_u32_below(num_vcs_engines(dev_priv));
return file_priv->bsd_engine; } @@@ -2802,8 -2799,7 +2802,8 @@@ add_timeline_fence_array(struct i915_ex
syncobj = drm_syncobj_find(eb->file, user_fence.handle); if (!syncobj) { - DRM_DEBUG("Invalid syncobj handle provided\n"); + drm_dbg(&eb->i915->drm, + "Invalid syncobj handle provided\n"); return -ENOENT; }
@@@ -2811,8 -2807,7 +2811,8 @@@
if (!fence && user_fence.flags && !(user_fence.flags & I915_EXEC_FENCE_SIGNAL)) { - DRM_DEBUG("Syncobj handle has no fence\n"); + drm_dbg(&eb->i915->drm, + "Syncobj handle has no fence\n"); drm_syncobj_put(syncobj); return -EINVAL; } @@@ -2821,9 -2816,7 +2821,9 @@@ err = dma_fence_chain_find_seqno(&fence, point);
if (err && !(user_fence.flags & I915_EXEC_FENCE_SIGNAL)) { - DRM_DEBUG("Syncobj handle missing requested point %llu\n", point); + drm_dbg(&eb->i915->drm, + "Syncobj handle missing requested point %llu\n", + point); dma_fence_put(fence); drm_syncobj_put(syncobj); return err; @@@ -2849,8 -2842,7 +2849,8 @@@ * 0) would break the timeline. */ if (user_fence.flags & I915_EXEC_FENCE_WAIT) { - DRM_DEBUG("Trying to wait & signal the same timeline point.\n"); + drm_dbg(&eb->i915->drm, + "Trying to wait & signal the same timeline point.\n"); dma_fence_put(fence); drm_syncobj_put(syncobj); return -EINVAL; @@@ -2921,16 -2913,14 +2921,16 @@@ static int add_fence_array(struct i915_
syncobj = drm_syncobj_find(eb->file, user_fence.handle); if (!syncobj) { - DRM_DEBUG("Invalid syncobj handle provided\n"); + drm_dbg(&eb->i915->drm, + "Invalid syncobj handle provided\n"); return -ENOENT; }
if (user_fence.flags & I915_EXEC_FENCE_WAIT) { fence = drm_syncobj_fence_get(syncobj); if (!fence) { - DRM_DEBUG("Syncobj handle has no fence\n"); + drm_dbg(&eb->i915->drm, + "Syncobj handle has no fence\n"); drm_syncobj_put(syncobj); return -EINVAL; } @@@ -2964,6 -2954,11 +2964,6 @@@ await_fence_array(struct i915_execbuffe int err;
for (n = 0; n < eb->num_fences; n++) { - struct drm_syncobj *syncobj; - unsigned int flags; - - syncobj = ptr_unpack_bits(eb->fences[n].syncobj, &flags, 2); - if (!eb->fences[n].dma_fence) continue;
@@@ -3525,7 -3520,7 +3525,7 @@@ i915_gem_execbuffer2_ioctl(struct drm_d return -EINVAL; }
- err = i915_gem_check_execbuffer(args); + err = i915_gem_check_execbuffer(i915, args); if (err) return err;
diff --combined drivers/gpu/drm/i915/gt/intel_execlists_submission.c index 49a8f10d76c7,45b605e32c87..2daffa7c7dfd --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c @@@ -110,7 -110,6 +110,7 @@@ #include <linux/string_helpers.h>
#include "i915_drv.h" +#include "i915_reg.h" #include "i915_trace.h" #include "i915_vgpu.h" #include "gen8_engine_cs.h" @@@ -3472,9 -3471,9 +3472,9 @@@ logical_ring_default_vfuncs(struct inte
if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50)) { if (intel_engine_has_preemption(engine)) - engine->emit_bb_start = gen125_emit_bb_start; + engine->emit_bb_start = xehp_emit_bb_start; else - engine->emit_bb_start = gen125_emit_bb_start_noarb; + engine->emit_bb_start = xehp_emit_bb_start_noarb; } else { if (intel_engine_has_preemption(engine)) engine->emit_bb_start = gen8_emit_bb_start; @@@ -3690,7 -3689,7 +3690,7 @@@ static void virtual_engine_initial_hint * NB This does not force us to execute on this engine, it will just * typically be the first we inspect for submission. */ - swp = prandom_u32_max(ve->num_siblings); + swp = get_random_u32_below(ve->num_siblings); if (swp) swap(ve->siblings[swp], ve->siblings[0]); } @@@ -3922,7 -3921,6 +3922,7 @@@ static struct intel_context execlists_create_virtual(struct intel_engine_cs **siblings, unsigned int count, unsigned long flags) { + struct drm_i915_private *i915 = siblings[0]->i915; struct virtual_engine *ve; unsigned int n; int err; @@@ -3931,7 -3929,7 +3931,7 @@@ if (!ve) return ERR_PTR(-ENOMEM);
- ve->base.i915 = siblings[0]->i915; + ve->base.i915 = i915; ve->base.gt = siblings[0]->gt; ve->base.uncore = siblings[0]->uncore; ve->base.id = -1; @@@ -3990,9 -3988,8 +3990,9 @@@
GEM_BUG_ON(!is_power_of_2(sibling->mask)); if (sibling->mask & ve->base.mask) { - DRM_DEBUG("duplicate %s entry in load balancer\n", - sibling->name); + drm_dbg(&i915->drm, + "duplicate %s entry in load balancer\n", + sibling->name); err = -EINVAL; goto err_put; } @@@ -4026,9 -4023,8 +4026,9 @@@ */ if (ve->base.class != OTHER_CLASS) { if (ve->base.class != sibling->class) { - DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n", - sibling->class, ve->base.class); + drm_dbg(&i915->drm, + "invalid mixing of engine class, sibling %d, already %d\n", + sibling->class, ve->base.class); err = -EINVAL; goto err_put; } diff --combined drivers/infiniband/core/cma.c index 13e0ab785baa,aacd6254df77..68721ff10255 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@@ -47,7 -47,7 +47,7 @@@ MODULE_LICENSE("Dual BSD/GPL") #define CMA_CM_RESPONSE_TIMEOUT 20 #define CMA_MAX_CM_RETRIES 15 #define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24) -#define CMA_IBOE_PACKET_LIFETIME 18 +#define CMA_IBOE_PACKET_LIFETIME 16 #define CMA_PREFERRED_ROCE_GID_TYPE IB_GID_TYPE_ROCE_UDP_ENCAP
static const char * const cma_events[] = { @@@ -3807,7 -3807,7 +3807,7 @@@ static int cma_alloc_any_port(enum rdma
inet_get_local_port_range(net, &low, &high); remaining = (high - low) + 1; - rover = prandom_u32_max(remaining) + low; + rover = get_random_u32_inclusive(low, remaining + low - 1); retry: if (last_used_port != rover) { struct rdma_bind_list *bind_list; diff --combined drivers/infiniband/ulp/rtrs/rtrs-clt.c index 85656af2d313,ab75b690ad08..80abf45a197a --- a/drivers/infiniband/ulp/rtrs/rtrs-clt.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.c @@@ -966,7 -966,7 +966,7 @@@ static void rtrs_clt_init_req(struct rt refcount_set(&req->ref, 1); req->mp_policy = clt_path->clt->mp_policy;
- iov_iter_kvec(&iter, READ, vec, 1, usr_len); + iov_iter_kvec(&iter, ITER_SOURCE, vec, 1, usr_len); len = _copy_from_iter(req->iu->buf, usr_len, &iter); WARN_ON(len != usr_len);
@@@ -1064,8 -1064,10 +1064,8 @@@ static int rtrs_map_sg_fr(struct rtrs_c
/* Align the MR to a 4K page size to match the block virt boundary */ nr = ib_map_mr_sg(req->mr, req->sglist, count, NULL, SZ_4K); - if (nr < 0) - return nr; - if (nr < req->sg_cnt) - return -EINVAL; + if (nr != count) + return nr < 0 ? nr : -EINVAL; ib_update_fast_reg_key(req->mr, ib_inc_rkey(req->mr->rkey));
return nr; @@@ -1515,7 -1517,7 +1515,7 @@@ static void rtrs_clt_err_recovery_work( rtrs_clt_stop_and_destroy_conns(clt_path); queue_delayed_work(rtrs_wq, &clt_path->reconnect_dwork, msecs_to_jiffies(delay_ms + - prandom_u32_max(RTRS_RECONNECT_SEED))); + get_random_u32_below(RTRS_RECONNECT_SEED))); }
static struct rtrs_clt_path *alloc_path(struct rtrs_clt_sess *clt, diff --combined drivers/md/bcache/request.c index 39c7b607f8aa,32e21ba64357..67a2e29e0b40 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@@ -244,7 -244,7 +244,7 @@@ static void bch_data_insert_start(struc trace_bcache_cache_insert(k); bch_keylist_push(&op->insert_keys);
- bio_set_op_attrs(n, REQ_OP_WRITE, 0); + n->bi_opf = REQ_OP_WRITE; bch_submit_bbio(n, op->c, k, 0); } while (n != bio);
@@@ -401,7 -401,7 +401,7 @@@ static bool check_should_bypass(struct }
if (bypass_torture_test(dc)) { - if (prandom_u32_max(4) == 3) + if (get_random_u32_below(4) == 3) goto skip; else goto rescale; diff --combined drivers/media/test-drivers/vidtv/vidtv_demod.c index c2300b30407b,d60c6d16beea..b878db798686 --- a/drivers/media/test-drivers/vidtv/vidtv_demod.c +++ b/drivers/media/test-drivers/vidtv/vidtv_demod.c @@@ -188,11 -188,11 +188,11 @@@ static void vidtv_demod_update_stats(st * Also, usually, signal strength is a negative number in dBm. */ c->strength.stat[0].svalue = state->tuner_cnr; - c->strength.stat[0].svalue -= prandom_u32_max(state->tuner_cnr / 50); + c->strength.stat[0].svalue -= get_random_u32_below(state->tuner_cnr / 50); c->strength.stat[0].svalue -= 68000; /* Adjust to a better range */
c->cnr.stat[0].svalue = state->tuner_cnr; - c->cnr.stat[0].svalue -= prandom_u32_max(state->tuner_cnr / 50); + c->cnr.stat[0].svalue -= get_random_u32_below(state->tuner_cnr / 50); }
static int vidtv_demod_read_status(struct dvb_frontend *fe, @@@ -213,11 -213,11 +213,11 @@@
if (snr < cnr2qual->cnr_ok) { /* eventually lose the TS lock */ - if (prandom_u32_max(100) < config->drop_tslock_prob_on_low_snr) + if (get_random_u32_below(100) < config->drop_tslock_prob_on_low_snr) state->status = 0; } else { /* recover if the signal improves */ - if (prandom_u32_max(100) < + if (get_random_u32_below(100) < config->recover_tslock_prob_on_good_snr) state->status = FE_HAS_SIGNAL | FE_HAS_CARRIER | @@@ -412,7 -412,8 +412,7 @@@ static const struct i2c_device_id vidtv }; MODULE_DEVICE_TABLE(i2c, vidtv_demod_i2c_id_table);
-static int vidtv_demod_i2c_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int vidtv_demod_i2c_probe(struct i2c_client *client) { struct vidtv_tuner_config *config = client->dev.platform_data; struct vidtv_demod_state *state; @@@ -449,7 -450,7 +449,7 @@@ static struct i2c_driver vidtv_demod_i2 .name = "dvb_vidtv_demod", .suppress_bind_attrs = true, }, - .probe = vidtv_demod_i2c_probe, + .probe_new = vidtv_demod_i2c_probe, .remove = vidtv_demod_i2c_remove, .id_table = vidtv_demod_i2c_id_table, }; diff --combined drivers/mmc/core/core.c index c04f5016807b,a1efda85c6f2..426c7f66b349 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@@ -56,7 -56,7 +56,7 @@@ static const unsigned freqs[] = { 40000 /* * Enabling software CRCs on the data blocks can be a significant (30%) * performance cost, and for other reasons may not always be desired. - * So we allow it it to be disabled. + * So we allow it to be disabled. */ bool use_spi_crc = 1; module_param(use_spi_crc, bool, 0); @@@ -97,8 -97,8 +97,8 @@@ static void mmc_should_fail_request(str !should_fail(&host->fail_mmc_request, data->blksz * data->blocks)) return;
- data->error = data_errors[prandom_u32_max(ARRAY_SIZE(data_errors))]; - data->bytes_xfered = prandom_u32_max(data->bytes_xfered >> 9) << 9; + data->error = data_errors[get_random_u32_below(ARRAY_SIZE(data_errors))]; + data->bytes_xfered = get_random_u32_below(data->bytes_xfered >> 9) << 9; }
#else /* CONFIG_FAIL_MMC_REQUEST */ @@@ -142,7 -142,8 +142,7 @@@ void mmc_request_done(struct mmc_host * int err = cmd->error;
/* Flag re-tuning needed on CRC errors */ - if (cmd->opcode != MMC_SEND_TUNING_BLOCK && - cmd->opcode != MMC_SEND_TUNING_BLOCK_HS200 && + if (!mmc_op_tuning(cmd->opcode) && !host->retune_crc_disable && (err == -EILSEQ || (mrq->sbc && mrq->sbc->error == -EILSEQ) || (mrq->data && mrq->data->error == -EILSEQ) || @@@ -526,7 -527,7 +526,7 @@@ EXPORT_SYMBOL(mmc_cqe_post_req) * mmc_cqe_recovery - Recover from CQE errors. * @host: MMC host to recover * - * Recovery consists of stopping CQE, stopping eMMC, discarding the queue in + * Recovery consists of stopping CQE, stopping eMMC, discarding the queue * in eMMC, and discarding the queue in CQE. CQE must call * mmc_cqe_request_done() on all requests. An error is returned if the eMMC * fails to discard its queue. @@@ -1133,13 -1134,7 +1133,13 @@@ u32 mmc_select_voltage(struct mmc_host mmc_power_cycle(host, ocr); } else { bit = fls(ocr) - 1; - ocr &= 3 << bit; + /* + * The bit variable represents the highest voltage bit set in + * the OCR register. + * To keep a range of 2 values (e.g. 3.2V/3.3V and 3.3V/3.4V), + * we must shift the mask '3' with (bit - 1). + */ + ocr &= 3 << (bit - 1); if (bit != host->ios.vdd) dev_warn(mmc_dev(host), "exceeding card's volts\n"); } @@@ -1483,11 -1478,6 +1483,11 @@@ void mmc_init_erase(struct mmc_card *ca card->pref_erase = 0; }
+static bool is_trim_arg(unsigned int arg) +{ + return (arg & MMC_TRIM_OR_DISCARD_ARGS) && arg != MMC_DISCARD_ARG; +} + static unsigned int mmc_mmc_erase_timeout(struct mmc_card *card, unsigned int arg, unsigned int qty) { @@@ -1770,7 -1760,7 +1770,7 @@@ int mmc_erase(struct mmc_card *card, un !(card->ext_csd.sec_feature_support & EXT_CSD_SEC_ER_EN)) return -EOPNOTSUPP;
- if (mmc_card_mmc(card) && (arg & MMC_TRIM_ARGS) && + if (mmc_card_mmc(card) && is_trim_arg(arg) && !(card->ext_csd.sec_feature_support & EXT_CSD_SEC_GB_CL_EN)) return -EOPNOTSUPP;
@@@ -1800,7 -1790,7 +1800,7 @@@ * identified by the card->eg_boundary flag. */ rem = card->erase_size - (from % card->erase_size); - if ((arg & MMC_TRIM_ARGS) && (card->eg_boundary) && (nr > rem)) { + if ((arg & MMC_TRIM_OR_DISCARD_ARGS) && card->eg_boundary && nr > rem) { err = mmc_do_erase(card, from, from + rem - 1, arg); from += rem; if ((err) || (to <= from)) diff --combined drivers/mmc/host/dw_mmc.c index a0e2fdbee690,6ef410053037..829af2c98a44 --- a/drivers/mmc/host/dw_mmc.c +++ b/drivers/mmc/host/dw_mmc.c @@@ -334,7 -334,8 +334,7 @@@ static u32 dw_mci_prep_stop_abort(struc cmdr == MMC_READ_MULTIPLE_BLOCK || cmdr == MMC_WRITE_BLOCK || cmdr == MMC_WRITE_MULTIPLE_BLOCK || - cmdr == MMC_SEND_TUNING_BLOCK || - cmdr == MMC_SEND_TUNING_BLOCK_HS200 || + mmc_op_tuning(cmdr) || cmdr == MMC_GEN_CMD) { stop->opcode = MMC_STOP_TRANSMISSION; stop->arg = 0; @@@ -1362,7 -1363,7 +1362,7 @@@ static void __dw_mci_start_request(stru * is just about to roll over. * * We do this whole thing under spinlock and only if the - * command hasn't already completed (indicating the the irq + * command hasn't already completed (indicating the irq * already ran so we don't want the timeout). */ spin_lock_irqsave(&host->irq_lock, irqflags); @@@ -1857,7 -1858,7 +1857,7 @@@ static void dw_mci_start_fault_timer(st * Try to inject the error at random points during the data transfer. */ hrtimer_start(&host->fault_timer, - ms_to_ktime(prandom_u32_max(25)), + ms_to_ktime(get_random_u32_below(25)), HRTIMER_MODE_REL); }
diff --combined drivers/mtd/ubi/debug.c index 552b765af810,fcca6942dbdd..27168f511d6d --- a/drivers/mtd/ubi/debug.c +++ b/drivers/mtd/ubi/debug.c @@@ -504,7 -504,6 +504,7 @@@ int ubi_debugfs_init_dev(struct ubi_dev { unsigned long ubi_num = ubi->ubi_num; struct ubi_debug_info *d = &ubi->dbg; + umode_t mode = S_IRUSR | S_IWUSR; int n;
if (!IS_ENABLED(CONFIG_DEBUG_FS)) @@@ -519,41 -518,41 +519,41 @@@
d->dfs_dir = debugfs_create_dir(d->dfs_dir_name, dfs_rootdir);
- d->dfs_chk_gen = debugfs_create_file("chk_gen", S_IWUSR, d->dfs_dir, + d->dfs_chk_gen = debugfs_create_file("chk_gen", mode, d->dfs_dir, (void *)ubi_num, &dfs_fops);
- d->dfs_chk_io = debugfs_create_file("chk_io", S_IWUSR, d->dfs_dir, + d->dfs_chk_io = debugfs_create_file("chk_io", mode, d->dfs_dir, (void *)ubi_num, &dfs_fops);
- d->dfs_chk_fastmap = debugfs_create_file("chk_fastmap", S_IWUSR, + d->dfs_chk_fastmap = debugfs_create_file("chk_fastmap", mode, d->dfs_dir, (void *)ubi_num, &dfs_fops);
- d->dfs_disable_bgt = debugfs_create_file("tst_disable_bgt", S_IWUSR, + d->dfs_disable_bgt = debugfs_create_file("tst_disable_bgt", mode, d->dfs_dir, (void *)ubi_num, &dfs_fops);
d->dfs_emulate_bitflips = debugfs_create_file("tst_emulate_bitflips", - S_IWUSR, d->dfs_dir, + mode, d->dfs_dir, (void *)ubi_num, &dfs_fops);
d->dfs_emulate_io_failures = debugfs_create_file("tst_emulate_io_failures", - S_IWUSR, d->dfs_dir, + mode, d->dfs_dir, (void *)ubi_num, &dfs_fops);
d->dfs_emulate_power_cut = debugfs_create_file("tst_emulate_power_cut", - S_IWUSR, d->dfs_dir, + mode, d->dfs_dir, (void *)ubi_num, &dfs_fops);
d->dfs_power_cut_min = debugfs_create_file("tst_emulate_power_cut_min", - S_IWUSR, d->dfs_dir, + mode, d->dfs_dir, (void *)ubi_num, &dfs_fops);
d->dfs_power_cut_max = debugfs_create_file("tst_emulate_power_cut_max", - S_IWUSR, d->dfs_dir, + mode, d->dfs_dir, (void *)ubi_num, &dfs_fops);
debugfs_create_file("detailed_erase_block_info", S_IRUSR, d->dfs_dir, @@@ -591,7 -590,7 +591,7 @@@ int ubi_dbg_power_cut(struct ubi_devic
if (ubi->dbg.power_cut_max > ubi->dbg.power_cut_min) { range = ubi->dbg.power_cut_max - ubi->dbg.power_cut_min; - ubi->dbg.power_cut_counter += prandom_u32_max(range); + ubi->dbg.power_cut_counter += get_random_u32_below(range); } return 0; } diff --combined drivers/net/ethernet/broadcom/cnic.c index ad74b488a80a,74bc053a2078..7926aaef8f0c --- a/drivers/net/ethernet/broadcom/cnic.c +++ b/drivers/net/ethernet/broadcom/cnic.c @@@ -1027,14 -1027,16 +1027,14 @@@ static int __cnic_alloc_uio_rings(struc
udev->l2_ring_size = pages * CNIC_PAGE_SIZE; udev->l2_ring = dma_alloc_coherent(&udev->pdev->dev, udev->l2_ring_size, - &udev->l2_ring_map, - GFP_KERNEL | __GFP_COMP); + &udev->l2_ring_map, GFP_KERNEL); if (!udev->l2_ring) return -ENOMEM;
udev->l2_buf_size = (cp->l2_rx_ring_size + 1) * cp->l2_single_buf_size; udev->l2_buf_size = CNIC_PAGE_ALIGN(udev->l2_buf_size); udev->l2_buf = dma_alloc_coherent(&udev->pdev->dev, udev->l2_buf_size, - &udev->l2_buf_map, - GFP_KERNEL | __GFP_COMP); + &udev->l2_buf_map, GFP_KERNEL); if (!udev->l2_buf) { __cnic_free_uio_rings(udev); return -ENOMEM; @@@ -4103,7 -4105,7 +4103,7 @@@ static int cnic_cm_alloc_mem(struct cni for (i = 0; i < MAX_CM_SK_TBL_SZ; i++) atomic_set(&cp->csk_tbl[i].ref_count, 0);
- port_id = prandom_u32_max(CNIC_LOCAL_PORT_RANGE); + port_id = get_random_u32_below(CNIC_LOCAL_PORT_RANGE); if (cnic_init_id_tbl(&cp->csk_port_tbl, CNIC_LOCAL_PORT_RANGE, CNIC_LOCAL_PORT_MIN, port_id)) { cnic_cm_free_mem(dev); diff --combined drivers/net/phy/at803x.c index d49965907561,b07513c61c35..22f4458274aa --- a/drivers/net/phy/at803x.c +++ b/drivers/net/phy/at803x.c @@@ -870,10 -870,8 +870,10 @@@ static int at803x_probe(struct phy_devi .wolopts = 0, };
- if (ccr < 0) + if (ccr < 0) { + ret = ccr; goto err; + } mode_cfg = ccr & AT803X_MODE_CFG_MASK;
switch (mode_cfg) { @@@ -1760,7 -1758,7 +1760,7 @@@ static int qca808x_phy_fast_retrain_con
static int qca808x_phy_ms_random_seed_set(struct phy_device *phydev) { - u16 seed_value = prandom_u32_max(QCA808X_MASTER_SLAVE_SEED_RANGE); + u16 seed_value = get_random_u32_below(QCA808X_MASTER_SLAVE_SEED_RANGE);
return at803x_debug_reg_mask(phydev, QCA808X_PHY_DEBUG_LOCAL_SEED, QCA808X_MASTER_SLAVE_SEED_CFG, diff --combined drivers/net/wireless/broadcom/brcm80211/brcmfmac/p2p.c index af8843507f3d,c704ca752138..e975d10e6009 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/p2p.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/p2p.c @@@ -1128,7 -1128,7 +1128,7 @@@ static void brcmf_p2p_afx_handler(struc if (afx_hdl->is_listen && afx_hdl->my_listen_chan) /* 100ms ~ 300ms */ err = brcmf_p2p_discover_listen(p2p, afx_hdl->my_listen_chan, - 100 * (1 + prandom_u32_max(3))); + 100 * get_random_u32_inclusive(1, 3)); else err = brcmf_p2p_act_frm_search(p2p, afx_hdl->peer_listen_chan);
@@@ -2424,12 -2424,8 +2424,12 @@@ int brcmf_p2p_del_vif(struct wiphy *wip brcmf_remove_interface(vif->ifp, true);
brcmf_cfg80211_arm_vif_event(cfg, NULL); - if (iftype != NL80211_IFTYPE_P2P_DEVICE) - p2p->bss_idx[P2PAPI_BSSCFG_CONNECTION].vif = NULL; + if (iftype != NL80211_IFTYPE_P2P_DEVICE) { + if (vif == p2p->bss_idx[P2PAPI_BSSCFG_CONNECTION].vif) + p2p->bss_idx[P2PAPI_BSSCFG_CONNECTION].vif = NULL; + if (vif == p2p->bss_idx[P2PAPI_BSSCFG_CONNECTION2].vif) + p2p->bss_idx[P2PAPI_BSSCFG_CONNECTION2].vif = NULL; + }
return err; } diff --combined drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c index 83abfe996138,3a7a44bb3c60..aa791dbc3066 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac-ctxt.c @@@ -788,40 -788,14 +788,40 @@@ static u32 iwl_mvm_find_ie_offset(u8 *b return ie - beacon; }
-u8 iwl_mvm_mac_ctxt_get_lowest_rate(struct ieee80211_tx_info *info, - struct ieee80211_vif *vif) +static u8 iwl_mvm_mac_ctxt_get_lowest_rate(struct iwl_mvm *mvm, + struct ieee80211_tx_info *info, + struct ieee80211_vif *vif) { + struct ieee80211_supported_band *sband; + unsigned long basic = vif->bss_conf.basic_rates; + u16 lowest_cck = IWL_RATE_COUNT, lowest_ofdm = IWL_RATE_COUNT; u8 rate; - if (info->band == NL80211_BAND_2GHZ && !vif->p2p) - rate = IWL_FIRST_CCK_RATE; - else - rate = IWL_FIRST_OFDM_RATE; + u32 i; + + sband = mvm->hw->wiphy->bands[info->band]; + for_each_set_bit(i, &basic, BITS_PER_LONG) { + u16 hw = sband->bitrates[i].hw_value; + + if (hw >= IWL_FIRST_OFDM_RATE) { + if (lowest_ofdm > hw) + lowest_ofdm = hw; + } else if (lowest_cck > hw) { + lowest_cck = hw; + } + } + + if (info->band == NL80211_BAND_2GHZ && !vif->p2p) { + if (lowest_cck != IWL_RATE_COUNT) + rate = lowest_cck; + else if (lowest_ofdm != IWL_RATE_COUNT) + rate = lowest_ofdm; + else + rate = IWL_RATE_1M_INDEX; + } else if (lowest_ofdm != IWL_RATE_COUNT) { + rate = lowest_ofdm; + } else { + rate = IWL_RATE_6M_INDEX; + }
return rate; } @@@ -838,24 -812,6 +838,24 @@@ u16 iwl_mvm_mac_ctxt_get_beacon_flags(c return flags; }
+u8 iwl_mvm_mac_ctxt_get_beacon_rate(struct iwl_mvm *mvm, + struct ieee80211_tx_info *info, + struct ieee80211_vif *vif) +{ + struct ieee80211_supported_band *sband = + mvm->hw->wiphy->bands[info->band]; + u32 legacy = vif->bss_conf.beacon_tx_rate.control[info->band].legacy; + + /* if beacon rate was configured try using it */ + if (hweight32(legacy) == 1) { + u32 rate = ffs(legacy) - 1; + + return sband->bitrates[rate].hw_value; + } + + return iwl_mvm_mac_ctxt_get_lowest_rate(mvm, info, vif); +} + static void iwl_mvm_mac_ctxt_set_tx(struct iwl_mvm *mvm, struct ieee80211_vif *vif, struct sk_buff *beacon, @@@ -886,7 -842,7 +886,7 @@@ cpu_to_le32(BIT(mvm->mgmt_last_antenna_idx) << RATE_MCS_ANT_POS);
- rate = iwl_mvm_mac_ctxt_get_lowest_rate(info, vif); + rate = iwl_mvm_mac_ctxt_get_beacon_rate(mvm, info, vif);
tx->rate_n_flags |= cpu_to_le32(iwl_mvm_mac80211_idx_to_hwrate(mvm->fw, rate)); @@@ -970,7 -926,7 +970,7 @@@ static int iwl_mvm_mac_ctxt_send_beacon struct iwl_mvm_vif *mvmvif = iwl_mvm_vif_from_mac80211(vif); struct ieee80211_tx_info *info = IEEE80211_SKB_CB(beacon); struct iwl_mac_beacon_cmd beacon_cmd = {}; - u8 rate = iwl_mvm_mac_ctxt_get_lowest_rate(info, vif); + u8 rate = iwl_mvm_mac_ctxt_get_beacon_rate(mvm, info, vif); u16 flags; struct ieee80211_chanctx_conf *ctx; int channel; @@@ -1143,7 -1099,7 +1143,7 @@@ static void iwl_mvm_mac_ctxt_cmd_fill_a iwl_mvm_mac_ap_iterator, &data);
if (data.beacon_device_ts) { - u32 rand = prandom_u32_max(64 - 36) + 36; + u32 rand = get_random_u32_inclusive(36, 63); mvmvif->ap_beacon_time = data.beacon_device_ts + ieee80211_tu_to_usec(data.beacon_int * rand / 100); diff --combined drivers/pci/p2pdma.c index 27539770a613,5565f67d6537..86812d2073ea --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@@ -89,90 -89,6 +89,90 @@@ static ssize_t published_show(struct de } static DEVICE_ATTR_RO(published);
+static int p2pmem_alloc_mmap(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, struct vm_area_struct *vma) +{ + struct pci_dev *pdev = to_pci_dev(kobj_to_dev(kobj)); + size_t len = vma->vm_end - vma->vm_start; + struct pci_p2pdma *p2pdma; + struct percpu_ref *ref; + unsigned long vaddr; + void *kaddr; + int ret; + + /* prevent private mappings from being established */ + if ((vma->vm_flags & VM_MAYSHARE) != VM_MAYSHARE) { + pci_info_ratelimited(pdev, + "%s: fail, attempted private mapping\n", + current->comm); + return -EINVAL; + } + + if (vma->vm_pgoff) { + pci_info_ratelimited(pdev, + "%s: fail, attempted mapping with non-zero offset\n", + current->comm); + return -EINVAL; + } + + rcu_read_lock(); + p2pdma = rcu_dereference(pdev->p2pdma); + if (!p2pdma) { + ret = -ENODEV; + goto out; + } + + kaddr = (void *)gen_pool_alloc_owner(p2pdma->pool, len, (void **)&ref); + if (!kaddr) { + ret = -ENOMEM; + goto out; + } + + /* + * vm_insert_page() can sleep, so a reference is taken to mapping + * such that rcu_read_unlock() can be done before inserting the + * pages + */ + if (unlikely(!percpu_ref_tryget_live_rcu(ref))) { + ret = -ENODEV; + goto out_free_mem; + } + rcu_read_unlock(); + + for (vaddr = vma->vm_start; vaddr < vma->vm_end; vaddr += PAGE_SIZE) { + ret = vm_insert_page(vma, vaddr, virt_to_page(kaddr)); + if (ret) { + gen_pool_free(p2pdma->pool, (uintptr_t)kaddr, len); + return ret; + } + percpu_ref_get(ref); + put_page(virt_to_page(kaddr)); + kaddr += PAGE_SIZE; + len -= PAGE_SIZE; + } + + percpu_ref_put(ref); + + return 0; +out_free_mem: + gen_pool_free(p2pdma->pool, (uintptr_t)kaddr, len); +out: + rcu_read_unlock(); + return ret; +} + +static struct bin_attribute p2pmem_alloc_attr = { + .attr = { .name = "allocate", .mode = 0660 }, + .mmap = p2pmem_alloc_mmap, + /* + * Some places where we want to call mmap (ie. python) will check + * that the file size is greater than the mmap size before allowing + * the mmap to continue. To work around this, just set the size + * to be very large. + */ + .size = SZ_1T, +}; + static struct attribute *p2pmem_attrs[] = { &dev_attr_size.attr, &dev_attr_available.attr, @@@ -180,32 -96,11 +180,32 @@@ NULL, };
+static struct bin_attribute *p2pmem_bin_attrs[] = { + &p2pmem_alloc_attr, + NULL, +}; + static const struct attribute_group p2pmem_group = { .attrs = p2pmem_attrs, + .bin_attrs = p2pmem_bin_attrs, .name = "p2pmem", };
+static void p2pdma_page_free(struct page *page) +{ + struct pci_p2pdma_pagemap *pgmap = to_p2p_pgmap(page->pgmap); + struct percpu_ref *ref; + + gen_pool_free_owner(pgmap->provider->p2pdma->pool, + (uintptr_t)page_to_virt(page), PAGE_SIZE, + (void **)&ref); + percpu_ref_put(ref); +} + +static const struct dev_pagemap_ops p2pdma_pgmap_ops = { + .page_free = p2pdma_page_free, +}; + static void pci_p2pdma_release(void *data) { struct pci_dev *pdev = data; @@@ -257,19 -152,6 +257,19 @@@ out return error; }
+static void pci_p2pdma_unmap_mappings(void *data) +{ + struct pci_dev *pdev = data; + + /* + * Removing the alloc attribute from sysfs will call + * unmap_mapping_range() on the inode, teardown any existing userspace + * mappings and prevent new ones from being created. + */ + sysfs_remove_file_from_group(&pdev->dev.kobj, &p2pmem_alloc_attr.attr, + p2pmem_group.name); +} + /** * pci_p2pdma_add_resource - add memory for use as p2p memory * @pdev: the device to add the memory to @@@ -316,7 -198,6 +316,7 @@@ int pci_p2pdma_add_resource(struct pci_ pgmap->range.end = pgmap->range.start + size - 1; pgmap->nr_range = 1; pgmap->type = MEMORY_DEVICE_PCI_P2PDMA; + pgmap->ops = &p2pdma_pgmap_ops;
p2p_pgmap->provider = pdev; p2p_pgmap->bus_offset = pci_bus_address(pdev, bar) - @@@ -328,11 -209,6 +328,11 @@@ goto pgmap_free; }
+ error = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_unmap_mappings, + pdev); + if (error) + goto pages_free; + p2pdma = rcu_dereference_protected(pdev->p2pdma, 1); error = gen_pool_add_owner(p2pdma->pool, (unsigned long)addr, pci_bus_address(pdev, bar) + offset, @@@ -797,7 -673,7 +797,7 @@@ struct pci_dev *pci_p2pmem_find_many(st }
if (dev_cnt) - pdev = pci_dev_get(closest_pdevs[prandom_u32_max(dev_cnt)]); + pdev = pci_dev_get(closest_pdevs[get_random_u32_below(dev_cnt)]);
for (i = 0; i < dev_cnt; i++) pci_dev_put(closest_pdevs[i]); diff --combined drivers/scsi/scsi_debug.c index b77035ddc944,a86db9761d00..cc6953809a24 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c @@@ -3785,7 -3785,7 +3785,7 @@@ static int resp_write_scat(struct scsi_ mk_sense_buffer(scp, ILLEGAL_REQUEST, INVALID_FIELD_IN_CDB, 0); return illegal_condition_result; } - lrdp = kzalloc(lbdof_blen, GFP_ATOMIC); + lrdp = kzalloc(lbdof_blen, GFP_ATOMIC | __GFP_NOWARN); if (lrdp == NULL) return SCSI_MLQUEUE_HOST_BUSY; if (sdebug_verbose) @@@ -4436,7 -4436,7 +4436,7 @@@ static int resp_verify(struct scsi_cmn if (ret) return ret;
- arr = kcalloc(lb_size, vnum, GFP_ATOMIC); + arr = kcalloc(lb_size, vnum, GFP_ATOMIC | __GFP_NOWARN); if (!arr) { mk_sense_buffer(scp, ILLEGAL_REQUEST, INSUFF_RES_ASC, INSUFF_RES_ASCQ); @@@ -4504,7 -4504,7 +4504,7 @@@ static int resp_report_zones(struct scs
rep_max_zones = (alloc_len - 64) >> ilog2(RZONES_DESC_HD);
- arr = kzalloc(alloc_len, GFP_ATOMIC); + arr = kzalloc(alloc_len, GFP_ATOMIC | __GFP_NOWARN); if (!arr) { mk_sense_buffer(scp, ILLEGAL_REQUEST, INSUFF_RES_ASC, INSUFF_RES_ASCQ); @@@ -5702,16 -5702,16 +5702,16 @@@ static int schedule_resp(struct scsi_cm u64 ns = jiffies_to_nsecs(delta_jiff);
if (sdebug_random && ns < U32_MAX) { - ns = prandom_u32_max((u32)ns); + ns = get_random_u32_below((u32)ns); } else if (sdebug_random) { ns >>= 12; /* scale to 4 usec precision */ if (ns < U32_MAX) /* over 4 hours max */ - ns = prandom_u32_max((u32)ns); + ns = get_random_u32_below((u32)ns); ns <<= 12; } kt = ns_to_ktime(ns); } else { /* ndelay has a 4.2 second max */ - kt = sdebug_random ? prandom_u32_max((u32)ndelay) : + kt = sdebug_random ? get_random_u32_below((u32)ndelay) : (u32)ndelay; if (ndelay < INCLUSIVE_TIMING_MAX_NS) { u64 d = ktime_get_boottime_ns() - ns_from_boot; @@@ -7323,12 -7323,8 +7323,12 @@@ static int sdebug_add_host_helper(int p dev_set_name(&sdbg_host->dev, "adapter%d", sdebug_num_hosts);
error = device_register(&sdbg_host->dev); - if (error) + if (error) { + spin_lock(&sdebug_host_list_lock); + list_del(&sdbg_host->host_list); + spin_unlock(&sdebug_host_list_lock); goto clean; + }
++sdebug_num_hosts; return 0; @@@ -7340,10 -7336,7 +7340,10 @@@ clean kfree(sdbg_devinfo->zstate); kfree(sdbg_devinfo); } - kfree(sdbg_host); + if (sdbg_host->dev.release) + put_device(&sdbg_host->dev); + else + kfree(sdbg_host); pr_warn("%s: failed, errno=%d\n", __func__, -error); return error; } diff --combined fs/ceph/inode.c index e6f25f2834ac,fb255988dee8..e22e631cb115 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@@ -126,7 -126,7 +126,7 @@@ const struct inode_operations ceph_file .setattr = ceph_setattr, .getattr = ceph_getattr, .listxattr = ceph_listxattr, - .get_acl = ceph_get_acl, + .get_inode_acl = ceph_get_acl, .set_acl = ceph_set_acl, };
@@@ -362,7 -362,7 +362,7 @@@ static int ceph_fill_fragtree(struct in if (nsplits != ci->i_fragtree_nsplits) { update = true; } else if (nsplits) { - i = prandom_u32_max(nsplits); + i = get_random_u32_below(nsplits); id = le32_to_cpu(fragtree->splits[i].frag); if (!__ceph_find_frag(ci, id)) update = true; @@@ -1909,7 -1909,7 +1909,7 @@@ static void ceph_do_invalidate_pages(st mutex_unlock(&ci->i_truncate_mutex); out: if (check) - ceph_check_caps(ci, 0, NULL); + ceph_check_caps(ci, 0); }
/* @@@ -1969,7 -1969,7 +1969,7 @@@ retry mutex_unlock(&ci->i_truncate_mutex);
if (wrbuffer_refs == 0) - ceph_check_caps(ci, 0, NULL); + ceph_check_caps(ci, 0);
wake_up_all(&ci->i_cap_wq); } @@@ -1991,7 -1991,7 +1991,7 @@@ static void ceph_inode_work(struct work __ceph_do_pending_vmtruncate(inode);
if (test_and_clear_bit(CEPH_I_WORK_CHECK_CAPS, &ci->i_work_mask)) - ceph_check_caps(ci, 0, NULL); + ceph_check_caps(ci, 0);
if (test_and_clear_bit(CEPH_I_WORK_FLUSH_SNAPS, &ci->i_work_mask)) ceph_flush_snaps(ci, NULL); @@@ -2255,7 -2255,7 +2255,7 @@@ int ceph_setattr(struct user_namespace err = __ceph_setattr(inode, attr);
if (err >= 0 && (attr->ia_valid & ATTR_MODE)) - err = posix_acl_chmod(&init_user_ns, inode, attr->ia_mode); + err = posix_acl_chmod(&init_user_ns, dentry, attr->ia_mode);
return err; } @@@ -2417,10 -2417,10 +2417,10 @@@ static int statx_to_caps(u32 want, umod { int mask = 0;
- if (want & (STATX_MODE|STATX_UID|STATX_GID|STATX_CTIME|STATX_BTIME)) + if (want & (STATX_MODE|STATX_UID|STATX_GID|STATX_CTIME|STATX_BTIME|STATX_CHANGE_COOKIE)) mask |= CEPH_CAP_AUTH_SHARED;
- if (want & (STATX_NLINK|STATX_CTIME)) { + if (want & (STATX_NLINK|STATX_CTIME|STATX_CHANGE_COOKIE)) { /* * The link count for directories depends on inode->i_subdirs, * and that is only updated when Fs caps are held. @@@ -2431,10 -2431,11 +2431,10 @@@ mask |= CEPH_CAP_LINK_SHARED; }
- if (want & (STATX_ATIME|STATX_MTIME|STATX_CTIME|STATX_SIZE| - STATX_BLOCKS)) + if (want & (STATX_ATIME|STATX_MTIME|STATX_CTIME|STATX_SIZE|STATX_BLOCKS|STATX_CHANGE_COOKIE)) mask |= CEPH_CAP_FILE_SHARED;
- if (want & (STATX_CTIME)) + if (want & (STATX_CTIME|STATX_CHANGE_COOKIE)) mask |= CEPH_CAP_XATTR_SHARED;
return mask; @@@ -2477,11 -2478,6 +2477,11 @@@ int ceph_getattr(struct user_namespace valid_mask |= STATX_BTIME; }
+ if (request_mask & STATX_CHANGE_COOKIE) { + stat->change_cookie = inode_peek_iversion_raw(inode); + valid_mask |= STATX_CHANGE_COOKIE; + } + if (ceph_snap(inode) == CEPH_NOSNAP) stat->dev = sb->s_dev; else @@@ -2496,7 -2492,7 +2496,7 @@@ struct inode *parent;
parent = ceph_lookup_inode(sb, ceph_ino(inode)); - if (!parent) + if (IS_ERR(parent)) return PTR_ERR(parent);
pci = ceph_inode(parent); @@@ -2523,8 -2519,6 +2523,8 @@@ stat->nlink = 1 + 1 + ci->i_subdirs; }
+ stat->attributes_mask |= STATX_ATTR_CHANGE_MONOTONIC; + stat->attributes |= STATX_ATTR_CHANGE_MONOTONIC; stat->result_mask = request_mask & valid_mask; return err; } diff --combined fs/ext4/ialloc.c index 04cae21f75d5,9fc1af8e19a3..63f9bb6e8851 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@@ -465,7 -465,7 +465,7 @@@ static int find_group_orlov(struct supe ext4fs_dirhash(parent, qstr->name, qstr->len, &hinfo); parent_group = hinfo.hash % ngroups; } else - parent_group = prandom_u32_max(ngroups); + parent_group = get_random_u32_below(ngroups); for (i = 0; i < ngroups; i++) { g = (parent_group + i) % ngroups; get_orlov_stats(sb, g, flex_size, &stats); @@@ -870,7 -870,7 +870,7 @@@ static int ext4_xattr_credits_for_new_i struct super_block *sb = dir->i_sb; int nblocks = 0; #ifdef CONFIG_EXT4_FS_POSIX_ACL - struct posix_acl *p = get_acl(dir, ACL_TYPE_DEFAULT); + struct posix_acl *p = get_inode_acl(dir, ACL_TYPE_DEFAULT);
if (IS_ERR(p)) return PTR_ERR(p); @@@ -1076,8 -1076,8 +1076,8 @@@ repeat_in_this_group
if ((!(sbi->s_mount_state & EXT4_FC_REPLAY)) && !handle) { BUG_ON(nblocks <= 0); - handle = __ext4_journal_start_sb(dir->i_sb, line_no, - handle_type, nblocks, 0, + handle = __ext4_journal_start_sb(NULL, dir->i_sb, + line_no, handle_type, nblocks, 0, ext4_trans_default_revoke_credits(sb)); if (IS_ERR(handle)) { err = PTR_ERR(handle); diff --combined fs/ext4/super.c index 72ead3b56706,63ef74eb8091..16a343e8047d --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@@ -540,7 -540,8 +540,7 @@@ static int ext4_journal_submit_inode_da if (ext4_should_journal_data(jinode->i_vfs_inode)) ret = ext4_journalled_submit_inode_data_buffers(jinode); else - ret = jbd2_journal_submit_inode_data_buffers(jinode); - + ret = ext4_normal_submit_inode_data_buffers(jinode); return ret; }
@@@ -1205,8 -1206,7 +1205,8 @@@ static void ext4_put_super(struct super ext4_unregister_sysfs(sb);
if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs unmount")) - ext4_msg(sb, KERN_INFO, "unmounting filesystem."); + ext4_msg(sb, KERN_INFO, "unmounting filesystem %pU.", + &sb->s_uuid);
ext4_unregister_li_request(sb); ext4_quota_off_umount(sb); @@@ -1323,7 -1323,6 +1323,7 @@@ static struct inode *ext4_alloc_inode(s return NULL;
inode_set_iversion(&ei->vfs_inode, 1); + ei->i_flags = 0; spin_lock_init(&ei->i_raw_lock); INIT_LIST_HEAD(&ei->i_prealloc_list); atomic_set(&ei->i_prealloc_active, 0); @@@ -2248,7 -2247,7 +2248,7 @@@ static int ext4_parse_param(struct fs_c return -EINVAL; }
- error = fs_lookup_param(fc, param, 1, &path); + error = fs_lookup_param(fc, param, 1, LOOKUP_FOLLOW, &path); if (error) { ext4_msg(NULL, KERN_ERR, "error: could not find " "journal device path"); @@@ -3779,7 -3778,7 +3779,7 @@@ cont_thread } if (!progress) { elr->lr_next_sched = jiffies + - prandom_u32_max(EXT4_DEF_LI_MAX_START_DELAY * HZ); + get_random_u32_below(EXT4_DEF_LI_MAX_START_DELAY * HZ); } if (time_before(elr->lr_next_sched, next_wakeup)) next_wakeup = elr->lr_next_sched; @@@ -3926,8 -3925,7 +3926,7 @@@ static struct ext4_li_request *ext4_li_ * spread the inode table initialization requests * better. */ - elr->lr_next_sched = jiffies + prandom_u32_max( - EXT4_DEF_LI_MAX_START_DELAY * HZ); + elr->lr_next_sched = jiffies + get_random_u32_below(EXT4_DEF_LI_MAX_START_DELAY * HZ); return elr; }
@@@ -5288,15 -5286,14 +5287,15 @@@ static int __ext4_fill_super(struct fs_ goto failed_mount3a; } else { /* Nojournal mode, all journal mount options are illegal */ - if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) { + if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { ext4_msg(sb, KERN_ERR, "can't mount with " - "journal_checksum, fs mounted w/o journal"); + "journal_async_commit, fs mounted w/o journal"); goto failed_mount3a; } - if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { + + if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) { ext4_msg(sb, KERN_ERR, "can't mount with " - "journal_async_commit, fs mounted w/o journal"); + "journal_checksum, fs mounted w/o journal"); goto failed_mount3a; } if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) { @@@ -5657,9 -5654,8 +5656,9 @@@ static int ext4_fill_super(struct super descr = "out journal";
if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount")) - ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. " - "Quota mode: %s.", descr, ext4_quota_mode(sb)); + ext4_msg(sb, KERN_INFO, "mounted filesystem %pU with%s. " + "Quota mode: %s.", &sb->s_uuid, descr, + ext4_quota_mode(sb));
/* Update the s_overhead_clusters if necessary */ ext4_update_overhead(sb, false); @@@ -5726,7 -5722,7 +5725,7 @@@ static struct inode *ext4_get_journal_i
ext4_debug("Journal inode found at %p: %lld bytes\n", journal_inode, journal_inode->i_size); - if (!S_ISREG(journal_inode->i_mode)) { + if (!S_ISREG(journal_inode->i_mode) || IS_ENCRYPTED(journal_inode)) { ext4_msg(sb, KERN_ERR, "invalid journal inode"); iput(journal_inode); return NULL; @@@ -6614,8 -6610,8 +6613,8 @@@ static int ext4_reconfigure(struct fs_c if (ret < 0) return ret;
- ext4_msg(sb, KERN_INFO, "re-mounted. Quota mode: %s.", - ext4_quota_mode(sb)); + ext4_msg(sb, KERN_INFO, "re-mounted %pU. Quota mode: %s.", + &sb->s_uuid, ext4_quota_mode(sb));
return 0; } @@@ -6889,20 -6885,6 +6888,20 @@@ static int ext4_quota_on(struct super_b return err; }
+static inline bool ext4_check_quota_inum(int type, unsigned long qf_inum) +{ + switch (type) { + case USRQUOTA: + return qf_inum == EXT4_USR_QUOTA_INO; + case GRPQUOTA: + return qf_inum == EXT4_GRP_QUOTA_INO; + case PRJQUOTA: + return qf_inum >= EXT4_GOOD_OLD_FIRST_INO; + default: + BUG(); + } +} + static int ext4_quota_enable(struct super_block *sb, int type, int format_id, unsigned int flags) { @@@ -6919,16 -6901,9 +6918,16 @@@ if (!qf_inums[type]) return -EPERM;
+ if (!ext4_check_quota_inum(type, qf_inums[type])) { + ext4_error(sb, "Bad quota inum: %lu, type: %d", + qf_inums[type], type); + return -EUCLEAN; + } + qf_inode = ext4_iget(sb, qf_inums[type], EXT4_IGET_SPECIAL); if (IS_ERR(qf_inode)) { - ext4_error(sb, "Bad quota inode # %lu", qf_inums[type]); + ext4_error(sb, "Bad quota inode: %lu, type: %d", + qf_inums[type], type); return PTR_ERR(qf_inode); }
@@@ -6967,9 -6942,8 +6966,9 @@@ int ext4_enable_quotas(struct super_blo if (err) { ext4_warning(sb, "Failed to enable quota tracking " - "(type=%d, err=%d). Please run " - "e2fsck to fix.", type, err); + "(type=%d, err=%d, ino=%lu). " + "Please run e2fsck to fix.", type, + err, qf_inums[type]); for (type--; type >= 0; type--) { struct inode *inode;
@@@ -7056,7 -7030,8 +7055,7 @@@ static ssize_t ext4_quota_read(struct s len = i_size-off; toread = len; while (toread > 0) { - tocopy = sb->s_blocksize - offset < toread ? - sb->s_blocksize - offset : toread; + tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread); bh = ext4_bread(NULL, inode, blk, 0); if (IS_ERR(bh)) return PTR_ERR(bh); diff --combined fs/f2fs/gc.c index f0c6506d8975,536d332d9e2e..1f46685b6d5d --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@@ -96,6 -96,16 +96,6 @@@ static int gc_thread_func(void *data * invalidated soon after by user update or deletion. * So, I'd like to wait some time to collect dirty segments. */ - if (sbi->gc_mode == GC_URGENT_HIGH) { - spin_lock(&sbi->gc_urgent_high_lock); - if (sbi->gc_urgent_high_remaining) { - sbi->gc_urgent_high_remaining--; - if (!sbi->gc_urgent_high_remaining) - sbi->gc_mode = GC_NORMAL; - } - spin_unlock(&sbi->gc_urgent_high_lock); - } - if (sbi->gc_mode == GC_URGENT_HIGH || sbi->gc_mode == GC_URGENT_MID) { wait_ms = gc_th->urgent_sleep_time; @@@ -152,15 -162,6 +152,15 @@@ do_gc /* balancing f2fs's metadata periodically */ f2fs_balance_fs_bg(sbi, true); next: + if (sbi->gc_mode != GC_NORMAL) { + spin_lock(&sbi->gc_remaining_trials_lock); + if (sbi->gc_remaining_trials) { + sbi->gc_remaining_trials--; + if (!sbi->gc_remaining_trials) + sbi->gc_mode = GC_NORMAL; + } + spin_unlock(&sbi->gc_remaining_trials_lock); + } sb_end_write(sbi->sb);
} while (!kthread_should_stop()); @@@ -171,10 -172,13 +171,10 @@@ int f2fs_start_gc_thread(struct f2fs_sb { struct f2fs_gc_kthread *gc_th; dev_t dev = sbi->sb->s_bdev->bd_dev; - int err = 0;
gc_th = f2fs_kmalloc(sbi, sizeof(struct f2fs_gc_kthread), GFP_KERNEL); - if (!gc_th) { - err = -ENOMEM; - goto out; - } + if (!gc_th) + return -ENOMEM;
gc_th->urgent_sleep_time = DEF_GC_THREAD_URGENT_SLEEP_TIME; gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME; @@@ -189,14 -193,12 +189,14 @@@ sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi, "f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(gc_th->f2fs_gc_task)) { - err = PTR_ERR(gc_th->f2fs_gc_task); + int err = PTR_ERR(gc_th->f2fs_gc_task); + kfree(gc_th); sbi->gc_thread = NULL; + return err; } -out: - return err; + + return 0; }
void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi) @@@ -280,7 -282,7 +280,7 @@@ static void select_policy(struct f2fs_s
/* let's select beginning hot/small space first in no_heap mode*/ if (f2fs_need_rand_seg(sbi)) - p->offset = prandom_u32_max(MAIN_SECS(sbi) * sbi->segs_per_sec); + p->offset = get_random_u32_below(MAIN_SECS(sbi) * sbi->segs_per_sec); else if (test_opt(sbi, NOHEAP) && (type == CURSEG_HOT_DATA || IS_NODESEG(type))) p->offset = 0; @@@ -1077,7 -1079,7 +1077,7 @@@ static bool is_alive(struct f2fs_sb_inf { struct page *node_page; nid_t nid; - unsigned int ofs_in_node, max_addrs; + unsigned int ofs_in_node, max_addrs, base; block_t source_blkaddr;
nid = le32_to_cpu(sum->nid); @@@ -1103,18 -1105,11 +1103,18 @@@ return false; }
- max_addrs = IS_INODE(node_page) ? DEF_ADDRS_PER_INODE : - DEF_ADDRS_PER_BLOCK; - if (ofs_in_node >= max_addrs) { - f2fs_err(sbi, "Inconsistent ofs_in_node:%u in summary, ino:%u, nid:%u, max:%u", - ofs_in_node, dni->ino, dni->nid, max_addrs); + if (IS_INODE(node_page)) { + base = offset_in_addr(F2FS_INODE(node_page)); + max_addrs = DEF_ADDRS_PER_INODE; + } else { + base = 0; + max_addrs = DEF_ADDRS_PER_BLOCK; + } + + if (base + ofs_in_node >= max_addrs) { + f2fs_err(sbi, "Inconsistent blkaddr offset: base:%u, ofs_in_node:%u, max:%u, ino:%u, nid:%u", + base, ofs_in_node, max_addrs, dni->ino, dni->nid); + f2fs_put_page(node_page, 1); return false; }
@@@ -1146,7 -1141,7 +1146,7 @@@ static int ra_data_block(struct inode * struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; struct page *page; - struct extent_info ei = {0, 0, 0}; + struct extent_info ei = {0, }; struct f2fs_io_info fio = { .sbi = sbi, .ino = inode->i_ino, @@@ -1164,7 -1159,7 +1164,7 @@@ if (!page) return -ENOMEM;
- if (f2fs_lookup_extent_cache(inode, index, &ei)) { + if (f2fs_lookup_read_extent_cache(inode, index, &ei)) { dn.data_blkaddr = ei.blk + index - ei.fofs; if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr, DATA_GENERIC_ENHANCE_READ))) { @@@ -1568,8 -1563,8 +1568,8 @@@ next_step continue; }
- data_page = f2fs_get_read_data_page(inode, - start_bidx, REQ_RAHEAD, true); + data_page = f2fs_get_read_data_page(inode, start_bidx, + REQ_RAHEAD, true, NULL); f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); if (IS_ERR(data_page)) { iput(inode); @@@ -1749,9 -1744,8 +1749,9 @@@ freed get_valid_blocks(sbi, segno, false) == 0) seg_freed++;
- if (__is_large_section(sbi) && segno + 1 < end_segno) - sbi->next_victim_seg[gc_type] = segno + 1; + if (__is_large_section(sbi)) + sbi->next_victim_seg[gc_type] = + (segno + 1 < end_segno) ? segno + 1 : NULL_SEGNO; skip: f2fs_put_page(sum_page, 0); } @@@ -1904,7 -1898,9 +1904,7 @@@ int __init f2fs_create_garbage_collecti { victim_entry_slab = f2fs_kmem_cache_create("f2fs_victim_entry", sizeof(struct victim_entry)); - if (!victim_entry_slab) - return -ENOMEM; - return 0; + return victim_entry_slab ? 0 : -ENOMEM; }
void f2fs_destroy_garbage_collection_cache(void) @@@ -2137,6 -2133,8 +2137,6 @@@ out_unlock if (err) return err;
- set_sbi_flag(sbi, SBI_IS_RESIZEFS); - freeze_super(sbi->sb); f2fs_down_write(&sbi->gc_lock); f2fs_down_write(&sbi->cp_global_sem); @@@ -2152,7 -2150,6 +2152,7 @@@ if (err) goto out_err;
+ set_sbi_flag(sbi, SBI_IS_RESIZEFS); err = free_segment_range(sbi, secs, false); if (err) goto recover_out; @@@ -2176,7 -2173,6 +2176,7 @@@ f2fs_commit_super(sbi, false); } recover_out: + clear_sbi_flag(sbi, SBI_IS_RESIZEFS); if (err) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_err(sbi, "resize_fs failed, should run fsck to repair!"); @@@ -2189,5 -2185,6 +2189,5 @@@ out_err f2fs_up_write(&sbi->cp_global_sem); f2fs_up_write(&sbi->gc_lock); thaw_super(sbi->sb); - clear_sbi_flag(sbi, SBI_IS_RESIZEFS); return err; } diff --combined fs/f2fs/segment.c index a9099a754dd2,b304692c0cf5..25ddea478fc1 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@@ -192,19 -192,14 +192,19 @@@ void f2fs_abort_atomic_write(struct ino if (!f2fs_is_atomic_file(inode)) return;
- if (clean) - truncate_inode_pages_final(inode->i_mapping); clear_inode_flag(fi->cow_inode, FI_COW_FILE); iput(fi->cow_inode); fi->cow_inode = NULL; release_atomic_write_cnt(inode); + clear_inode_flag(inode, FI_ATOMIC_COMMITTED); + clear_inode_flag(inode, FI_ATOMIC_REPLACE); clear_inode_flag(inode, FI_ATOMIC_FILE); stat_dec_atomic_inode(inode); + + if (clean) { + truncate_inode_pages_final(inode->i_mapping); + f2fs_i_size_write(inode, fi->original_i_size); + } }
static int __replace_atomic_write_block(struct inode *inode, pgoff_t index, @@@ -262,19 -257,14 +262,19 @@@ static void __complete_revoke_list(stru bool revoke) { struct revoke_entry *cur, *tmp; + bool truncate = is_inode_flag_set(inode, FI_ATOMIC_REPLACE);
list_for_each_entry_safe(cur, tmp, head, list) { if (revoke) __replace_atomic_write_block(inode, cur->index, cur->old_addr, NULL, true); + list_del(&cur->list); kmem_cache_free(revoke_entry_slab, cur); } + + if (!revoke && truncate) + f2fs_do_truncate_blocks(inode, 0, false); }
static int __f2fs_commit_atomic_write(struct inode *inode) @@@ -345,12 -335,10 +345,12 @@@ next }
out: - if (ret) + if (ret) { sbi->revoked_atomic_block += fi->atomic_write_cnt; - else + } else { sbi->committed_atomic_block += fi->atomic_write_cnt; + set_inode_flag(inode, FI_ATOMIC_COMMITTED); + }
__complete_revoke_list(inode, &revoke_list, ret ? true : false);
@@@ -449,14 -437,8 +449,14 @@@ void f2fs_balance_fs_bg(struct f2fs_sb_ return;
/* try to shrink extent cache when there is no enough memory */ - if (!f2fs_available_free_memory(sbi, EXTENT_CACHE)) - f2fs_shrink_extent_tree(sbi, EXTENT_CACHE_SHRINK_NUMBER); + if (!f2fs_available_free_memory(sbi, READ_EXTENT_CACHE)) + f2fs_shrink_read_extent_tree(sbi, + READ_EXTENT_CACHE_SHRINK_NUMBER); + + /* try to shrink age extent cache when there is no enough memory */ + if (!f2fs_available_free_memory(sbi, AGE_EXTENT_CACHE)) + f2fs_shrink_age_extent_tree(sbi, + AGE_EXTENT_CACHE_SHRINK_NUMBER);
/* check the # of cached NAT entries */ if (!f2fs_available_free_memory(sbi, NAT_ENTRIES)) @@@ -638,11 -620,12 +638,11 @@@ int f2fs_create_flush_cmd_control(struc { dev_t dev = sbi->sb->s_bdev->bd_dev; struct flush_cmd_control *fcc; - int err = 0;
if (SM_I(sbi)->fcc_info) { fcc = SM_I(sbi)->fcc_info; if (fcc->f2fs_issue_flush) - return err; + return 0; goto init_thread; }
@@@ -655,20 -638,19 +655,20 @@@ init_llist_head(&fcc->issue_list); SM_I(sbi)->fcc_info = fcc; if (!test_opt(sbi, FLUSH_MERGE)) - return err; + return 0;
init_thread: fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(fcc->f2fs_issue_flush)) { - err = PTR_ERR(fcc->f2fs_issue_flush); + int err = PTR_ERR(fcc->f2fs_issue_flush); + kfree(fcc); SM_I(sbi)->fcc_info = NULL; return err; }
- return err; + return 0; }
void f2fs_destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free) @@@ -874,7 -856,7 +874,7 @@@ block_t f2fs_get_unusable_blocks(struc } mutex_unlock(&dirty_i->seglist_lock);
- unusable = holes[DATA] > holes[NODE] ? holes[DATA] : holes[NODE]; + unusable = max(holes[DATA], holes[NODE]); if (unusable > ovp_holes) return unusable - ovp_holes; return 0; @@@ -1070,8 -1052,8 +1070,8 @@@ static void __init_discard_policy(struc dpolicy->io_aware = true; dpolicy->sync = false; dpolicy->ordered = true; - if (utilization(sbi) > DEF_DISCARD_URGENT_UTIL) { - dpolicy->granularity = 1; + if (utilization(sbi) > dcc->discard_urgent_util) { + dpolicy->granularity = MIN_DISCARD_GRANULARITY; if (atomic_read(&dcc->discard_cmd_cnt)) dpolicy->max_interval = dcc->min_discard_issue_time; @@@ -1086,7 -1068,7 +1086,7 @@@ } else if (discard_type == DPOLICY_UMOUNT) { dpolicy->io_aware = false; /* we need to issue all to keep CP_TRIMMED_FLAG */ - dpolicy->granularity = 1; + dpolicy->granularity = MIN_DISCARD_GRANULARITY; dpolicy->timeout = true; } } @@@ -1144,12 -1126,13 +1144,12 @@@ static int __submit_discard_cmd(struct if (time_to_inject(sbi, FAULT_DISCARD)) { f2fs_show_injection_info(sbi, FAULT_DISCARD); err = -EIO; - goto submit; - } - err = __blkdev_issue_discard(bdev, + } else { + err = __blkdev_issue_discard(bdev, SECTOR_FROM_BLOCK(start), SECTOR_FROM_BLOCK(len), GFP_NOFS, &bio); -submit: + } if (err) { spin_lock_irqsave(&dc->lock, flags); if (dc->state == D_PARTIAL) @@@ -1187,7 -1170,7 +1187,7 @@@
atomic_inc(&dcc->issued_discard);
- f2fs_update_iostat(sbi, NULL, FS_DISCARD, 1); + f2fs_update_iostat(sbi, NULL, FS_DISCARD, len * F2FS_BLKSIZE);
lstart += len; start += len; @@@ -1359,13 -1342,13 +1359,13 @@@ static void __update_discard_tree_range } }
-static int __queue_discard_cmd(struct f2fs_sb_info *sbi, +static void __queue_discard_cmd(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { block_t lblkstart = blkstart;
if (!f2fs_bdev_support_discard(bdev)) - return 0; + return;
trace_f2fs_queue_discard(bdev, blkstart, blklen);
@@@ -1377,6 -1360,7 +1377,6 @@@ mutex_lock(&SM_I(sbi)->dcc_info->cmd_lock); __update_discard_tree_range(sbi, bdev, lblkstart, blkstart, blklen); mutex_unlock(&SM_I(sbi)->dcc_info->cmd_lock); - return 0; }
static unsigned int __issue_discard_cmd_orderly(struct f2fs_sb_info *sbi, @@@ -1464,7 -1448,7 +1464,7 @@@ retry if (i + 1 < dpolicy->granularity) break;
- if (i < DEFAULT_DISCARD_GRANULARITY && dpolicy->ordered) + if (i + 1 < dcc->max_ordered_discard && dpolicy->ordered) return __issue_discard_cmd_orderly(sbi, dpolicy);
pend_list = &dcc->pend_list[i]; @@@ -1661,9 -1645,6 +1661,9 @@@ bool f2fs_issue_discard_timeout(struct struct discard_policy dpolicy; bool dropped;
+ if (!atomic_read(&dcc->discard_cmd_cnt)) + return false; + __init_discard_policy(sbi, &dpolicy, DPOLICY_UMOUNT, dcc->discard_granularity); __issue_discard_cmd(sbi, &dpolicy); @@@ -1688,11 -1669,6 +1688,11 @@@ static int issue_discard_thread(void *d set_freezable();
do { + wait_event_interruptible_timeout(*q, + kthread_should_stop() || freezing(current) || + dcc->discard_wake, + msecs_to_jiffies(wait_ms)); + if (sbi->gc_mode == GC_URGENT_HIGH || !f2fs_available_free_memory(sbi, DISCARD_CACHE)) __init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, 1); @@@ -1700,6 -1676,14 +1700,6 @@@ __init_discard_policy(sbi, &dpolicy, DPOLICY_BG, dcc->discard_granularity);
- if (!atomic_read(&dcc->discard_cmd_cnt)) - wait_ms = dpolicy.max_interval; - - wait_event_interruptible_timeout(*q, - kthread_should_stop() || freezing(current) || - dcc->discard_wake, - msecs_to_jiffies(wait_ms)); - if (dcc->discard_wake) dcc->discard_wake = 0;
@@@ -1713,11 -1697,12 +1713,11 @@@ continue; if (kthread_should_stop()) return 0; - if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) { + if (is_sbi_flag_set(sbi, SBI_NEED_FSCK) || + !atomic_read(&dcc->discard_cmd_cnt)) { wait_ms = dpolicy.max_interval; continue; } - if (!atomic_read(&dcc->discard_cmd_cnt)) - continue;
sb_start_intwrite(sbi->sb);
@@@ -1732,8 -1717,6 +1732,8 @@@ } else { wait_ms = dpolicy.max_interval; } + if (!atomic_read(&dcc->discard_cmd_cnt)) + wait_ms = dpolicy.max_interval;
sb_end_intwrite(sbi->sb);
@@@ -1777,8 -1760,7 +1777,8 @@@ static int __f2fs_issue_discard_zone(st }
/* For conventional zones, use regular discard if supported */ - return __queue_discard_cmd(sbi, bdev, lblkstart, blklen); + __queue_discard_cmd(sbi, bdev, lblkstart, blklen); + return 0; } #endif
@@@ -1789,8 -1771,7 +1789,8 @@@ static int __issue_discard_async(struc if (f2fs_sb_has_blkzoned(sbi) && bdev_is_zoned(bdev)) return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen); #endif - return __queue_discard_cmd(sbi, bdev, blkstart, blklen); + __queue_discard_cmd(sbi, bdev, blkstart, blklen); + return 0; }
static int f2fs_issue_discard(struct f2fs_sb_info *sbi, @@@ -2044,10 -2025,8 +2044,10 @@@ int f2fs_start_discard_thread(struct f2
dcc->f2fs_issue_discard = kthread_run(issue_discard_thread, sbi, "f2fs_discard-%u:%u", MAJOR(dev), MINOR(dev)); - if (IS_ERR(dcc->f2fs_issue_discard)) + if (IS_ERR(dcc->f2fs_issue_discard)) { err = PTR_ERR(dcc->f2fs_issue_discard); + dcc->f2fs_issue_discard = NULL; + }
return err; } @@@ -2067,7 -2046,6 +2067,7 @@@ static int create_discard_cmd_control(s return -ENOMEM;
dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY; + dcc->max_ordered_discard = DEFAULT_MAX_ORDERED_DISCARD_GRANULARITY; if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT) dcc->discard_granularity = sbi->blocks_per_seg; else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION) @@@ -2088,7 -2066,6 +2088,7 @@@ dcc->min_discard_issue_time = DEF_MIN_DISCARD_ISSUE_TIME; dcc->mid_discard_issue_time = DEF_MID_DISCARD_ISSUE_TIME; dcc->max_discard_issue_time = DEF_MAX_DISCARD_ISSUE_TIME; + dcc->discard_urgent_util = DEF_DISCARD_URGENT_UTIL; dcc->undiscard_blks = 0; dcc->next_pos = 0; dcc->root = RB_ROOT_CACHED; @@@ -2119,7 -2096,8 +2119,7 @@@ static void destroy_discard_cmd_control * Recovery can cache discard commands, so in error path of * fill_super(), it needs to give a chance to handle them. */ - if (unlikely(atomic_read(&dcc->discard_cmd_cnt))) - f2fs_issue_discard_timeout(sbi); + f2fs_issue_discard_timeout(sbi);
kfree(dcc); SM_I(sbi)->dcc_info = NULL; @@@ -2556,7 -2534,7 +2556,7 @@@ static unsigned int __get_next_segno(st
sanity_check_seg_type(sbi, seg_type); if (f2fs_need_rand_seg(sbi)) - return prandom_u32_max(MAIN_SECS(sbi) * sbi->segs_per_sec); + return get_random_u32_below(MAIN_SECS(sbi) * sbi->segs_per_sec);
/* if segs_per_sec is large than 1, we need to keep original policy. */ if (__is_large_section(sbi)) @@@ -2610,7 -2588,7 +2610,7 @@@ static void new_curseg(struct f2fs_sb_i curseg->alloc_type = LFS; if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK) curseg->fragment_remained_chunk = - prandom_u32_max(sbi->max_fragment_chunk) + 1; + get_random_u32_inclusive(1, sbi->max_fragment_chunk); }
static int __next_free_blkoff(struct f2fs_sb_info *sbi, @@@ -2647,9 -2625,9 +2647,9 @@@ static void __refresh_next_blkoff(struc /* To allocate block chunks in different sizes, use random number */ if (--seg->fragment_remained_chunk <= 0) { seg->fragment_remained_chunk = - prandom_u32_max(sbi->max_fragment_chunk) + 1; + get_random_u32_inclusive(1, sbi->max_fragment_chunk); seg->next_blkoff += - prandom_u32_max(sbi->max_fragment_hole) + 1; + get_random_u32_inclusive(1, sbi->max_fragment_hole); } } } @@@ -2664,7 -2642,7 +2664,7 @@@ bool f2fs_segment_has_free_slot(struct * This function always allocates a used segment(from dirty seglist) by SSR * manner, so it should recover the existing segment information of valid blocks */ -static void change_curseg(struct f2fs_sb_info *sbi, int type, bool flush) +static void change_curseg(struct f2fs_sb_info *sbi, int type) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, type); @@@ -2672,7 -2650,9 +2672,7 @@@ struct f2fs_summary_block *sum_node; struct page *sum_page;
- if (flush) - write_sum_page(sbi, curseg->sum_blk, - GET_SUM_BLOCK(sbi, curseg->segno)); + write_sum_page(sbi, curseg->sum_blk, GET_SUM_BLOCK(sbi, curseg->segno));
__set_test_and_inuse(sbi, new_segno);
@@@ -2711,7 -2691,7 +2711,7 @@@ static void get_atssr_segment(struct f2 struct seg_entry *se = get_seg_entry(sbi, curseg->next_segno);
curseg->seg_type = se->type; - change_curseg(sbi, type, true); + change_curseg(sbi, type); } else { /* allocate cold segment by default */ curseg->seg_type = CURSEG_COLD_DATA; @@@ -2855,20 -2835,31 +2855,20 @@@ static int get_ssr_segment(struct f2fs_ return 0; }
-/* - * flush out current segment and replace it with new segment - * This function should be returned with success, otherwise BUG - */ -static void allocate_segment_by_default(struct f2fs_sb_info *sbi, - int type, bool force) +static bool need_new_seg(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type);
- if (force) - new_curseg(sbi, type, true); - else if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) && - curseg->seg_type == CURSEG_WARM_NODE) - new_curseg(sbi, type, false); - else if (curseg->alloc_type == LFS && - is_next_segment_free(sbi, curseg, type) && - likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED))) - new_curseg(sbi, type, false); - else if (f2fs_need_SSR(sbi) && - get_ssr_segment(sbi, type, SSR, 0)) - change_curseg(sbi, type, true); - else - new_curseg(sbi, type, false); - - stat_inc_seg_type(sbi, curseg); + if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) && + curseg->seg_type == CURSEG_WARM_NODE) + return true; + if (curseg->alloc_type == LFS && + is_next_segment_free(sbi, curseg, type) && + likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED))) + return true; + if (!f2fs_need_SSR(sbi) || !get_ssr_segment(sbi, type, SSR, 0)) + return true; + return false; }
void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, @@@ -2886,7 -2877,7 +2886,7 @@@ goto unlock;
if (f2fs_need_SSR(sbi) && get_ssr_segment(sbi, type, SSR, 0)) - change_curseg(sbi, type, true); + change_curseg(sbi, type); else new_curseg(sbi, type, true);
@@@ -2921,8 -2912,7 +2921,8 @@@ static void __allocate_new_segment(stru return; alloc: old_segno = curseg->segno; - SIT_I(sbi)->s_ops->allocate_segment(sbi, type, true); + new_curseg(sbi, type, true); + stat_inc_seg_type(sbi, curseg); locate_dirty_segment(sbi, old_segno); }
@@@ -2953,6 -2943,10 +2953,6 @@@ void f2fs_allocate_new_segments(struct f2fs_up_read(&SM_I(sbi)->curseg_lock); }
-static const struct segment_allocation default_salloc_ops = { - .allocate_segment = allocate_segment_by_default, -}; - bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc) { @@@ -3158,28 -3152,10 +3158,28 @@@ static int __get_segment_type_4(struct } }
+static int __get_age_segment_type(struct inode *inode, pgoff_t pgofs) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_info ei; + + if (f2fs_lookup_age_extent_cache(inode, pgofs, &ei)) { + if (!ei.age) + return NO_CHECK_TYPE; + if (ei.age <= sbi->hot_data_age_threshold) + return CURSEG_HOT_DATA; + if (ei.age <= sbi->warm_data_age_threshold) + return CURSEG_WARM_DATA; + return CURSEG_COLD_DATA; + } + return NO_CHECK_TYPE; +} + static int __get_segment_type_6(struct f2fs_io_info *fio) { if (fio->type == DATA) { struct inode *inode = fio->page->mapping->host; + int type;
if (is_inode_flag_set(inode, FI_ALIGNED_WRITE)) return CURSEG_COLD_DATA_PINNED; @@@ -3194,11 -3170,6 +3194,11 @@@ } if (file_is_cold(inode) || f2fs_need_compress_data(inode)) return CURSEG_COLD_DATA; + + type = __get_age_segment_type(inode, fio->page->index); + if (type != NO_CHECK_TYPE) + return type; + if (file_is_hot(inode) || is_inode_flag_set(inode, FI_HOT_DATA) || f2fs_is_cow_file(inode)) @@@ -3295,19 -3266,11 +3295,19 @@@ void f2fs_allocate_data_block(struct f2 update_sit_entry(sbi, old_blkaddr, -1);
if (!__has_curseg_space(sbi, curseg)) { - if (from_gc) + /* + * Flush out current segment and replace it with new segment. + */ + if (from_gc) { get_atssr_segment(sbi, type, se->type, AT_SSR, se->mtime); - else - sit_i->s_ops->allocate_segment(sbi, type, false); + } else { + if (need_new_seg(sbi, type)) + new_curseg(sbi, type, false); + else + change_curseg(sbi, type); + stat_inc_seg_type(sbi, curseg); + } } /* * segment dirty status should be updated after segment allocation, @@@ -3317,9 -3280,6 +3317,9 @@@ locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); locate_dirty_segment(sbi, GET_SEGNO(sbi, *new_blkaddr));
+ if (IS_DATASEG(type)) + atomic64_inc(&sbi->allocated_data_blocks); + up_write(&sit_i->sentry_lock);
if (page && IS_NODESEG(type)) { @@@ -3447,8 -3407,6 +3447,8 @@@ void f2fs_outplace_write_data(struct dn struct f2fs_summary sum;
f2fs_bug_on(sbi, dn->data_blkaddr == NULL_ADDR); + if (fio->io_type == FS_DATA_IO || fio->io_type == FS_CP_DATA_IO) + f2fs_update_age_extent_cache(dn); set_summary(&sum, dn->nid, dn->ofs_in_node, fio->version); do_write_page(&sum, fio); f2fs_update_data_blkaddr(dn, fio->new_blkaddr); @@@ -3573,7 -3531,7 +3573,7 @@@ void f2fs_do_replace_block(struct f2fs_ /* change the current segment */ if (segno != curseg->segno) { curseg->next_segno = segno; - change_curseg(sbi, type, true); + change_curseg(sbi, type); }
curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); @@@ -3601,7 -3559,7 +3601,7 @@@ if (recover_curseg) { if (old_cursegno != curseg->segno) { curseg->next_segno = old_cursegno; - change_curseg(sbi, type, true); + change_curseg(sbi, type); } curseg->next_blkoff = old_blkoff; curseg->alloc_type = old_alloc_type; @@@ -4298,6 -4256,9 +4298,6 @@@ static int build_sit_info(struct f2fs_s return -ENOMEM; #endif
- /* init SIT information */ - sit_i->s_ops = &default_salloc_ops; - sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr); sit_i->sit_blocks = sit_segs << sbi->log_blocks_per_seg; sit_i->written_valid_blocks = 0; diff --combined fs/xfs/xfs_error.c index 713341d246d1,822e6a0e9d1a..ae082808cfed --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@@ -46,7 -46,7 +46,7 @@@ static unsigned int xfs_errortag_random XFS_RANDOM_REFCOUNT_FINISH_ONE, XFS_RANDOM_BMAP_FINISH_ONE, XFS_RANDOM_AG_RESV_CRITICAL, - XFS_RANDOM_DROP_WRITES, + 0, /* XFS_RANDOM_DROP_WRITES has been removed */ XFS_RANDOM_LOG_BAD_CRC, XFS_RANDOM_LOG_ITEM_PIN, XFS_RANDOM_BUF_LRU_REF, @@@ -60,8 -60,6 +60,8 @@@ XFS_RANDOM_LARP, XFS_RANDOM_DA_LEAF_SPLIT, XFS_RANDOM_ATTR_LEAF_TO_NODE, + XFS_RANDOM_WB_DELAY_MS, + XFS_RANDOM_WRITE_DELAY_MS, };
struct xfs_errortag_attr { @@@ -164,6 -162,7 +164,6 @@@ XFS_ERRORTAG_ATTR_RW(refcount_continue_ XFS_ERRORTAG_ATTR_RW(refcount_finish_one, XFS_ERRTAG_REFCOUNT_FINISH_ONE); XFS_ERRORTAG_ATTR_RW(bmap_finish_one, XFS_ERRTAG_BMAP_FINISH_ONE); XFS_ERRORTAG_ATTR_RW(ag_resv_critical, XFS_ERRTAG_AG_RESV_CRITICAL); -XFS_ERRORTAG_ATTR_RW(drop_writes, XFS_ERRTAG_DROP_WRITES); XFS_ERRORTAG_ATTR_RW(log_bad_crc, XFS_ERRTAG_LOG_BAD_CRC); XFS_ERRORTAG_ATTR_RW(log_item_pin, XFS_ERRTAG_LOG_ITEM_PIN); XFS_ERRORTAG_ATTR_RW(buf_lru_ref, XFS_ERRTAG_BUF_LRU_REF); @@@ -177,8 -176,6 +177,8 @@@ XFS_ERRORTAG_ATTR_RW(ag_resv_fail, XFS_ XFS_ERRORTAG_ATTR_RW(larp, XFS_ERRTAG_LARP); XFS_ERRORTAG_ATTR_RW(da_leaf_split, XFS_ERRTAG_DA_LEAF_SPLIT); XFS_ERRORTAG_ATTR_RW(attr_leaf_to_node, XFS_ERRTAG_ATTR_LEAF_TO_NODE); +XFS_ERRORTAG_ATTR_RW(wb_delay_ms, XFS_ERRTAG_WB_DELAY_MS); +XFS_ERRORTAG_ATTR_RW(write_delay_ms, XFS_ERRTAG_WRITE_DELAY_MS);
static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(noerror), @@@ -209,6 -206,7 +209,6 @@@ XFS_ERRORTAG_ATTR_LIST(refcount_finish_one), XFS_ERRORTAG_ATTR_LIST(bmap_finish_one), XFS_ERRORTAG_ATTR_LIST(ag_resv_critical), - XFS_ERRORTAG_ATTR_LIST(drop_writes), XFS_ERRORTAG_ATTR_LIST(log_bad_crc), XFS_ERRORTAG_ATTR_LIST(log_item_pin), XFS_ERRORTAG_ATTR_LIST(buf_lru_ref), @@@ -222,8 -220,6 +222,8 @@@ XFS_ERRORTAG_ATTR_LIST(larp), XFS_ERRORTAG_ATTR_LIST(da_leaf_split), XFS_ERRORTAG_ATTR_LIST(attr_leaf_to_node), + XFS_ERRORTAG_ATTR_LIST(wb_delay_ms), + XFS_ERRORTAG_ATTR_LIST(write_delay_ms), NULL, }; ATTRIBUTE_GROUPS(xfs_errortag); @@@ -260,32 -256,6 +260,32 @@@ xfs_errortag_del kmem_free(mp->m_errortag); }
+static bool +xfs_errortag_valid( + unsigned int error_tag) +{ + if (error_tag >= XFS_ERRTAG_MAX) + return false; + + /* Error out removed injection types */ + if (error_tag == XFS_ERRTAG_DROP_WRITES) + return false; + return true; +} + +bool +xfs_errortag_enabled( + struct xfs_mount *mp, + unsigned int tag) +{ + if (!mp->m_errortag) + return false; + if (!xfs_errortag_valid(tag)) + return false; + + return mp->m_errortag[tag] != 0; +} + bool xfs_errortag_test( struct xfs_mount *mp, @@@ -307,11 -277,9 +307,11 @@@ if (!mp->m_errortag) return false;
- ASSERT(error_tag < XFS_ERRTAG_MAX); + if (!xfs_errortag_valid(error_tag)) + return false; + randfactor = mp->m_errortag[error_tag]; - if (!randfactor || prandom_u32_max(randfactor)) + if (!randfactor || get_random_u32_below(randfactor)) return false;
xfs_warn_ratelimited(mp, @@@ -325,7 -293,7 +325,7 @@@ xfs_errortag_get struct xfs_mount *mp, unsigned int error_tag) { - if (error_tag >= XFS_ERRTAG_MAX) + if (!xfs_errortag_valid(error_tag)) return -EINVAL;
return mp->m_errortag[error_tag]; @@@ -337,7 -305,7 +337,7 @@@ xfs_errortag_set unsigned int error_tag, unsigned int tag_value) { - if (error_tag >= XFS_ERRTAG_MAX) + if (!xfs_errortag_valid(error_tag)) return -EINVAL;
mp->m_errortag[error_tag] = tag_value; @@@ -351,7 -319,7 +351,7 @@@ xfs_errortag_add { BUILD_BUG_ON(ARRAY_SIZE(xfs_errortag_random_default) != XFS_ERRTAG_MAX);
- if (error_tag >= XFS_ERRTAG_MAX) + if (!xfs_errortag_valid(error_tag)) return -EINVAL;
return xfs_errortag_set(mp, error_tag, diff --combined kernel/bpf/core.c index 708e908d8807,38159f39e2af..6cca66b39d01 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@@ -34,7 -34,6 +34,7 @@@ #include <linux/log2.h> #include <linux/bpf_verifier.h> #include <linux/nodemask.h> +#include <linux/bpf_mem_alloc.h>
#include <asm/barrier.h> #include <asm/unaligned.h> @@@ -61,9 -60,6 +61,9 @@@ #define CTX regs[BPF_REG_CTX] #define IMM insn->imm
+struct bpf_mem_alloc bpf_global_ma; +bool bpf_global_ma_set; + /* No hurry in this branch * * Exported for the bpf jit load helper. @@@ -868,7 -864,8 +868,7 @@@ static struct bpf_prog_pack *alloc_new_ list_add_tail(&pack->list, &pack_list);
set_vm_flush_reset_perms(pack->ptr); - set_memory_ro((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE); - set_memory_x((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE); + set_memory_rox((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE); return pack; }
@@@ -886,7 -883,8 +886,7 @@@ void *bpf_prog_pack_alloc(u32 size, bpf if (ptr) { bpf_fill_ill_insns(ptr, size); set_vm_flush_reset_perms(ptr); - set_memory_ro((unsigned long)ptr, size / PAGE_SIZE); - set_memory_x((unsigned long)ptr, size / PAGE_SIZE); + set_memory_rox((unsigned long)ptr, size / PAGE_SIZE); } goto out; } @@@ -1034,7 -1032,7 +1034,7 @@@ bpf_jit_binary_alloc(unsigned int progl hdr->size = size; hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), PAGE_SIZE - sizeof(*hdr)); - start = prandom_u32_max(hole) & ~(alignment - 1); + start = get_random_u32_below(hole) & ~(alignment - 1);
/* Leave a random number of instructions before BPF code. */ *image_ptr = &hdr->image[start]; @@@ -1096,7 -1094,7 +1096,7 @@@ bpf_jit_binary_pack_alloc(unsigned int
hole = min_t(unsigned int, size - (proglen + sizeof(*ro_header)), BPF_PROG_CHUNK_SIZE - sizeof(*ro_header)); - start = prandom_u32_max(hole) & ~(alignment - 1); + start = get_random_u32_below(hole) & ~(alignment - 1);
*image_ptr = &ro_header->image[start]; *rw_image = &(*rw_header)->image[start]; @@@ -2253,14 -2251,8 +2253,14 @@@ static void __bpf_prog_array_free_sleep { struct bpf_prog_array *progs;
+ /* If RCU Tasks Trace grace period implies RCU grace period, there is + * no need to call kfree_rcu(), just call kfree() directly. + */ progs = container_of(rcu, struct bpf_prog_array, rcu); - kfree_rcu(progs, rcu); + if (rcu_trace_implies_rcu_gp()) + kfree(progs); + else + kfree_rcu(progs, rcu); }
void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs) @@@ -2748,18 -2740,6 +2748,18 @@@ int __weak bpf_arch_text_invalidate(voi return -ENOTSUPP; }
+#ifdef CONFIG_BPF_SYSCALL +static int __init bpf_global_ma_init(void) +{ + int ret; + + ret = bpf_mem_alloc_init(&bpf_global_ma, 0, false); + bpf_global_ma_set = !ret; + return ret; +} +late_initcall(bpf_global_ma_init); +#endif + DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key); EXPORT_SYMBOL(bpf_stats_enabled_key);
diff --combined kernel/fork.c index 6f59f13b8e6b,ec57cae58ff1..5eeae8e0f2ee --- a/kernel/fork.c +++ b/kernel/fork.c @@@ -75,7 -75,6 +75,6 @@@ #include <linux/freezer.h> #include <linux/delayacct.h> #include <linux/taskstats_kern.h> - #include <linux/random.h> #include <linux/tty.h> #include <linux/fs_struct.h> #include <linux/magic.h> @@@ -97,6 -96,7 +96,7 @@@ #include <linux/scs.h> #include <linux/io_uring.h> #include <linux/bpf.h> + #include <linux/stackprotector.h>
#include <asm/pgalloc.h> #include <linux/uaccess.h> @@@ -535,9 -535,6 +535,9 @@@ void put_task_stack(struct task_struct
void free_task(struct task_struct *tsk) { +#ifdef CONFIG_SECCOMP + WARN_ON_ONCE(tsk->seccomp.filter); +#endif release_user_cpus_ptr(tsk); scs_release(tsk);
@@@ -2046,6 -2043,15 +2046,6 @@@ static __latent_entropy struct task_str return ERR_PTR(-EINVAL); }
- /* - * If the new process will be in a different time namespace - * do not allow it to share VM or a thread group with the forking task. - */ - if (clone_flags & (CLONE_THREAD | CLONE_VM)) { - if (nsp->time_ns != nsp->time_ns_for_children) - return ERR_PTR(-EINVAL); - } - if (clone_flags & CLONE_PIDFD) { /* * - CLONE_DETACHED is blocked so that we can potentially @@@ -2400,6 -2406,12 +2400,6 @@@
spin_lock(¤t->sighand->siglock);
- /* - * Copy seccomp details explicitly here, in case they were changed - * before holding sighand lock. - */ - copy_seccomp(p); - rv_task_fork(p);
rseq_fork(p, clone_flags); @@@ -2416,14 -2428,6 +2416,14 @@@ goto bad_fork_cancel_cgroup; }
+ /* No more failure paths after this point. */ + + /* + * Copy seccomp details explicitly here, in case they were changed + * before holding sighand lock. + */ + copy_seccomp(p); + init_task_pid_links(p); if (likely(p->pid)) { ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); @@@ -2588,6 -2592,11 +2588,6 @@@ struct task_struct * __init fork_idle(i return task; }
-struct mm_struct *copy_init_mm(void) -{ - return dup_mm(NULL, &init_mm); -} - /* * This is like kernel_clone(), but shaved down and tailored to just * creating io_uring workers. It returns a created task, or an error pointer. @@@ -3006,27 -3015,10 +3006,27 @@@ static void sighand_ctor(void *data init_waitqueue_head(&sighand->signalfd_wqh); }
-void __init proc_caches_init(void) +void __init mm_cache_init(void) { unsigned int mm_size;
+ /* + * The mm_cpumask is located at the end of mm_struct, and is + * dynamically sized based on the maximum CPU number this system + * can have, taking hotplug into account (nr_cpu_ids). + */ + mm_size = sizeof(struct mm_struct) + cpumask_size(); + + mm_cachep = kmem_cache_create_usercopy("mm_struct", + mm_size, ARCH_MIN_MMSTRUCT_ALIGN, + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, + offsetof(struct mm_struct, saved_auxv), + sizeof_field(struct mm_struct, saved_auxv), + NULL); +} + +void __init proc_caches_init(void) +{ sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| @@@ -3044,6 -3036,19 +3044,6 @@@ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);
- /* - * The mm_cpumask is located at the end of mm_struct, and is - * dynamically sized based on the maximum CPU number this system - * can have, taking hotplug into account (nr_cpu_ids). - */ - mm_size = sizeof(struct mm_struct) + cpumask_size(); - - mm_cachep = kmem_cache_create_usercopy("mm_struct", - mm_size, ARCH_MIN_MMSTRUCT_ALIGN, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, - offsetof(struct mm_struct, saved_auxv), - sizeof_field(struct mm_struct, saved_auxv), - NULL); vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT); mmap_init(); nsproxy_cache_init(); diff --combined lib/fault-inject.c index adb2f9355ee6,9f53408c545d..1421818c9ef7 --- a/lib/fault-inject.c +++ b/lib/fault-inject.c @@@ -41,6 -41,9 +41,6 @@@ EXPORT_SYMBOL_GPL(setup_fault_attr)
static void fail_dump(struct fault_attr *attr) { - if (attr->no_warn) - return; - if (attr->verbose > 0 && __ratelimit(&attr->ratelimit_state)) { printk(KERN_NOTICE "FAULT_INJECTION: forcing a failure.\n" "name %pd, interval %lu, probability %lu, " @@@ -100,7 -103,7 +100,7 @@@ static inline bool fail_stacktrace(stru * http://www.nongnu.org/failmalloc/ */
-bool should_fail(struct fault_attr *attr, ssize_t size) +bool should_fail_ex(struct fault_attr *attr, ssize_t size, int flags) { if (in_task()) { unsigned int fail_nth = READ_ONCE(current->fail_nth); @@@ -136,26 -139,20 +136,26 @@@ return false; }
- if (attr->probability <= prandom_u32_max(100)) + if (attr->probability <= get_random_u32_below(100)) return false;
if (!fail_stacktrace(attr)) return false;
fail: - fail_dump(attr); + if (!(flags & FAULT_NOWARN)) + fail_dump(attr);
if (atomic_read(&attr->times) != -1) atomic_dec_not_zero(&attr->times);
return true; } + +bool should_fail(struct fault_attr *attr, ssize_t size) +{ + return should_fail_ex(attr, size, 0); +} EXPORT_SYMBOL_GPL(should_fail);
#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS diff --combined lib/kobject.c index 6e0bf03f4f36,af1f5f2954d4..985ee1c4f2c6 --- a/lib/kobject.c +++ b/lib/kobject.c @@@ -25,7 -25,7 +25,7 @@@ * and thus @kobj should have a namespace tag associated with it. Returns * %NULL otherwise. */ -const void *kobject_namespace(struct kobject *kobj) +const void *kobject_namespace(const struct kobject *kobj) { const struct kobj_ns_type_operations *ns_ops = kobj_ns_ops(kobj);
@@@ -45,7 -45,7 +45,7 @@@ * representation of given kobject. Normally used to adjust ownership of * objects in a container. */ -void kobject_get_ownership(struct kobject *kobj, kuid_t *uid, kgid_t *gid) +void kobject_get_ownership(const struct kobject *kobj, kuid_t *uid, kgid_t *gid) { *uid = GLOBAL_ROOT_UID; *gid = GLOBAL_ROOT_GID; @@@ -94,10 -94,10 +94,10 @@@ static int create_dir(struct kobject *k return 0; }
-static int get_kobj_path_length(struct kobject *kobj) +static int get_kobj_path_length(const struct kobject *kobj) { int length = 1; - struct kobject *parent = kobj; + const struct kobject *parent = kobj;
/* walk up the ancestors until we hit the one pointing to the * root. @@@ -112,9 -112,9 +112,9 @@@ return length; }
-static void fill_kobj_path(struct kobject *kobj, char *path, int length) +static void fill_kobj_path(const struct kobject *kobj, char *path, int length) { - struct kobject *parent; + const struct kobject *parent;
--length; for (parent = kobj; parent; parent = parent->parent) { @@@ -136,7 -136,7 +136,7 @@@ * * Return: The newly allocated memory, caller must free with kfree(). */ -char *kobject_get_path(struct kobject *kobj, gfp_t gfp_mask) +char *kobject_get_path(const struct kobject *kobj, gfp_t gfp_mask) { char *path; int len; @@@ -694,7 -694,7 +694,7 @@@ static void kobject_release(struct kre { struct kobject *kobj = container_of(kref, struct kobject, kref); #ifdef CONFIG_DEBUG_KOBJECT_RELEASE - unsigned long delay = HZ + HZ * prandom_u32_max(4); + unsigned long delay = HZ + HZ * get_random_u32_below(4); pr_info("kobject: '%s' (%p): %s, parent %p (delayed %ld)\n", kobject_name(kobj), kobj, __func__, kobj->parent, delay); INIT_DELAYED_WORK(&kobj->release, kobject_delayed_cleanup); @@@ -834,9 -834,6 +834,9 @@@ EXPORT_SYMBOL_GPL(kobj_sysfs_ops) /** * kset_register() - Initialize and add a kset. * @k: kset. + * + * NOTE: On error, the kset.kobj.name allocated by() kobj_set_name() + * is freed, it can not be used any more. */ int kset_register(struct kset *k) { @@@ -847,12 -844,8 +847,12 @@@
kset_init(k); err = kobject_add_internal(&k->kobj); - if (err) + if (err) { + kfree_const(k->kobj.name); + /* Set it to NULL to avoid accessing bad pointer in callers. */ + k->kobj.name = NULL; return err; + } kobject_uevent(&k->kobj, KOBJ_ADD); return 0; } @@@ -907,7 -900,7 +907,7 @@@ static void kset_release(struct kobjec kfree(kset); }
-static void kset_get_ownership(struct kobject *kobj, kuid_t *uid, kgid_t *gid) +static void kset_get_ownership(const struct kobject *kobj, kuid_t *uid, kgid_t *gid) { if (kobj->parent) kobject_get_ownership(kobj->parent, uid, gid); @@@ -1039,7 -1032,7 +1039,7 @@@ int kobj_ns_type_registered(enum kobj_n return registered; }
-const struct kobj_ns_type_operations *kobj_child_ns_ops(struct kobject *parent) +const struct kobj_ns_type_operations *kobj_child_ns_ops(const struct kobject *parent) { const struct kobj_ns_type_operations *ops = NULL;
@@@ -1049,7 -1042,7 +1049,7 @@@ return ops; }
-const struct kobj_ns_type_operations *kobj_ns_ops(struct kobject *kobj) +const struct kobj_ns_type_operations *kobj_ns_ops(const struct kobject *kobj) { return kobj_child_ns_ops(kobj->parent); } diff --combined lib/sbitmap.c index 586deb333237,58de526ff051..1fcede228fa2 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@@ -21,7 -21,7 +21,7 @@@ static int init_alloc_hint(struct sbitm int i;
for_each_possible_cpu(i) - *per_cpu_ptr(sb->alloc_hint, i) = prandom_u32_max(depth); + *per_cpu_ptr(sb->alloc_hint, i) = get_random_u32_below(depth); } return 0; } @@@ -33,7 -33,7 +33,7 @@@ static inline unsigned update_alloc_hin
hint = this_cpu_read(*sb->alloc_hint); if (unlikely(hint >= depth)) { - hint = depth ? prandom_u32_max(depth) : 0; + hint = depth ? get_random_u32_below(depth) : 0; this_cpu_write(*sb->alloc_hint, hint); }
@@@ -434,8 -434,6 +434,8 @@@ int sbitmap_queue_init_node(struct sbit sbq->wake_batch = sbq_calc_wake_batch(sbq, depth); atomic_set(&sbq->wake_index, 0); atomic_set(&sbq->ws_active, 0); + atomic_set(&sbq->completion_cnt, 0); + atomic_set(&sbq->wakeup_cnt, 0);
sbq->ws = kzalloc_node(SBQ_WAIT_QUEUES * sizeof(*sbq->ws), flags, node); if (!sbq->ws) { @@@ -443,21 -441,40 +443,21 @@@ return -ENOMEM; }
- for (i = 0; i < SBQ_WAIT_QUEUES; i++) { + for (i = 0; i < SBQ_WAIT_QUEUES; i++) init_waitqueue_head(&sbq->ws[i].wait); - atomic_set(&sbq->ws[i].wait_cnt, sbq->wake_batch); - }
return 0; } EXPORT_SYMBOL_GPL(sbitmap_queue_init_node);
-static inline void __sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq, - unsigned int wake_batch) -{ - int i; - - if (sbq->wake_batch != wake_batch) { - WRITE_ONCE(sbq->wake_batch, wake_batch); - /* - * Pairs with the memory barrier in sbitmap_queue_wake_up() - * to ensure that the batch size is updated before the wait - * counts. - */ - smp_mb(); - for (i = 0; i < SBQ_WAIT_QUEUES; i++) - atomic_set(&sbq->ws[i].wait_cnt, 1); - } -} - static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq, unsigned int depth) { unsigned int wake_batch;
wake_batch = sbq_calc_wake_batch(sbq, depth); - __sbitmap_queue_update_wake_batch(sbq, wake_batch); + if (sbq->wake_batch != wake_batch) + WRITE_ONCE(sbq->wake_batch, wake_batch); }
void sbitmap_queue_recalculate_wake_batch(struct sbitmap_queue *sbq, @@@ -471,8 -488,7 +471,8 @@@
wake_batch = clamp_val(depth / SBQ_WAIT_QUEUES, min_batch, SBQ_WAKE_BATCH); - __sbitmap_queue_update_wake_batch(sbq, wake_batch); + + WRITE_ONCE(sbq->wake_batch, wake_batch); } EXPORT_SYMBOL_GPL(sbitmap_queue_recalculate_wake_batch);
@@@ -560,56 -576,106 +560,56 @@@ void sbitmap_queue_min_shallow_depth(st } EXPORT_SYMBOL_GPL(sbitmap_queue_min_shallow_depth);
-static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq) +static void __sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr) { int i, wake_index;
if (!atomic_read(&sbq->ws_active)) - return NULL; + return;
wake_index = atomic_read(&sbq->wake_index); for (i = 0; i < SBQ_WAIT_QUEUES; i++) { struct sbq_wait_state *ws = &sbq->ws[wake_index];
- if (waitqueue_active(&ws->wait) && atomic_read(&ws->wait_cnt)) { - if (wake_index != atomic_read(&sbq->wake_index)) - atomic_set(&sbq->wake_index, wake_index); - return ws; - } - + /* + * Advance the index before checking the current queue. + * It improves fairness, by ensuring the queue doesn't + * need to be fully emptied before trying to wake up + * from the next one. + */ wake_index = sbq_index_inc(wake_index); + + /* + * It is sufficient to wake up at least one waiter to + * guarantee forward progress. + */ + if (waitqueue_active(&ws->wait) && + wake_up_nr(&ws->wait, nr)) + break; }
- return NULL; + if (wake_index != atomic_read(&sbq->wake_index)) + atomic_set(&sbq->wake_index, wake_index); }
-static bool __sbq_wake_up(struct sbitmap_queue *sbq, int *nr) +void sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr) { - struct sbq_wait_state *ws; - unsigned int wake_batch; - int wait_cnt, cur, sub; - bool ret; + unsigned int wake_batch = READ_ONCE(sbq->wake_batch); + unsigned int wakeups;
- if (*nr <= 0) - return false; + if (!atomic_read(&sbq->ws_active)) + return;
- ws = sbq_wake_ptr(sbq); - if (!ws) - return false; + atomic_add(nr, &sbq->completion_cnt); + wakeups = atomic_read(&sbq->wakeup_cnt);
- cur = atomic_read(&ws->wait_cnt); do { - /* - * For concurrent callers of this, callers should call this - * function again to wakeup a new batch on a different 'ws'. - */ - if (cur == 0) - return true; - sub = min(*nr, cur); - wait_cnt = cur - sub; - } while (!atomic_try_cmpxchg(&ws->wait_cnt, &cur, wait_cnt)); - - /* - * If we decremented queue without waiters, retry to avoid lost - * wakeups. - */ - if (wait_cnt > 0) - return !waitqueue_active(&ws->wait); - - *nr -= sub; - - /* - * When wait_cnt == 0, we have to be particularly careful as we are - * responsible to reset wait_cnt regardless whether we've actually - * woken up anybody. But in case we didn't wakeup anybody, we still - * need to retry. - */ - ret = !waitqueue_active(&ws->wait); - wake_batch = READ_ONCE(sbq->wake_batch); - - /* - * Wake up first in case that concurrent callers decrease wait_cnt - * while waitqueue is empty. - */ - wake_up_nr(&ws->wait, wake_batch); + if (atomic_read(&sbq->completion_cnt) - wakeups < wake_batch) + return; + } while (!atomic_try_cmpxchg(&sbq->wakeup_cnt, + &wakeups, wakeups + wake_batch));
- /* - * Pairs with the memory barrier in sbitmap_queue_resize() to - * ensure that we see the batch size update before the wait - * count is reset. - * - * Also pairs with the implicit barrier between decrementing wait_cnt - * and checking for waitqueue_active() to make sure waitqueue_active() - * sees result of the wakeup if atomic_dec_return() has seen the result - * of atomic_set(). - */ - smp_mb__before_atomic(); - - /* - * Increase wake_index before updating wait_cnt, otherwise concurrent - * callers can see valid wait_cnt in old waitqueue, which can cause - * invalid wakeup on the old waitqueue. - */ - sbq_index_atomic_inc(&sbq->wake_index); - atomic_set(&ws->wait_cnt, wake_batch); - - return ret || *nr; -} - -void sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr) -{ - while (__sbq_wake_up(sbq, &nr)) - ; + __sbitmap_queue_wake_up(sbq, wake_batch); } EXPORT_SYMBOL_GPL(sbitmap_queue_wake_up);
@@@ -726,7 -792,9 +726,7 @@@ void sbitmap_queue_show(struct sbitmap_ seq_puts(m, "ws={\n"); for (i = 0; i < SBQ_WAIT_QUEUES; i++) { struct sbq_wait_state *ws = &sbq->ws[i]; - - seq_printf(m, "\t{.wait_cnt=%d, .wait=%s},\n", - atomic_read(&ws->wait_cnt), + seq_printf(m, "\t{.wait=%s},\n", waitqueue_active(&ws->wait) ? "active" : "inactive"); } seq_puts(m, "}\n"); diff --combined lib/test_printf.c index d6a5d4b5f884,f098914a48d5..d34dc636b81c --- a/lib/test_printf.c +++ b/lib/test_printf.c @@@ -126,7 -126,7 +126,7 @@@ __test(const char *expect, int elen, co * be able to print it as expected. */ failed_tests += do_test(BUF_SIZE, expect, elen, fmt, ap); - rand = 1 + prandom_u32_max(elen+1); + rand = get_random_u32_inclusive(1, elen + 1); /* Since elen < BUF_SIZE, we have 1 <= rand <= BUF_SIZE. */ failed_tests += do_test(rand, expect, elen, fmt, ap); failed_tests += do_test(0, expect, elen, fmt, ap); @@@ -179,6 -179,18 +179,6 @@@ test_number(void * behaviour. */ test("00|0|0|0|0", "%.2d|%.1d|%.0d|%.*d|%1.0d", 0, 0, 0, 0, 0, 0); -#ifndef __CHAR_UNSIGNED__ - { - /* - * Passing a 'char' to a %02x specifier doesn't do - * what was presumably the intention when char is - * signed and the value is negative. One must either & - * with 0xff or cast to u8. - */ - char val = -16; - test("0xfffffff0|0xf0|0xf0", "%#02x|%#02x|%#02x", val, val & 0xff, (u8)val); - } -#endif }
static void __init @@@ -692,29 -704,31 +692,29 @@@ flags(void
static void __init fwnode_pointer(void) { - const struct software_node softnodes[] = { - { .name = "first", }, - { .name = "second", .parent = &softnodes[0], }, - { .name = "third", .parent = &softnodes[1], }, - { NULL /* Guardian */ } - }; - const char * const full_name = "first/second/third"; + const struct software_node first = { .name = "first" }; + const struct software_node second = { .name = "second", .parent = &first }; + const struct software_node third = { .name = "third", .parent = &second }; + const struct software_node *group[] = { &first, &second, &third, NULL }; const char * const full_name_second = "first/second"; + const char * const full_name_third = "first/second/third"; const char * const second_name = "second"; const char * const third_name = "third"; int rval;
- rval = software_node_register_nodes(softnodes); + rval = software_node_register_node_group(group); if (rval) { pr_warn("cannot register softnodes; rval %d\n", rval); return; }
- test(full_name_second, "%pfw", software_node_fwnode(&softnodes[1])); - test(full_name, "%pfw", software_node_fwnode(&softnodes[2])); - test(full_name, "%pfwf", software_node_fwnode(&softnodes[2])); - test(second_name, "%pfwP", software_node_fwnode(&softnodes[1])); - test(third_name, "%pfwP", software_node_fwnode(&softnodes[2])); + test(full_name_second, "%pfw", software_node_fwnode(&second)); + test(full_name_third, "%pfw", software_node_fwnode(&third)); + test(full_name_third, "%pfwf", software_node_fwnode(&third)); + test(second_name, "%pfwP", software_node_fwnode(&second)); + test(third_name, "%pfwP", software_node_fwnode(&third));
- software_node_unregister_nodes(softnodes); + software_node_unregister_node_group(group); }
static void __init fourcc_pointer(void) diff --combined lib/test_rhashtable.c index 3ae3399f3651,6a8e445c8b55..c20f6cb4bf55 --- a/lib/test_rhashtable.c +++ b/lib/test_rhashtable.c @@@ -368,8 -368,8 +368,8 @@@ static int __init test_rhltable(unsigne
pr_info("test %d random rhlist add/delete operations\n", entries); for (j = 0; j < entries; j++) { - u32 i = prandom_u32_max(entries); - u32 prand = prandom_u32_max(4); + u32 i = get_random_u32_below(entries); + u32 prand = get_random_u32_below(4);
cond_resched();
@@@ -396,7 -396,7 +396,7 @@@ }
if (prand & 2) { - i = prandom_u32_max(entries); + i = get_random_u32_below(entries); if (test_bit(i, obj_in_table)) { err = rhltable_remove(&rhlt, &rhl_test_objects[i].list_node, test_rht_params); WARN(err, "cannot remove element at slot %d", i); @@@ -434,7 -434,7 +434,7 @@@ out_free static int __init test_rhashtable_max(struct test_obj *array, unsigned int entries) { - unsigned int i, insert_retries = 0; + unsigned int i; int err;
test_rht_params.max_size = roundup_pow_of_two(entries / 8); @@@ -447,7 -447,9 +447,7 @@@
obj->value.id = i * 2; err = insert_retry(&ht, obj, test_rht_params); - if (err > 0) - insert_retries += err; - else if (err) + if (err < 0) return err; }
diff --combined lib/vsprintf.c index 5b0611c00956,2d11541ee561..be71a03c936a --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@@ -41,6 -41,7 +41,7 @@@ #include <linux/siphash.h> #include <linux/compiler.h> #include <linux/property.h> + #include <linux/notifier.h> #ifdef CONFIG_BLOCK #include <linux/blkdev.h> #endif @@@ -752,26 -753,21 +753,21 @@@ early_param("debug_boot_weak_hash", deb
static bool filled_random_ptr_key __read_mostly; static siphash_key_t ptr_key __read_mostly; - static void fill_ptr_key_workfn(struct work_struct *work); - static DECLARE_DELAYED_WORK(fill_ptr_key_work, fill_ptr_key_workfn);
- static void fill_ptr_key_workfn(struct work_struct *work) + static int fill_ptr_key(struct notifier_block *nb, unsigned long action, void *data) { - if (!rng_is_initialized()) { - queue_delayed_work(system_unbound_wq, &fill_ptr_key_work, HZ * 2); - return; - } - get_random_bytes(&ptr_key, sizeof(ptr_key));
/* Pairs with smp_rmb() before reading ptr_key. */ smp_wmb(); WRITE_ONCE(filled_random_ptr_key, true); + return NOTIFY_DONE; }
static int __init vsprintf_init_hashval(void) { - fill_ptr_key_workfn(NULL); + static struct notifier_block fill_ptr_key_nb = { .notifier_call = fill_ptr_key }; + execute_with_initialized_rng(&fill_ptr_key_nb); return 0; } subsys_initcall(vsprintf_init_hashval) @@@ -866,7 -862,7 +862,7 @@@ char *restricted_pointer(char *buf, cha * kptr_restrict==1 cannot be used in IRQ context * because its test for CAP_SYSLOG would be meaningless. */ - if (in_irq() || in_serving_softirq() || in_nmi()) { + if (in_hardirq() || in_serving_softirq() || in_nmi()) { if (spec.field_width == -1) spec.field_width = 2 * sizeof(ptr); return error_string(buf, end, "pK-error", spec); diff --combined mm/slub.c index cf98455fb3b1,7cd2c657030a..2248f85e8167 --- a/mm/slub.c +++ b/mm/slub.c @@@ -39,7 -39,6 +39,7 @@@ #include <linux/memcontrol.h> #include <linux/random.h> #include <kunit/test.h> +#include <kunit/test-bug.h> #include <linux/sort.h>
#include <linux/debugfs.h> @@@ -188,12 -187,6 +188,12 @@@ do { #define USE_LOCKLESS_FAST_PATH() (false) #endif
+#ifndef CONFIG_SLUB_TINY +#define __fastpath_inline __always_inline +#else +#define __fastpath_inline +#endif + #ifdef CONFIG_SLUB_DEBUG #ifdef CONFIG_SLUB_DEBUG_ON DEFINE_STATIC_KEY_TRUE(slub_debug_enabled); @@@ -248,7 -241,6 +248,7 @@@ static inline bool kmem_cache_has_cpu_p /* Enable to log cmpxchg failures */ #undef SLUB_DEBUG_CMPXCHG
+#ifndef CONFIG_SLUB_TINY /* * Minimum number of partial slabs. These will be left on the partial * lists even if they are empty. kmem_cache_shrink may reclaim them. @@@ -261,10 -253,6 +261,10 @@@ * sort the partial list by the number of objects in use. */ #define MAX_PARTIAL 10 +#else +#define MIN_PARTIAL 0 +#define MAX_PARTIAL 0 +#endif
#define DEBUG_DEFAULT_FLAGS (SLAB_CONSISTENCY_CHECKS | SLAB_RED_ZONE | \ SLAB_POISON | SLAB_STORE_USER) @@@ -310,7 -298,7 +310,7 @@@ struct track
enum track_item { TRACK_ALLOC, TRACK_FREE };
-#ifdef CONFIG_SYSFS +#ifdef SLAB_SUPPORTS_SYSFS static int sysfs_slab_add(struct kmem_cache *); static int sysfs_slab_alias(struct kmem_cache *, const char *); #else @@@ -344,12 -332,10 +344,12 @@@ static inline void stat(const struct km */ static nodemask_t slab_nodes;
+#ifndef CONFIG_SLUB_TINY /* * Workqueue used for flush_cpu_slab(). */ static struct workqueue_struct *flushwq; +#endif
/******************************************************************** * Core slab cache functions @@@ -395,12 -381,10 +395,12 @@@ static inline void *get_freepointer(str return freelist_dereference(s, object + s->offset); }
+#ifndef CONFIG_SLUB_TINY static void prefetch_freepointer(const struct kmem_cache *s, void *object) { prefetchw(object + s->offset); } +#endif
/* * When running under KMSAN, get_freepointer_safe() may return an uninitialized @@@ -619,7 -603,7 +619,7 @@@ static bool slab_add_kunit_errors(void { struct kunit_resource *resource;
- if (likely(!current->kunit_test)) + if (!kunit_get_current_test()) return false;
resource = kunit_find_named_resource(current->kunit_test, "slab_errors"); @@@ -845,17 -829,6 +845,17 @@@ static inline void set_orig_size(struc if (!slub_debug_orig_size(s)) return;
+#ifdef CONFIG_KASAN_GENERIC + /* + * KASAN could save its free meta data in object's data area at + * offset 0, if the size is larger than 'orig_size', it will + * overlap the data redzone in [orig_size+1, object_size], and + * the check should be skipped. + */ + if (kasan_metadata_size(s, true) > orig_size) + orig_size = s->object_size; +#endif + p += get_info_end(s); p += sizeof(struct track) * 2;
@@@ -875,11 -848,6 +875,11 @@@ static inline unsigned int get_orig_siz return *(unsigned int *)p; }
+void skip_orig_size_check(struct kmem_cache *s, const void *object) +{ + set_orig_size(s, (void *)object, s->object_size); +} + static void slab_bug(struct kmem_cache *s, char *fmt, ...) { struct va_format vaf; @@@ -942,7 -910,7 +942,7 @@@ static void print_trailer(struct kmem_c if (slub_debug_orig_size(s)) off += sizeof(unsigned int);
- off += kasan_metadata_size(s); + off += kasan_metadata_size(s, false);
if (off != size_from_object(s)) /* Beginning of the filler is the free pointer */ @@@ -998,28 -966,17 +998,28 @@@ static __printf(3, 4) void slab_err(str static void init_object(struct kmem_cache *s, void *object, u8 val) { u8 *p = kasan_reset_tag(object); + unsigned int poison_size = s->object_size;
- if (s->flags & SLAB_RED_ZONE) + if (s->flags & SLAB_RED_ZONE) { memset(p - s->red_left_pad, val, s->red_left_pad);
+ if (slub_debug_orig_size(s) && val == SLUB_RED_ACTIVE) { + /* + * Redzone the extra allocated space by kmalloc than + * requested, and the poison size will be limited to + * the original request size accordingly. + */ + poison_size = get_orig_size(s, object); + } + } + if (s->flags & __OBJECT_POISON) { - memset(p, POISON_FREE, s->object_size - 1); - p[s->object_size - 1] = POISON_END; + memset(p, POISON_FREE, poison_size - 1); + p[poison_size - 1] = POISON_END; }
if (s->flags & SLAB_RED_ZONE) - memset(p + s->object_size, val, s->inuse - s->object_size); + memset(p + poison_size, val, s->inuse - poison_size); }
static void restore_bytes(struct kmem_cache *s, char *message, u8 data, @@@ -1113,7 -1070,7 +1113,7 @@@ static int check_pad_bytes(struct kmem_ off += sizeof(unsigned int); }
- off += kasan_metadata_size(s); + off += kasan_metadata_size(s, false);
if (size_from_object(s) == off) return 1; @@@ -1163,7 -1120,6 +1163,7 @@@ static int check_object(struct kmem_cac { u8 *p = object; u8 *endobject = object + s->object_size; + unsigned int orig_size;
if (s->flags & SLAB_RED_ZONE) { if (!check_bytes_and_report(s, slab, object, "Left Redzone", @@@ -1173,17 -1129,6 +1173,17 @@@ if (!check_bytes_and_report(s, slab, object, "Right Redzone", endobject, val, s->inuse - s->object_size)) return 0; + + if (slub_debug_orig_size(s) && val == SLUB_RED_ACTIVE) { + orig_size = get_orig_size(s, object); + + if (s->object_size > orig_size && + !check_bytes_and_report(s, slab, object, + "kmalloc Redzone", p + orig_size, + val, s->object_size - orig_size)) { + return 0; + } + } } else { if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) { check_bytes_and_report(s, slab, p, "Alignment padding", @@@ -1418,7 -1363,7 +1418,7 @@@ static inline int alloc_consistency_che return 1; }
-static noinline int alloc_debug_processing(struct kmem_cache *s, +static noinline bool alloc_debug_processing(struct kmem_cache *s, struct slab *slab, void *object, int orig_size) { if (s->flags & SLAB_CONSISTENCY_CHECKS) { @@@ -1430,7 -1375,7 +1430,7 @@@ trace(s, slab, object, 1); set_orig_size(s, object, orig_size); init_object(s, object, SLUB_RED_ACTIVE); - return 1; + return true;
bad: if (folio_test_slab(slab_folio(slab))) { @@@ -1443,7 -1388,7 +1443,7 @@@ slab->inuse = slab->objects; slab->freelist = NULL; } - return 0; + return false; }
static inline int free_consistency_checks(struct kmem_cache *s, @@@ -1696,17 -1641,17 +1696,17 @@@ static inline void setup_object_debug(s static inline void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) {}
-static inline int alloc_debug_processing(struct kmem_cache *s, - struct slab *slab, void *object, int orig_size) { return 0; } +static inline bool alloc_debug_processing(struct kmem_cache *s, + struct slab *slab, void *object, int orig_size) { return true; }
-static inline void free_debug_processing( - struct kmem_cache *s, struct slab *slab, - void *head, void *tail, int bulk_cnt, - unsigned long addr) {} +static inline bool free_debug_processing(struct kmem_cache *s, + struct slab *slab, void *head, void *tail, int *bulk_cnt, + unsigned long addr, depot_stack_handle_t handle) { return true; }
static inline void slab_pad_check(struct kmem_cache *s, struct slab *slab) {} static inline int check_object(struct kmem_cache *s, struct slab *slab, void *object, u8 val) { return 1; } +static inline depot_stack_handle_t set_track_prepare(void) { return 0; } static inline void set_track(struct kmem_cache *s, void *object, enum track_item alloc, unsigned long addr) {} static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, @@@ -1731,13 -1676,11 +1731,13 @@@ static inline void inc_slabs_node(struc static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) {}
+#ifndef CONFIG_SLUB_TINY static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, void **freelist, void *nextfree) { return false; } +#endif #endif /* CONFIG_SLUB_DEBUG */
/* @@@ -1857,8 -1800,6 +1857,8 @@@ static inline struct slab *alloc_slab_p
slab = folio_slab(folio); __folio_set_slab(folio); + /* Make the flag visible before any changes to folio->mapping */ + smp_wmb(); if (page_is_pfmemalloc(folio_page(folio, 0))) slab_set_pfmemalloc(slab);
@@@ -1940,7 -1881,7 +1940,7 @@@ static bool shuffle_freelist(struct kme return false;
freelist_count = oo_objects(s->oo); - pos = prandom_u32_max(freelist_count); + pos = get_random_u32_below(freelist_count);
page_limit = slab->objects * s->size; start = fixup_red_left(s, slab_address(slab)); @@@ -2058,11 -1999,17 +2058,11 @@@ static void __free_slab(struct kmem_cac int order = folio_order(folio); int pages = 1 << order;
- if (kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) { - void *p; - - slab_pad_check(s, slab); - for_each_object(p, s, slab_address(slab), slab->objects) - check_object(s, slab, p, SLUB_RED_INACTIVE); - } - __slab_clear_pfmemalloc(slab); - __folio_clear_slab(folio); folio->mapping = NULL; + /* Make the mapping reset visible before clearing the flag */ + smp_wmb(); + __folio_clear_slab(folio); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; unaccount_slab(slab, order, s); @@@ -2078,17 -2025,9 +2078,17 @@@ static void rcu_free_slab(struct rcu_he
static void free_slab(struct kmem_cache *s, struct slab *slab) { - if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) { + if (kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) { + void *p; + + slab_pad_check(s, slab); + for_each_object(p, s, slab_address(slab), slab->objects) + check_object(s, slab, p, SLUB_RED_INACTIVE); + } + + if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) call_rcu(&slab->rcu_head, rcu_free_slab); - } else + else __free_slab(s, slab); }
@@@ -2275,7 -2214,7 +2275,7 @@@ static void *get_partial_node(struct km if (!pfmemalloc_match(slab, pc->flags)) continue;
- if (kmem_cache_debug(s)) { + if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { object = alloc_single_from_partial(s, n, slab, pc->orig_size); if (object) @@@ -2390,8 -2329,6 +2390,8 @@@ static void *get_partial(struct kmem_ca return get_any_partial(s, pc); }
+#ifndef CONFIG_SLUB_TINY + #ifdef CONFIG_PREEMPTION /* * Calculate the next globally unique transaction for disambiguation @@@ -2405,7 -2342,7 +2405,7 @@@ * different cpus. */ #define TID_STEP 1 -#endif +#endif /* CONFIG_PREEMPTION */
static inline unsigned long next_tid(unsigned long tid) { @@@ -2474,7 -2411,7 +2474,7 @@@ static void init_kmem_cache_cpus(struc static void deactivate_slab(struct kmem_cache *s, struct slab *slab, void *freelist) { - enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE, M_FULL_NOLIST }; + enum slab_modes { M_NONE, M_PARTIAL, M_FREE, M_FULL_NOLIST }; struct kmem_cache_node *n = get_node(s, slab_nid(slab)); int free_delta = 0; enum slab_modes mode = M_NONE; @@@ -2550,6 -2487,14 +2550,6 @@@ redo * acquire_slab() will see a slab that is frozen */ spin_lock_irqsave(&n->list_lock, flags); - } else if (kmem_cache_debug_flags(s, SLAB_STORE_USER)) { - mode = M_FULL; - /* - * This also ensures that the scanning of full - * slabs from diagnostic functions will not see - * any frozen slabs. - */ - spin_lock_irqsave(&n->list_lock, flags); } else { mode = M_FULL_NOLIST; } @@@ -2559,7 -2504,7 +2559,7 @@@ old.freelist, old.counters, new.freelist, new.counters, "unfreezing slab")) { - if (mode == M_PARTIAL || mode == M_FULL) + if (mode == M_PARTIAL) spin_unlock_irqrestore(&n->list_lock, flags); goto redo; } @@@ -2573,6 -2518,10 +2573,6 @@@ stat(s, DEACTIVATE_EMPTY); discard_slab(s, slab); stat(s, FREE_SLAB); - } else if (mode == M_FULL) { - add_full(s, n, slab); - spin_unlock_irqrestore(&n->list_lock, flags); - stat(s, DEACTIVATE_FULL); } else if (mode == M_FULL_NOLIST) { stat(s, DEACTIVATE_FULL); } @@@ -2854,13 -2803,6 +2854,13 @@@ static int slub_cpu_dead(unsigned int c return 0; }
+#else /* CONFIG_SLUB_TINY */ +static inline void flush_all_cpus_locked(struct kmem_cache *s) { } +static inline void flush_all(struct kmem_cache *s) { } +static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { } +static inline int slub_cpu_dead(unsigned int cpu) { return 0; } +#endif /* CONFIG_SLUB_TINY */ + /* * Check if the objects in a per cpu structure fit numa * locality expectations. @@@ -2886,28 -2828,38 +2886,28 @@@ static inline unsigned long node_nr_obj }
/* Supports checking bulk free of a constructed freelist */ -static noinline void free_debug_processing( - struct kmem_cache *s, struct slab *slab, - void *head, void *tail, int bulk_cnt, - unsigned long addr) +static inline bool free_debug_processing(struct kmem_cache *s, + struct slab *slab, void *head, void *tail, int *bulk_cnt, + unsigned long addr, depot_stack_handle_t handle) { - struct kmem_cache_node *n = get_node(s, slab_nid(slab)); - struct slab *slab_free = NULL; + bool checks_ok = false; void *object = head; int cnt = 0; - unsigned long flags; - bool checks_ok = false; - depot_stack_handle_t handle = 0; - - if (s->flags & SLAB_STORE_USER) - handle = set_track_prepare(); - - spin_lock_irqsave(&n->list_lock, flags);
if (s->flags & SLAB_CONSISTENCY_CHECKS) { if (!check_slab(s, slab)) goto out; }
- if (slab->inuse < bulk_cnt) { + if (slab->inuse < *bulk_cnt) { slab_err(s, slab, "Slab has %d allocated objects but %d are to be freed\n", - slab->inuse, bulk_cnt); + slab->inuse, *bulk_cnt); goto out; }
next_object:
- if (++cnt > bulk_cnt) + if (++cnt > *bulk_cnt) goto out_cnt;
if (s->flags & SLAB_CONSISTENCY_CHECKS) { @@@ -2929,22 -2881,61 +2929,22 @@@ checks_ok = true;
out_cnt: - if (cnt != bulk_cnt) + if (cnt != *bulk_cnt) { slab_err(s, slab, "Bulk free expected %d objects but found %d\n", - bulk_cnt, cnt); - -out: - if (checks_ok) { - void *prior = slab->freelist; - - /* Perform the actual freeing while we still hold the locks */ - slab->inuse -= cnt; - set_freepointer(s, tail, prior); - slab->freelist = head; - - /* - * If the slab is empty, and node's partial list is full, - * it should be discarded anyway no matter it's on full or - * partial list. - */ - if (slab->inuse == 0 && n->nr_partial >= s->min_partial) - slab_free = slab; - - if (!prior) { - /* was on full list */ - remove_full(s, n, slab); - if (!slab_free) { - add_partial(n, slab, DEACTIVATE_TO_TAIL); - stat(s, FREE_ADD_PARTIAL); - } - } else if (slab_free) { - remove_partial(n, slab); - stat(s, FREE_REMOVE_PARTIAL); - } + *bulk_cnt, cnt); + *bulk_cnt = cnt; }
- if (slab_free) { - /* - * Update the counters while still holding n->list_lock to - * prevent spurious validation warnings - */ - dec_slabs_node(s, slab_nid(slab_free), slab_free->objects); - } - - spin_unlock_irqrestore(&n->list_lock, flags); +out:
if (!checks_ok) slab_fix(s, "Object at 0x%p not freed", object);
- if (slab_free) { - stat(s, FREE_SLAB); - free_slab(s, slab_free); - } + return checks_ok; } #endif /* CONFIG_SLUB_DEBUG */
-#if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS) +#if defined(CONFIG_SLUB_DEBUG) || defined(SLAB_SUPPORTS_SYSFS) static unsigned long count_partial(struct kmem_cache_node *n, int (*get_count)(struct slab *)) { @@@ -2958,12 -2949,12 +2958,12 @@@ spin_unlock_irqrestore(&n->list_lock, flags); return x; } -#endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */ +#endif /* CONFIG_SLUB_DEBUG || SLAB_SUPPORTS_SYSFS */
+#ifdef CONFIG_SLUB_DEBUG static noinline void slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) { -#ifdef CONFIG_SLUB_DEBUG static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); int node; @@@ -2994,11 -2985,8 +2994,11 @@@ pr_warn(" node %d: slabs: %ld, objs: %ld, free: %ld\n", node, nr_slabs, nr_objs, nr_free); } -#endif } +#else /* CONFIG_SLUB_DEBUG */ +static inline void +slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) { } +#endif
static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags) { @@@ -3008,7 -2996,6 +3008,7 @@@ return true; }
+#ifndef CONFIG_SLUB_TINY /* * Check the slab->freelist and either transfer the freelist to the * per cpu freelist or deactivate the slab. @@@ -3296,13 -3283,45 +3296,13 @@@ static void *__slab_alloc(struct kmem_c return p; }
-/* - * If the object has been wiped upon free, make sure it's fully initialized by - * zeroing out freelist pointer. - */ -static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s, - void *obj) -{ - if (unlikely(slab_want_init_on_free(s)) && obj) - memset((void *)((char *)kasan_reset_tag(obj) + s->offset), - 0, sizeof(void *)); -} - -/* - * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc) - * have the fastpath folded into their functions. So no function call - * overhead for requests that can be satisfied on the fastpath. - * - * The fastpath works by first checking if the lockless freelist can be used. - * If not then __slab_alloc is called for slow processing. - * - * Otherwise we can simply pick the next object from the lockless free list. - */ -static __always_inline void *slab_alloc_node(struct kmem_cache *s, struct list_lru *lru, +static __always_inline void *__slab_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node, unsigned long addr, size_t orig_size) { - void *object; struct kmem_cache_cpu *c; struct slab *slab; unsigned long tid; - struct obj_cgroup *objcg = NULL; - bool init = false; - - s = slab_pre_alloc_hook(s, lru, &objcg, 1, gfpflags); - if (!s) - return NULL; - - object = kfence_alloc(s, orig_size, gfpflags); - if (unlikely(object)) - goto out; + void *object;
redo: /* @@@ -3372,95 -3391,22 +3372,95 @@@ stat(s, ALLOC_FASTPATH); }
+ return object; +} +#else /* CONFIG_SLUB_TINY */ +static void *__slab_alloc_node(struct kmem_cache *s, + gfp_t gfpflags, int node, unsigned long addr, size_t orig_size) +{ + struct partial_context pc; + struct slab *slab; + void *object; + + pc.flags = gfpflags; + pc.slab = &slab; + pc.orig_size = orig_size; + object = get_partial(s, node, &pc); + + if (object) + return object; + + slab = new_slab(s, gfpflags, node); + if (unlikely(!slab)) { + slab_out_of_memory(s, gfpflags, node); + return NULL; + } + + object = alloc_single_from_new_slab(s, slab, orig_size); + + return object; +} +#endif /* CONFIG_SLUB_TINY */ + +/* + * If the object has been wiped upon free, make sure it's fully initialized by + * zeroing out freelist pointer. + */ +static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s, + void *obj) +{ + if (unlikely(slab_want_init_on_free(s)) && obj) + memset((void *)((char *)kasan_reset_tag(obj) + s->offset), + 0, sizeof(void *)); +} + +/* + * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc) + * have the fastpath folded into their functions. So no function call + * overhead for requests that can be satisfied on the fastpath. + * + * The fastpath works by first checking if the lockless freelist can be used. + * If not then __slab_alloc is called for slow processing. + * + * Otherwise we can simply pick the next object from the lockless free list. + */ +static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list_lru *lru, + gfp_t gfpflags, int node, unsigned long addr, size_t orig_size) +{ + void *object; + struct obj_cgroup *objcg = NULL; + bool init = false; + + s = slab_pre_alloc_hook(s, lru, &objcg, 1, gfpflags); + if (!s) + return NULL; + + object = kfence_alloc(s, orig_size, gfpflags); + if (unlikely(object)) + goto out; + + object = __slab_alloc_node(s, gfpflags, node, addr, orig_size); + maybe_wipe_obj_freeptr(s, object); init = slab_want_init_on_alloc(gfpflags, s);
out: - slab_post_alloc_hook(s, objcg, gfpflags, 1, &object, init); + /* + * When init equals 'true', like for kzalloc() family, only + * @orig_size bytes might be zeroed instead of s->object_size + */ + slab_post_alloc_hook(s, objcg, gfpflags, 1, &object, init, orig_size);
return object; }
-static __always_inline void *slab_alloc(struct kmem_cache *s, struct list_lru *lru, +static __fastpath_inline void *slab_alloc(struct kmem_cache *s, struct list_lru *lru, gfp_t gfpflags, unsigned long addr, size_t orig_size) { return slab_alloc_node(s, lru, gfpflags, NUMA_NO_NODE, addr, orig_size); }
-static __always_inline +static __fastpath_inline void *__kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru, gfp_t gfpflags) { @@@ -3502,67 -3448,6 +3502,67 @@@ void *kmem_cache_alloc_node(struct kmem } EXPORT_SYMBOL(kmem_cache_alloc_node);
+static noinline void free_to_partial_list( + struct kmem_cache *s, struct slab *slab, + void *head, void *tail, int bulk_cnt, + unsigned long addr) +{ + struct kmem_cache_node *n = get_node(s, slab_nid(slab)); + struct slab *slab_free = NULL; + int cnt = bulk_cnt; + unsigned long flags; + depot_stack_handle_t handle = 0; + + if (s->flags & SLAB_STORE_USER) + handle = set_track_prepare(); + + spin_lock_irqsave(&n->list_lock, flags); + + if (free_debug_processing(s, slab, head, tail, &cnt, addr, handle)) { + void *prior = slab->freelist; + + /* Perform the actual freeing while we still hold the locks */ + slab->inuse -= cnt; + set_freepointer(s, tail, prior); + slab->freelist = head; + + /* + * If the slab is empty, and node's partial list is full, + * it should be discarded anyway no matter it's on full or + * partial list. + */ + if (slab->inuse == 0 && n->nr_partial >= s->min_partial) + slab_free = slab; + + if (!prior) { + /* was on full list */ + remove_full(s, n, slab); + if (!slab_free) { + add_partial(n, slab, DEACTIVATE_TO_TAIL); + stat(s, FREE_ADD_PARTIAL); + } + } else if (slab_free) { + remove_partial(n, slab); + stat(s, FREE_REMOVE_PARTIAL); + } + } + + if (slab_free) { + /* + * Update the counters while still holding n->list_lock to + * prevent spurious validation warnings + */ + dec_slabs_node(s, slab_nid(slab_free), slab_free->objects); + } + + spin_unlock_irqrestore(&n->list_lock, flags); + + if (slab_free) { + stat(s, FREE_SLAB); + free_slab(s, slab_free); + } +} + /* * Slow path handling. This may still be called frequently since objects * have a longer lifetime than the cpu slabs in most processing loads. @@@ -3588,8 -3473,8 +3588,8 @@@ static void __slab_free(struct kmem_cac if (kfence_free(head)) return;
- if (kmem_cache_debug(s)) { - free_debug_processing(s, slab, head, tail, cnt, addr); + if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { + free_to_partial_list(s, slab, head, tail, cnt, addr); return; }
@@@ -3689,7 -3574,6 +3689,7 @@@ slab_empty discard_slab(s, slab); }
+#ifndef CONFIG_SLUB_TINY /* * Fastpath with forced inlining to produce a kfree and kmem_cache_free that * can perform fastpath freeing without additional function calls. @@@ -3764,18 -3648,8 +3764,18 @@@ redo } stat(s, FREE_FASTPATH); } +#else /* CONFIG_SLUB_TINY */ +static void do_slab_free(struct kmem_cache *s, + struct slab *slab, void *head, void *tail, + int cnt, unsigned long addr) +{ + void *tail_obj = tail ? : head; + + __slab_free(s, slab, head, tail_obj, cnt, addr); +} +#endif /* CONFIG_SLUB_TINY */
-static __always_inline void slab_free(struct kmem_cache *s, struct slab *slab, +static __fastpath_inline void slab_free(struct kmem_cache *s, struct slab *slab, void *head, void *tail, void **p, int cnt, unsigned long addr) { @@@ -3908,13 -3782,18 +3908,13 @@@ void kmem_cache_free_bulk(struct kmem_c } EXPORT_SYMBOL(kmem_cache_free_bulk);
-/* Note that interrupts must be enabled when calling this function. */ -int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, - void **p) +#ifndef CONFIG_SLUB_TINY +static inline int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, + size_t size, void **p, struct obj_cgroup *objcg) { struct kmem_cache_cpu *c; int i; - struct obj_cgroup *objcg = NULL;
- /* memcg and kmem_cache debug support */ - s = slab_pre_alloc_hook(s, NULL, &objcg, size, flags); - if (unlikely(!s)) - return false; /* * Drain objects in the per cpu slab, while disabling local * IRQs, which protects against PREEMPT and interrupts @@@ -3968,72 -3847,19 +3968,72 @@@ local_unlock_irq(&s->cpu_slab->lock); slub_put_cpu_ptr(s->cpu_slab);
- /* - * memcg and kmem_cache debug support and memory initialization. - * Done outside of the IRQ disabled fastpath loop. - */ - slab_post_alloc_hook(s, objcg, flags, size, p, - slab_want_init_on_alloc(flags, s)); return i; + error: slub_put_cpu_ptr(s->cpu_slab); - slab_post_alloc_hook(s, objcg, flags, i, p, false); + slab_post_alloc_hook(s, objcg, flags, i, p, false, s->object_size); + kmem_cache_free_bulk(s, i, p); + return 0; + +} +#else /* CONFIG_SLUB_TINY */ +static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, + size_t size, void **p, struct obj_cgroup *objcg) +{ + int i; + + for (i = 0; i < size; i++) { + void *object = kfence_alloc(s, s->object_size, flags); + + if (unlikely(object)) { + p[i] = object; + continue; + } + + p[i] = __slab_alloc_node(s, flags, NUMA_NO_NODE, + _RET_IP_, s->object_size); + if (unlikely(!p[i])) + goto error; + + maybe_wipe_obj_freeptr(s, p[i]); + } + + return i; + +error: + slab_post_alloc_hook(s, objcg, flags, i, p, false, s->object_size); kmem_cache_free_bulk(s, i, p); return 0; } +#endif /* CONFIG_SLUB_TINY */ + +/* Note that interrupts must be enabled when calling this function. */ +int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + void **p) +{ + int i; + struct obj_cgroup *objcg = NULL; + + if (!size) + return 0; + + /* memcg and kmem_cache debug support */ + s = slab_pre_alloc_hook(s, NULL, &objcg, size, flags); + if (unlikely(!s)) + return 0; + + i = __kmem_cache_alloc_bulk(s, flags, size, p, objcg); + + /* + * memcg and kmem_cache debug support and memory initialization. + * Done outside of the IRQ disabled fastpath loop. + */ + if (i != 0) + slab_post_alloc_hook(s, objcg, flags, size, p, + slab_want_init_on_alloc(flags, s), s->object_size); + return i; +} EXPORT_SYMBOL(kmem_cache_alloc_bulk);
@@@ -4057,8 -3883,7 +4057,8 @@@ * take the list_lock. */ static unsigned int slub_min_order; -static unsigned int slub_max_order = PAGE_ALLOC_COSTLY_ORDER; +static unsigned int slub_max_order = + IS_ENABLED(CONFIG_SLUB_TINY) ? 1 : PAGE_ALLOC_COSTLY_ORDER; static unsigned int slub_min_objects;
/* @@@ -4189,12 -4014,10 +4189,12 @@@ init_kmem_cache_node(struct kmem_cache_ #endif }
+#ifndef CONFIG_SLUB_TINY static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) { BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < - KMALLOC_SHIFT_HIGH * sizeof(struct kmem_cache_cpu)); + NR_KMALLOC_TYPES * KMALLOC_SHIFT_HIGH * + sizeof(struct kmem_cache_cpu));
/* * Must align to double word boundary for the double cmpxchg @@@ -4210,12 -4033,6 +4210,12 @@@
return 1; } +#else +static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) +{ + return 1; +} +#endif /* CONFIG_SLUB_TINY */
static struct kmem_cache *kmem_cache_node;
@@@ -4278,9 -4095,7 +4278,9 @@@ static void free_kmem_cache_nodes(struc void __kmem_cache_release(struct kmem_cache *s) { cache_random_seq_destroy(s); +#ifndef CONFIG_SLUB_TINY free_percpu(s->cpu_slab); +#endif free_kmem_cache_nodes(s); }
@@@ -4387,8 -4202,7 +4387,8 @@@ static int calculate_sizes(struct kmem_ */ s->inuse = size;
- if ((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) || + if (slub_debug_orig_size(s) || + (flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) || ((flags & SLAB_RED_ZONE) && s->object_size < sizeof(void *)) || s->ctor) { /* @@@ -5058,10 -4872,8 +5058,10 @@@ void __init kmem_cache_init(void
void __init kmem_cache_init_late(void) { +#ifndef CONFIG_SLUB_TINY flushwq = alloc_workqueue("slub_flushwq", WQ_MEM_RECLAIM, 0); WARN_ON(!flushwq); +#endif }
struct kmem_cache * @@@ -5112,7 -4924,7 +5112,7 @@@ int __kmem_cache_create(struct kmem_cac return 0; }
-#ifdef CONFIG_SYSFS +#ifdef SLAB_SUPPORTS_SYSFS static int count_inuse(struct slab *slab) { return slab->inuse; @@@ -5370,7 -5182,7 +5370,7 @@@ static void process_slab(struct loc_tra #endif /* CONFIG_DEBUG_FS */ #endif /* CONFIG_SLUB_DEBUG */
-#ifdef CONFIG_SYSFS +#ifdef SLAB_SUPPORTS_SYSFS enum slab_stat_type { SL_ALL, /* All slabs */ SL_PARTIAL, /* Only partially allocated slabs */ @@@ -5690,13 -5502,11 +5690,13 @@@ static ssize_t cache_dma_show(struct km SLAB_ATTR_RO(cache_dma); #endif
+#ifdef CONFIG_HARDENED_USERCOPY static ssize_t usersize_show(struct kmem_cache *s, char *buf) { return sysfs_emit(buf, "%u\n", s->usersize); } SLAB_ATTR_RO(usersize); +#endif
static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) { @@@ -5776,21 -5586,7 +5776,21 @@@ static ssize_t failslab_show(struct kme { return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB)); } -SLAB_ATTR_RO(failslab); + +static ssize_t failslab_store(struct kmem_cache *s, const char *buf, + size_t length) +{ + if (s->refcount > 1) + return -EINVAL; + + if (buf[0] == '1') + WRITE_ONCE(s->flags, s->flags | SLAB_FAILSLAB); + else + WRITE_ONCE(s->flags, s->flags & ~SLAB_FAILSLAB); + + return length; +} +SLAB_ATTR(failslab); #endif
static ssize_t shrink_show(struct kmem_cache *s, char *buf) @@@ -6007,9 -5803,7 +6007,9 @@@ static struct attribute *slab_attrs[] #ifdef CONFIG_FAILSLAB &failslab_attr.attr, #endif +#ifdef CONFIG_HARDENED_USERCOPY &usersize_attr.attr, +#endif #ifdef CONFIG_KFENCE &skip_kfence_attr.attr, #endif @@@ -6126,6 -5920,11 +6126,6 @@@ static int sysfs_slab_add(struct kmem_c struct kset *kset = cache_kset(s); int unmergeable = slab_unmergeable(s);
- if (!kset) { - kobject_init(&s->kobj, &slab_ktype); - return 0; - } - if (!unmergeable && disable_higher_order_debug && (slub_debug & DEBUG_METADATA_FLAGS)) unmergeable = 1; @@@ -6255,8 -6054,9 +6255,8 @@@ static int __init slab_sysfs_init(void mutex_unlock(&slab_mutex); return 0; } - -__initcall(slab_sysfs_init); -#endif /* CONFIG_SYSFS */ +late_initcall(slab_sysfs_init); +#endif /* SLAB_SUPPORTS_SYSFS */
#if defined(CONFIG_SLUB_DEBUG) && defined(CONFIG_DEBUG_FS) static int slab_debugfs_show(struct seq_file *seq, void *v) diff --combined mm/swapfile.c index 72e481aacd5d,4ee31056d3f8..3eedf7ae957f --- a/mm/swapfile.c +++ b/mm/swapfile.c @@@ -772,8 -772,7 +772,7 @@@ static void set_cluster_next(struct swa /* No free swap slots available */ if (si->highest_bit <= si->lowest_bit) return; - next = si->lowest_bit + - prandom_u32_max(si->highest_bit - si->lowest_bit + 1); + next = get_random_u32_inclusive(si->lowest_bit, si->highest_bit); next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES); next = max_t(unsigned int, next, si->lowest_bit); } @@@ -973,23 -972,23 +972,23 @@@ done scan: spin_unlock(&si->lock); while (++offset <= READ_ONCE(si->highest_bit)) { - if (swap_offset_available_and_locked(si, offset)) - goto checks; if (unlikely(--latency_ration < 0)) { cond_resched(); latency_ration = LATENCY_LIMIT; scanned_many = true; } + if (swap_offset_available_and_locked(si, offset)) + goto checks; } offset = si->lowest_bit; while (offset < scan_base) { - if (swap_offset_available_and_locked(si, offset)) - goto checks; if (unlikely(--latency_ration < 0)) { cond_resched(); latency_ration = LATENCY_LIMIT; scanned_many = true; } + if (swap_offset_available_and_locked(si, offset)) + goto checks; offset++; } spin_lock(&si->lock); @@@ -3089,7 -3088,7 +3088,7 @@@ SYSCALL_DEFINE2(swapon, const char __us */ for_each_possible_cpu(cpu) { per_cpu(*p->cluster_next_cpu, cpu) = - 1 + prandom_u32_max(p->highest_bit); + get_random_u32_inclusive(1, p->highest_bit); } nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
diff --combined net/802/mrp.c index 6c927d4b35f0,8c6f0381023b..66fcbf23b486 --- a/net/802/mrp.c +++ b/net/802/mrp.c @@@ -592,7 -592,7 +592,7 @@@ static void mrp_join_timer_arm(struct m { unsigned long delay;
- delay = prandom_u32_max(msecs_to_jiffies(mrp_join_time)); + delay = get_random_u32_below(msecs_to_jiffies(mrp_join_time)); mod_timer(&app->join_timer, jiffies + delay); }
@@@ -606,10 -606,7 +606,10 @@@ static void mrp_join_timer(struct timer spin_unlock(&app->lock);
mrp_queue_xmit(app); - mrp_join_timer_arm(app); + spin_lock(&app->lock); + if (likely(app->active)) + mrp_join_timer_arm(app); + spin_unlock(&app->lock); }
static void mrp_periodic_timer_arm(struct mrp_applicant *app) @@@ -623,12 -620,11 +623,12 @@@ static void mrp_periodic_timer(struct t struct mrp_applicant *app = from_timer(app, t, periodic_timer);
spin_lock(&app->lock); - mrp_mad_event(app, MRP_EVENT_PERIODIC); - mrp_pdu_queue(app); + if (likely(app->active)) { + mrp_mad_event(app, MRP_EVENT_PERIODIC); + mrp_pdu_queue(app); + mrp_periodic_timer_arm(app); + } spin_unlock(&app->lock); - - mrp_periodic_timer_arm(app); }
static int mrp_pdu_parse_end_mark(struct sk_buff *skb, int *offset) @@@ -876,7 -872,6 +876,7 @@@ int mrp_init_applicant(struct net_devic app->dev = dev; app->app = appl; app->mad = RB_ROOT; + app->active = true; spin_lock_init(&app->lock); skb_queue_head_init(&app->queue); rcu_assign_pointer(dev->mrp_port->applicants[appl->type], app); @@@ -905,9 -900,6 +905,9 @@@ void mrp_uninit_applicant(struct net_de
RCU_INIT_POINTER(port->applicants[appl->type], NULL);
+ spin_lock_bh(&app->lock); + app->active = false; + spin_unlock_bh(&app->lock); /* Delete timer and generate a final TX event to flush out * all pending messages before the applicant is gone. */ diff --combined net/bluetooth/mgmt.c index 0dd30a3beb77,81ce668b0b77..d2ea8e19aa1b --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@@ -7373,9 -7373,8 +7373,8 @@@ static int get_conn_info(struct sock *s /* To avoid client trying to guess when to poll again for information we * calculate conn info age as random value between min/max set in hdev. */ - conn_info_age = hdev->conn_info_min_age + - prandom_u32_max(hdev->conn_info_max_age - - hdev->conn_info_min_age); + conn_info_age = get_random_u32_inclusive(hdev->conn_info_min_age, + hdev->conn_info_max_age - 1);
/* Query controller to refresh cached values if they are too old or were * never read. @@@ -8859,7 -8858,7 +8858,7 @@@ static int add_ext_adv_params(struct so * extra parameters we don't know about will be ignored in this request. */ if (data_len < MGMT_ADD_EXT_ADV_PARAMS_MIN_SIZE) - return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING, + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_PARAMS, MGMT_STATUS_INVALID_PARAMS);
flags = __le32_to_cpu(cp->flags); diff --combined net/can/j1939/transport.c index f26f4cfa9e63,67d36776aff4..5c722b55fe23 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@@ -987,7 -987,7 +987,7 @@@ static int j1939_session_tx_eoma(struc /* wait for the EOMA packet to come in */ j1939_tp_set_rxtimeout(session, 1250);
- netdev_dbg(session->priv->ndev, "%p: 0x%p\n", __func__, session); + netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session);
return 0; } @@@ -1168,7 -1168,7 +1168,7 @@@ static enum hrtimer_restart j1939_tp_tx if (session->tx_retry < J1939_XTP_TX_RETRY_LIMIT) { session->tx_retry++; j1939_tp_schedule_txtimer(session, - 10 + prandom_u32_max(16)); + 10 + get_random_u32_below(16)); } else { netdev_alert(priv->ndev, "%s: 0x%p: tx retry count reached\n", __func__, session); diff --combined net/core/neighbour.c index 952a54763358,ba92762de525..f00a79fc301b --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@@ -111,7 -111,7 +111,7 @@@ static void neigh_cleanup_and_release(s
unsigned long neigh_rand_reach_time(unsigned long base) { - return base ? prandom_u32_max(base) + (base >> 1) : 0; + return base ? get_random_u32_below(base) + (base >> 1) : 0; } EXPORT_SYMBOL(neigh_rand_reach_time);
@@@ -307,31 -307,7 +307,31 @@@ static int neigh_del_timer(struct neigh return 0; }
-static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net) +static struct neigh_parms *neigh_get_dev_parms_rcu(struct net_device *dev, + int family) +{ + switch (family) { + case AF_INET: + return __in_dev_arp_parms_get_rcu(dev); + case AF_INET6: + return __in6_dev_nd_parms_get_rcu(dev); + } + return NULL; +} + +static void neigh_parms_qlen_dec(struct net_device *dev, int family) +{ + struct neigh_parms *p; + + rcu_read_lock(); + p = neigh_get_dev_parms_rcu(dev, family); + if (p) + p->qlen--; + rcu_read_unlock(); +} + +static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net, + int family) { struct sk_buff_head tmp; unsigned long flags; @@@ -345,7 -321,13 +345,7 @@@ struct net_device *dev = skb->dev;
if (net == NULL || net_eq(dev_net(dev), net)) { - struct in_device *in_dev; - - rcu_read_lock(); - in_dev = __in_dev_get_rcu(dev); - if (in_dev) - in_dev->arp_parms->qlen--; - rcu_read_unlock(); + neigh_parms_qlen_dec(dev, family); __skb_unlink(skb, list); __skb_queue_tail(&tmp, skb); } @@@ -427,8 -409,7 +427,8 @@@ static int __neigh_ifdown(struct neigh_ write_lock_bh(&tbl->lock); neigh_flush_dev(tbl, dev, skip_perm); pneigh_ifdown_and_unlock(tbl, dev); - pneigh_queue_purge(&tbl->proxy_queue, dev ? dev_net(dev) : NULL); + pneigh_queue_purge(&tbl->proxy_queue, dev ? dev_net(dev) : NULL, + tbl->family); if (skb_queue_empty_lockless(&tbl->proxy_queue)) del_timer_sync(&tbl->proxy_timer); return 0; @@@ -1640,8 -1621,13 +1640,8 @@@ static void neigh_proxy_process(struct
if (tdif <= 0) { struct net_device *dev = skb->dev; - struct in_device *in_dev;
- rcu_read_lock(); - in_dev = __in_dev_get_rcu(dev); - if (in_dev) - in_dev->arp_parms->qlen--; - rcu_read_unlock(); + neigh_parms_qlen_dec(dev, tbl->family); __skb_unlink(skb, &tbl->proxy_queue);
if (tbl->proxy_redo && netif_running(dev)) { @@@ -1666,7 -1652,7 +1666,7 @@@ void pneigh_enqueue(struct neigh_table struct sk_buff *skb) { unsigned long sched_next = jiffies + - prandom_u32_max(NEIGH_VAR(p, PROXY_DELAY)); + get_random_u32_below(NEIGH_VAR(p, PROXY_DELAY));
if (p->qlen > NEIGH_VAR(p, PROXY_QLEN)) { kfree_skb(skb); @@@ -1835,7 -1821,7 +1835,7 @@@ int neigh_table_clear(int index, struc cancel_delayed_work_sync(&tbl->managed_work); cancel_delayed_work_sync(&tbl->gc_work); del_timer_sync(&tbl->proxy_timer); - pneigh_queue_purge(&tbl->proxy_queue, NULL); + pneigh_queue_purge(&tbl->proxy_queue, NULL, tbl->family); neigh_ifdown(tbl, NULL); if (atomic_read(&tbl->entries)) pr_crit("neighbour leakage\n"); @@@ -3553,6 -3539,18 +3553,6 @@@ static int proc_unres_qlen(struct ctl_t return ret; }
-static struct neigh_parms *neigh_get_dev_parms_rcu(struct net_device *dev, - int family) -{ - switch (family) { - case AF_INET: - return __in_dev_arp_parms_get_rcu(dev); - case AF_INET6: - return __in6_dev_nd_parms_get_rcu(dev); - } - return NULL; -} - static void neigh_copy_dflt_parms(struct net *net, struct neigh_parms *p, int index) { diff --combined net/ipv4/inet_connection_sock.c index 4a34bc7cb15e,f22051219b50..b366ab9148f2 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@@ -314,7 -314,7 +314,7 @@@ other_half_scan if (likely(remaining > 1)) remaining &= ~1U;
- offset = prandom_u32_max(remaining); + offset = get_random_u32_below(remaining); /* __inet_hash_connect() favors ports having @low parity * We do the opposite to not pollute connect() users. */ @@@ -471,11 -471,11 +471,11 @@@ int inet_csk_get_port(struct sock *sk, bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN; bool found_port = false, check_bind_conflict = true; bool bhash_created = false, bhash2_created = false; + int ret = -EADDRINUSE, port = snum, l3mdev; struct inet_bind_hashbucket *head, *head2; struct inet_bind2_bucket *tb2 = NULL; struct inet_bind_bucket *tb = NULL; bool head2_lock_acquired = false; - int ret = 1, port = snum, l3mdev; struct net *net = sock_net(sk);
l3mdev = inet_sk_bound_l3mdev(sk); @@@ -1186,7 -1186,7 +1186,7 @@@ int inet_csk_listen_start(struct sock * { struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet = inet_sk(sk); - int err = -EADDRINUSE; + int err;
reqsk_queue_alloc(&icsk->icsk_accept_queue);
@@@ -1202,8 -1202,7 +1202,8 @@@ * after validation is complete. */ inet_sk_state_store(sk, TCP_LISTEN); - if (!sk->sk_prot->get_port(sk, inet->inet_num)) { + err = sk->sk_prot->get_port(sk, inet->inet_num); + if (!err) { inet->inet_sport = htons(inet->inet_num);
sk_dst_reset(sk); diff --combined net/ipv4/inet_hashtables.c index 3cec471a2cd2,a879ec1a267d..d039b4e732a3 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@@ -858,80 -858,34 +858,80 @@@ inet_bhash2_addr_any_hashbucket(const s return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)]; }
-int inet_bhash2_update_saddr(struct inet_bind_hashbucket *prev_saddr, struct sock *sk) +static void inet_update_saddr(struct sock *sk, void *saddr, int family) +{ + if (family == AF_INET) { + inet_sk(sk)->inet_saddr = *(__be32 *)saddr; + sk_rcv_saddr_set(sk, inet_sk(sk)->inet_saddr); + } +#if IS_ENABLED(CONFIG_IPV6) + else { + sk->sk_v6_rcv_saddr = *(struct in6_addr *)saddr; + } +#endif +} + +static int __inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family, bool reset) { struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk); + struct inet_bind_hashbucket *head, *head2; struct inet_bind2_bucket *tb2, *new_tb2; int l3mdev = inet_sk_bound_l3mdev(sk); - struct inet_bind_hashbucket *head2; int port = inet_sk(sk)->inet_num; struct net *net = sock_net(sk); + int bhash; + + if (!inet_csk(sk)->icsk_bind2_hash) { + /* Not bind()ed before. */ + if (reset) + inet_reset_saddr(sk); + else + inet_update_saddr(sk, saddr, family); + + return 0; + }
/* Allocate a bind2 bucket ahead of time to avoid permanently putting * the bhash2 table in an inconsistent state if a new tb2 bucket * allocation fails. */ new_tb2 = kmem_cache_alloc(hinfo->bind2_bucket_cachep, GFP_ATOMIC); - if (!new_tb2) + if (!new_tb2) { + if (reset) { + /* The (INADDR_ANY, port) bucket might have already + * been freed, then we cannot fixup icsk_bind2_hash, + * so we give up and unlink sk from bhash/bhash2 not + * to leave inconsistency in bhash2. + */ + inet_put_port(sk); + inet_reset_saddr(sk); + } + return -ENOMEM; + }
+ bhash = inet_bhashfn(net, port, hinfo->bhash_size); + head = &hinfo->bhash[bhash]; head2 = inet_bhashfn_portaddr(hinfo, sk, net, port);
- if (prev_saddr) { - spin_lock_bh(&prev_saddr->lock); - __sk_del_bind2_node(sk); - inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, - inet_csk(sk)->icsk_bind2_hash); - spin_unlock_bh(&prev_saddr->lock); - } + /* If we change saddr locklessly, another thread + * iterating over bhash might see corrupted address. + */ + spin_lock_bh(&head->lock);
- spin_lock_bh(&head2->lock); + spin_lock(&head2->lock); + __sk_del_bind2_node(sk); + inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, inet_csk(sk)->icsk_bind2_hash); + spin_unlock(&head2->lock); + + if (reset) + inet_reset_saddr(sk); + else + inet_update_saddr(sk, saddr, family); + + head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); + + spin_lock(&head2->lock); tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk); if (!tb2) { tb2 = new_tb2; @@@ -939,40 -893,26 +939,40 @@@ } sk_add_bind2_node(sk, &tb2->owners); inet_csk(sk)->icsk_bind2_hash = tb2; - spin_unlock_bh(&head2->lock); + spin_unlock(&head2->lock); + + spin_unlock_bh(&head->lock);
if (tb2 != new_tb2) kmem_cache_free(hinfo->bind2_bucket_cachep, new_tb2);
return 0; } + +int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family) +{ + return __inet_bhash2_update_saddr(sk, saddr, family, false); +} EXPORT_SYMBOL_GPL(inet_bhash2_update_saddr);
+void inet_bhash2_reset_saddr(struct sock *sk) +{ + if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) + __inet_bhash2_update_saddr(sk, NULL, 0, true); +} +EXPORT_SYMBOL_GPL(inet_bhash2_reset_saddr); + /* RFC 6056 3.3.4. Algorithm 4: Double-Hash Port Selection Algorithm * Note that we use 32bit integers (vs RFC 'short integers') * because 2^16 is not a multiple of num_ephemeral and this * property might be used by clever attacker. + * * RFC claims using TABLE_LENGTH=10 buckets gives an improvement, though - * attacks were since demonstrated, thus we use 65536 instead to really - * give more isolation and privacy, at the expense of 256kB of kernel - * memory. + * attacks were since demonstrated, thus we use 65536 by default instead + * to really give more isolation and privacy, at the expense of 256kB + * of kernel memory. */ -#define INET_TABLE_PERTURB_SHIFT 16 -#define INET_TABLE_PERTURB_SIZE (1 << INET_TABLE_PERTURB_SHIFT) +#define INET_TABLE_PERTURB_SIZE (1 << CONFIG_INET_TABLE_PERTURB_ORDER) static u32 *table_perturb;
int __inet_hash_connect(struct inet_timewait_death_row *death_row, @@@ -1097,7 -1037,7 +1097,7 @@@ ok * on low contention the randomness is maximal and on high contention * it may be inexistent. */ - i = max_t(int, i, prandom_u32_max(8) * 2); + i = max_t(int, i, get_random_u32_below(8) * 2); WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + 2);
/* Head lock still held and bh's disabled */ diff --combined net/ipv4/tcp_input.c index 1efacbe948da,23cf418efe4f..cc072d2cfcd8 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@@ -3646,7 -3646,8 +3646,8 @@@ static void tcp_send_challenge_ack(stru u32 half = (ack_limit + 1) >> 1;
WRITE_ONCE(net->ipv4.tcp_challenge_timestamp, now); - WRITE_ONCE(net->ipv4.tcp_challenge_count, half + prandom_u32_max(ack_limit)); + WRITE_ONCE(net->ipv4.tcp_challenge_count, + get_random_u32_inclusive(half, ack_limit + half - 1)); } count = READ_ONCE(net->ipv4.tcp_challenge_count); if (count > 0) { @@@ -4764,8 -4765,8 +4765,8 @@@ static void tcp_ofo_queue(struct sock * } }
-static bool tcp_prune_ofo_queue(struct sock *sk); -static int tcp_prune_queue(struct sock *sk); +static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb); +static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb);
static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, unsigned int size) @@@ -4773,11 -4774,11 +4774,11 @@@ if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || !sk_rmem_schedule(sk, skb, size)) {
- if (tcp_prune_queue(sk) < 0) + if (tcp_prune_queue(sk, skb) < 0) return -1;
while (!sk_rmem_schedule(sk, skb, size)) { - if (!tcp_prune_ofo_queue(sk)) + if (!tcp_prune_ofo_queue(sk, skb)) return -1; } } @@@ -5329,8 -5330,6 +5330,8 @@@ new_range * Clean the out-of-order queue to make room. * We drop high sequences packets to : * 1) Let a chance for holes to be filled. + * This means we do not drop packets from ooo queue if their sequence + * is before incoming packet sequence. * 2) not add too big latencies if thousands of packets sit there. * (But if application shrinks SO_RCVBUF, we could still end up * freeing whole queue here) @@@ -5338,31 -5337,24 +5339,31 @@@ * * Return true if queue has shrunk. */ -static bool tcp_prune_ofo_queue(struct sock *sk) +static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb) { struct tcp_sock *tp = tcp_sk(sk); struct rb_node *node, *prev; + bool pruned = false; int goal;
if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) return false;
- NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED); goal = sk->sk_rcvbuf >> 3; node = &tp->ooo_last_skb->rbnode; + do { + struct sk_buff *skb = rb_to_skb(node); + + /* If incoming skb would land last in ofo queue, stop pruning. */ + if (after(TCP_SKB_CB(in_skb)->seq, TCP_SKB_CB(skb)->seq)) + break; + pruned = true; prev = rb_prev(node); rb_erase(node, &tp->out_of_order_queue); - goal -= rb_to_skb(node)->truesize; - tcp_drop_reason(sk, rb_to_skb(node), - SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE); + goal -= skb->truesize; + tcp_drop_reason(sk, skb, SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE); + tp->ooo_last_skb = rb_to_skb(prev); if (!prev || goal <= 0) { if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && !tcp_under_memory_pressure(sk)) @@@ -5371,18 -5363,16 +5372,18 @@@ } node = prev; } while (node); - tp->ooo_last_skb = rb_to_skb(prev);
- /* Reset SACK state. A conforming SACK implementation will - * do the same at a timeout based retransmit. When a connection - * is in a sad state like this, we care only about integrity - * of the connection not performance. - */ - if (tp->rx_opt.sack_ok) - tcp_sack_reset(&tp->rx_opt); - return true; + if (pruned) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED); + /* Reset SACK state. A conforming SACK implementation will + * do the same at a timeout based retransmit. When a connection + * is in a sad state like this, we care only about integrity + * of the connection not performance. + */ + if (tp->rx_opt.sack_ok) + tcp_sack_reset(&tp->rx_opt); + } + return pruned; }
/* Reduce allocated memory if we can, trying to get @@@ -5392,7 -5382,7 +5393,7 @@@ * until the socket owning process reads some of the data * to stabilize the situation. */ -static int tcp_prune_queue(struct sock *sk) +static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb) { struct tcp_sock *tp = tcp_sk(sk);
@@@ -5419,7 -5409,7 +5420,7 @@@ /* Collapsing did not help, destructive actions follow. * This must not ever occur. */
- tcp_prune_ofo_queue(sk); + tcp_prune_ofo_queue(sk, in_skb);
if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) return 0; @@@ -6841,18 -6831,10 +6842,18 @@@ static bool tcp_syn_flood_action(const #endif __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
- if (!queue->synflood_warned && syncookies != 2 && - xchg(&queue->synflood_warned, 1) == 0) - net_info_ratelimited("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n", - proto, sk->sk_num, msg); + if (!READ_ONCE(queue->synflood_warned) && syncookies != 2 && + xchg(&queue->synflood_warned, 1) == 0) { + if (IS_ENABLED(CONFIG_IPV6) && sk->sk_family == AF_INET6) { + net_info_ratelimited("%s: Possible SYN flooding on port [%pI6c]:%u. %s.\n", + proto, inet6_rcv_saddr(sk), + sk->sk_num, msg); + } else { + net_info_ratelimited("%s: Possible SYN flooding on port %pI4:%u. %s.\n", + proto, &sk->sk_rcv_saddr, + sk->sk_num, msg); + } + }
return want_cookie; } diff --combined net/ipv6/addrconf.c index c338dfb05d2c,d720f6f5de3f..f7a84a4acffc --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@@ -104,7 -104,7 +104,7 @@@ static inline u32 cstamp_delta(unsigne static inline s32 rfc3315_s14_backoff_init(s32 irt) { /* multiply 'initial retransmission time' by 0.9 .. 1.1 */ - u64 tmp = (900000 + prandom_u32_max(200001)) * (u64)irt; + u64 tmp = get_random_u32_inclusive(900000, 1100000) * (u64)irt; do_div(tmp, 1000000); return (s32)tmp; } @@@ -112,11 -112,11 +112,11 @@@ static inline s32 rfc3315_s14_backoff_update(s32 rt, s32 mrt) { /* multiply 'retransmission timeout' by 1.9 .. 2.1 */ - u64 tmp = (1900000 + prandom_u32_max(200001)) * (u64)rt; + u64 tmp = get_random_u32_inclusive(1900000, 2100000) * (u64)rt; do_div(tmp, 1000000); if ((s32)tmp > mrt) { /* multiply 'maximum retransmission time' by 0.9 .. 1.1 */ - tmp = (900000 + prandom_u32_max(200001)) * (u64)mrt; + tmp = get_random_u32_inclusive(900000, 1100000) * (u64)mrt; do_div(tmp, 1000000); } return (s32)tmp; @@@ -3320,7 -3320,7 +3320,7 @@@ static void addrconf_addr_gen(struct in return;
/* no link local addresses on devices flagged as slaves */ - if (idev->dev->flags & IFF_SLAVE) + if (idev->dev->priv_flags & IFF_NO_ADDRCONF) return;
ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); @@@ -3560,7 -3560,7 +3560,7 @@@ static int addrconf_notify(struct notif if (idev && idev->cnf.disable_ipv6) break;
- if (dev->flags & IFF_SLAVE) { + if (dev->priv_flags & IFF_NO_ADDRCONF) { if (event == NETDEV_UP && !IS_ERR_OR_NULL(idev) && dev->flags & IFF_UP && dev->flags & IFF_MULTICAST) ipv6_mc_up(idev); @@@ -3967,7 -3967,7 +3967,7 @@@ static void addrconf_dad_kick(struct in if (ifp->flags & IFA_F_OPTIMISTIC) rand_num = 0; else - rand_num = prandom_u32_max(idev->cnf.rtr_solicit_delay ?: 1); + rand_num = get_random_u32_below(idev->cnf.rtr_solicit_delay ? : 1);
nonce = 0; if (idev->cnf.enhanced_dad || diff --combined net/netfilter/nf_conntrack_core.c index 5c3cf0834af0,8703812405eb..496c4920505b --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@@ -211,24 -211,28 +211,24 @@@ static u32 hash_conntrack_raw(const str unsigned int zoneid, const struct net *net) { - struct { - struct nf_conntrack_man src; - union nf_inet_addr dst_addr; - unsigned int zone; - u32 net_mix; - u16 dport; - u16 proto; - } __aligned(SIPHASH_ALIGNMENT) combined; + u64 a, b, c, d;
get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd));
- memset(&combined, 0, sizeof(combined)); + /* The direction must be ignored, handle usable tuplehash members manually */ + a = (u64)tuple->src.u3.all[0] << 32 | tuple->src.u3.all[3]; + b = (u64)tuple->dst.u3.all[0] << 32 | tuple->dst.u3.all[3];
- /* The direction must be ignored, so handle usable members manually. */ - combined.src = tuple->src; - combined.dst_addr = tuple->dst.u3; - combined.zone = zoneid; - combined.net_mix = net_hash_mix(net); - combined.dport = (__force __u16)tuple->dst.u.all; - combined.proto = tuple->dst.protonum; + c = (__force u64)tuple->src.u.all << 32 | (__force u64)tuple->dst.u.all << 16; + c |= tuple->dst.protonum;
- return (u32)siphash(&combined, sizeof(combined), &nf_conntrack_hash_rnd); + d = (u64)zoneid << 32 | net_hash_mix(net); + + /* IPv4: u3.all[1,2,3] == 0 */ + c ^= (u64)tuple->src.u3.all[1] << 32 | tuple->src.u3.all[2]; + d += (u64)tuple->dst.u3.all[1] << 32 | tuple->dst.u3.all[2]; + + return (u32)siphash_4u64(a, b, c, d, &nf_conntrack_hash_rnd); }
static u32 scale_hash(u32 hash) @@@ -887,7 -891,7 +887,7 @@@ nf_conntrack_hash_check_insert(struct n zone = nf_ct_zone(ct);
if (!nf_ct_ext_valid_pre(ct->ext)) { - NF_CT_STAT_INC(net, insert_failed); + NF_CT_STAT_INC_ATOMIC(net, insert_failed); return -ETIMEDOUT; }
@@@ -902,7 -906,7 +902,7 @@@ nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY)); } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
- max_chainlen = MIN_CHAINLEN + prandom_u32_max(MAX_CHAINLEN); + max_chainlen = MIN_CHAINLEN + get_random_u32_below(MAX_CHAINLEN);
/* See if there's one in the list already, including reverse */ hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) { @@@ -934,7 -938,7 +934,7 @@@
if (!nf_ct_ext_valid_post(ct->ext)) { nf_ct_kill(ct); - NF_CT_STAT_INC(net, drop); + NF_CT_STAT_INC_ATOMIC(net, drop); return -ETIMEDOUT; }
@@@ -1223,7 -1227,7 +1223,7 @@@ __nf_conntrack_confirm(struct sk_buff * goto dying; }
- max_chainlen = MIN_CHAINLEN + prandom_u32_max(MAX_CHAINLEN); + max_chainlen = MIN_CHAINLEN + get_random_u32_below(MAX_CHAINLEN); /* See if there's one in the list already, including reverse: NAT could have grabbed it without realizing, since we're not in the hash. If there is, we lost race. */ @@@ -1271,7 -1275,7 +1271,7 @@@ chaintoolong */ if (!nf_ct_ext_valid_post(ct->ext)) { nf_ct_kill(ct); - NF_CT_STAT_INC(net, drop); + NF_CT_STAT_INC_ATOMIC(net, drop); return NF_DROP; }
@@@ -1777,7 -1781,7 +1777,7 @@@ init_conntrack(struct net *net, struct }
#ifdef CONFIG_NF_CONNTRACK_MARK - ct->mark = exp->master->mark; + ct->mark = READ_ONCE(exp->master->mark); #endif #ifdef CONFIG_NF_CONNTRACK_SECMARK ct->secmark = exp->master->secmark; diff --combined net/netlink/af_netlink.c index d73091f6bb0f,7a401d94463a..bca2a470ccad --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@@ -812,17 -812,6 +812,17 @@@ static int netlink_release(struct socke }
sock_prot_inuse_add(sock_net(sk), &netlink_proto, -1); + + /* Because struct net might disappear soon, do not keep a pointer. */ + if (!sk->sk_net_refcnt && sock_net(sk) != &init_net) { + __netns_tracker_free(sock_net(sk), &sk->ns_tracker, false); + /* Because of deferred_put_nlk_sk and use of work queue, + * it is possible netns will be freed before this socket. + */ + sock_net_set(sk, &init_net); + __netns_tracker_alloc(&init_net, &sk->ns_tracker, + false, GFP_KERNEL); + } call_rcu(&nlk->rcu, deferred_put_nlk_sk); return 0; } @@@ -846,7 -835,7 +846,7 @@@ retry /* Bind collision, search negative portid values. */ if (rover == -4096) /* rover will be in range [S32_MIN, -4097] */ - rover = S32_MIN + prandom_u32_max(-4096 - S32_MIN); + rover = S32_MIN + get_random_u32_below(-4096 - S32_MIN); else if (rover >= -4096) rover = -4097; portid = rover--; @@@ -2499,24 -2488,19 +2499,24 @@@ void netlink_ack(struct sk_buff *in_skb flags |= NLM_F_ACK_TLVS;
skb = nlmsg_new(payload + tlvlen, GFP_KERNEL); - if (!skb) { - NETLINK_CB(in_skb).sk->sk_err = ENOBUFS; - sk_error_report(NETLINK_CB(in_skb).sk); - return; - } + if (!skb) + goto err_skb;
rep = nlmsg_put(skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, - NLMSG_ERROR, payload, flags); + NLMSG_ERROR, sizeof(*errmsg), flags); + if (!rep) + goto err_bad_put; errmsg = nlmsg_data(rep); errmsg->error = err; - unsafe_memcpy(&errmsg->msg, nlh, payload > sizeof(*errmsg) - ? nlh->nlmsg_len : sizeof(*nlh), - /* Bounds checked by the skb layer. */); + errmsg->msg = *nlh; + + if (!(flags & NLM_F_CAPPED)) { + if (!nlmsg_append(skb, nlmsg_len(nlh))) + goto err_bad_put; + + memcpy(nlmsg_data(&errmsg->msg), nlmsg_data(nlh), + nlmsg_len(nlh)); + }
if (tlvlen) netlink_ack_tlv_fill(in_skb, skb, nlh, err, extack); @@@ -2524,14 -2508,6 +2524,14 @@@ nlmsg_end(skb, rep);
nlmsg_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).portid); + + return; + +err_bad_put: + nlmsg_free(skb); +err_skb: + NETLINK_CB(in_skb).sk->sk_err = ENOBUFS; + sk_error_report(NETLINK_CB(in_skb).sk); } EXPORT_SYMBOL(netlink_ack);
diff --combined net/packet/af_packet.c index 41c4ccc3a5d6,51a47ade92e8..b5ab98ca2511 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@@ -1350,7 -1350,7 +1350,7 @@@ static bool fanout_flow_is_huge(struct if (READ_ONCE(history[i]) == rxhash) count++;
- victim = prandom_u32_max(ROLLOVER_HLEN); + victim = get_random_u32_below(ROLLOVER_HLEN);
/* Avoid dirtying the cache line if possible */ if (READ_ONCE(history[victim]) != rxhash) @@@ -1386,7 -1386,7 +1386,7 @@@ static unsigned int fanout_demux_rnd(st struct sk_buff *skb, unsigned int num) { - return prandom_u32_max(num); + return get_random_u32_below(num); }
static unsigned int fanout_demux_rollover(struct packet_fanout *f, @@@ -1777,7 -1777,6 +1777,7 @@@ static int fanout_add(struct sock *sk, match->prot_hook.af_packet_net = read_pnet(&match->net); match->prot_hook.id_match = match_fanout_group; match->max_num_members = args->max_num_members; + match->prot_hook.ignore_outgoing = type_flags & PACKET_FANOUT_FLAG_IGNORE_OUTGOING; list_add(&match->list, &fanout_list); } err = -EINVAL; @@@ -2294,7 -2293,8 +2294,7 @@@ static int tpacket_rcv(struct sk_buff * if (skb->ip_summed == CHECKSUM_PARTIAL) status |= TP_STATUS_CSUMNOTREADY; else if (skb->pkt_type != PACKET_OUTGOING && - (skb->ip_summed == CHECKSUM_COMPLETE || - skb_csum_unnecessary(skb))) + skb_csum_unnecessary(skb)) status |= TP_STATUS_CSUM_VALID;
if (snaplen > res) @@@ -3277,7 -3277,7 +3277,7 @@@ static int packet_bind_spkt(struct sock int addr_len) { struct sock *sk = sock->sk; - char name[sizeof(uaddr->sa_data) + 1]; + char name[sizeof(uaddr->sa_data_min) + 1];
/* * Check legality @@@ -3288,8 -3288,8 +3288,8 @@@ /* uaddr->sa_data comes from the userspace, it's not guaranteed to be * zero-terminated. */ - memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data)); - name[sizeof(uaddr->sa_data)] = 0; + memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data_min)); + name[sizeof(uaddr->sa_data_min)] = 0;
return packet_do_bind(sk, name, 0, pkt_sk(sk)->num); } @@@ -3520,7 -3520,8 +3520,7 @@@ static int packet_recvmsg(struct socke if (skb->ip_summed == CHECKSUM_PARTIAL) aux.tp_status |= TP_STATUS_CSUMNOTREADY; else if (skb->pkt_type != PACKET_OUTGOING && - (skb->ip_summed == CHECKSUM_COMPLETE || - skb_csum_unnecessary(skb))) + skb_csum_unnecessary(skb)) aux.tp_status |= TP_STATUS_CSUM_VALID;
aux.tp_len = origlen; @@@ -3560,11 -3561,11 +3560,11 @@@ static int packet_getname_spkt(struct s return -EOPNOTSUPP;
uaddr->sa_family = AF_PACKET; - memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data)); + memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data_min)); rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(sk), READ_ONCE(pkt_sk(sk)->ifindex)); if (dev) - strscpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data)); + strscpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data_min)); rcu_read_unlock();
return sizeof(*uaddr); diff --combined net/sched/act_gact.c index 54f1b13b2360,be267ffaaba7..904ab3d457ef --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@@ -18,7 -18,6 +18,7 @@@ #include <net/pkt_cls.h> #include <linux/tc_act/tc_gact.h> #include <net/tc_act/tc_gact.h> +#include <net/tc_wrapper.h>
static struct tc_action_ops act_gact_ops;
@@@ -26,7 -25,7 +26,7 @@@ static int gact_net_rand(struct tcf_gact *gact) { smp_rmb(); /* coupled with smp_wmb() in tcf_gact_init() */ - if (prandom_u32_max(gact->tcfg_pval)) + if (get_random_u32_below(gact->tcfg_pval)) return gact->tcf_action; return gact->tcfg_paction; } @@@ -146,9 -145,8 +146,9 @@@ release_idr return err; }
-static int tcf_gact_act(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +TC_INDIRECT_SCOPE int tcf_gact_act(struct sk_buff *skb, + const struct tc_action *a, + struct tcf_result *res) { struct tcf_gact *gact = to_gact(a); int action = READ_ONCE(gact->tcf_action); diff --combined net/sched/act_sample.c index 98dea08c1764,4194480746b0..f7416b5598e0 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@@ -20,7 -20,6 +20,7 @@@ #include <net/tc_act/tc_sample.h> #include <net/psample.h> #include <net/pkt_cls.h> +#include <net/tc_wrapper.h>
#include <linux/if_arp.h>
@@@ -154,9 -153,8 +154,9 @@@ static bool tcf_sample_dev_ok_push(stru } }
-static int tcf_sample_act(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +TC_INDIRECT_SCOPE int tcf_sample_act(struct sk_buff *skb, + const struct tc_action *a, + struct tcf_result *res) { struct tcf_sample *s = to_sample(a); struct psample_group *psample_group; @@@ -170,7 -168,7 +170,7 @@@ psample_group = rcu_dereference_bh(s->psample_group);
/* randomly sample packets according to rate */ - if (psample_group && (prandom_u32_max(s->rate) == 0)) { + if (psample_group && (get_random_u32_below(s->rate) == 0)) { if (!skb_at_tc_ingress(skb)) { md.in_ifindex = skb->skb_iif; md.out_ifindex = skb->dev->ifindex; diff --combined net/sctp/socket.c index 5acbdf0d38f3,cfe72085fdc4..84021a6c4f9d --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@@ -5098,17 -5098,13 +5098,17 @@@ static void sctp_destroy_sock(struct so }
/* Triggered when there are no references on the socket anymore */ -static void sctp_destruct_sock(struct sock *sk) +static void sctp_destruct_common(struct sock *sk) { struct sctp_sock *sp = sctp_sk(sk);
/* Free up the HMAC transform. */ crypto_free_shash(sp->hmac); +}
+static void sctp_destruct_sock(struct sock *sk) +{ + sctp_destruct_common(sk); inet_sock_destruct(sk); }
@@@ -5315,14 -5311,14 +5315,14 @@@ EXPORT_SYMBOL_GPL(sctp_for_each_endpoin
int sctp_transport_lookup_process(sctp_callback_t cb, struct net *net, const union sctp_addr *laddr, - const union sctp_addr *paddr, void *p) + const union sctp_addr *paddr, void *p, int dif) { struct sctp_transport *transport; struct sctp_endpoint *ep; int err = -ENOENT;
rcu_read_lock(); - transport = sctp_addrs_lookup_transport(net, laddr, paddr); + transport = sctp_addrs_lookup_transport(net, laddr, paddr, dif, dif); if (!transport) { rcu_read_unlock(); return err; @@@ -8323,7 -8319,7 +8323,7 @@@ static int sctp_get_port_local(struct s
inet_get_local_port_range(net, &low, &high); remaining = (high - low) + 1; - rover = prandom_u32_max(remaining) + low; + rover = get_random_u32_below(remaining) + low;
do { rover++; @@@ -8398,7 -8394,6 +8398,7 @@@ pp_found * in an endpoint. */ sk_for_each_bound(sk2, &pp->owner) { + int bound_dev_if2 = READ_ONCE(sk2->sk_bound_dev_if); struct sctp_sock *sp2 = sctp_sk(sk2); struct sctp_endpoint *ep2 = sp2->ep;
@@@ -8409,9 -8404,7 +8409,9 @@@ uid_eq(uid, sock_i_uid(sk2)))) continue;
- if (sctp_bind_addr_conflict(&ep2->base.bind_addr, + if ((!sk->sk_bound_dev_if || !bound_dev_if2 || + sk->sk_bound_dev_if == bound_dev_if2) && + sctp_bind_addr_conflict(&ep2->base.bind_addr, addr, sp2, sp)) { ret = 1; goto fail_unlock; @@@ -9434,7 -9427,7 +9434,7 @@@ void sctp_copy_sock(struct sock *newsk sctp_sk(newsk)->reuse = sp->reuse;
newsk->sk_shutdown = sk->sk_shutdown; - newsk->sk_destruct = sctp_destruct_sock; + newsk->sk_destruct = sk->sk_destruct; newsk->sk_family = sk->sk_family; newsk->sk_protocol = IPPROTO_SCTP; newsk->sk_backlog_rcv = sk->sk_prot->backlog_rcv; @@@ -9669,20 -9662,11 +9669,20 @@@ struct proto sctp_prot =
#if IS_ENABLED(CONFIG_IPV6)
-#include <net/transp_v6.h> -static void sctp_v6_destroy_sock(struct sock *sk) +static void sctp_v6_destruct_sock(struct sock *sk) +{ + sctp_destruct_common(sk); + inet6_sock_destruct(sk); +} + +static int sctp_v6_init_sock(struct sock *sk) { - sctp_destroy_sock(sk); - inet6_destroy_sock(sk); + int ret = sctp_init_sock(sk); + + if (!ret) + sk->sk_destruct = sctp_v6_destruct_sock; + + return ret; }
struct proto sctpv6_prot = { @@@ -9692,8 -9676,8 +9692,8 @@@ .disconnect = sctp_disconnect, .accept = sctp_accept, .ioctl = sctp_ioctl, - .init = sctp_init_sock, - .destroy = sctp_v6_destroy_sock, + .init = sctp_v6_init_sock, + .destroy = sctp_destroy_sock, .shutdown = sctp_shutdown, .setsockopt = sctp_setsockopt, .getsockopt = sctp_getsockopt, diff --combined net/sunrpc/xprtsock.c index b3ab6d9d752e,2e4987dcba29..c0506d0d7478 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@@ -364,7 -364,7 +364,7 @@@ static ssize_ xs_read_kvec(struct socket *sock, struct msghdr *msg, int flags, struct kvec *kvec, size_t count, size_t seek) { - iov_iter_kvec(&msg->msg_iter, READ, kvec, 1, count); + iov_iter_kvec(&msg->msg_iter, ITER_DEST, kvec, 1, count); return xs_sock_recvmsg(sock, msg, flags, seek); }
@@@ -373,7 -373,7 +373,7 @@@ xs_read_bvec(struct socket *sock, struc struct bio_vec *bvec, unsigned long nr, size_t count, size_t seek) { - iov_iter_bvec(&msg->msg_iter, READ, bvec, nr, count); + iov_iter_bvec(&msg->msg_iter, ITER_DEST, bvec, nr, count); return xs_sock_recvmsg(sock, msg, flags, seek); }
@@@ -381,7 -381,7 +381,7 @@@ static ssize_ xs_read_discard(struct socket *sock, struct msghdr *msg, int flags, size_t count) { - iov_iter_discard(&msg->msg_iter, READ, count); + iov_iter_discard(&msg->msg_iter, ITER_DEST, count); return sock_recvmsg(sock, msg, flags); }
@@@ -1619,7 -1619,7 +1619,7 @@@ static int xs_get_random_port(void if (max < min) return -EADDRINUSE; range = max - min + 1; - rand = prandom_u32_max(range); + rand = get_random_u32_below(range); return rand + min; }
diff --combined net/xfrm/xfrm_state.c index cc1d0ea42672,d63a3644ee1a..89c731f4f0c7 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@@ -84,25 -84,6 +84,25 @@@ static unsigned int xfrm_seq_hash(struc return __xfrm_seq_hash(seq, net->xfrm.state_hmask); }
+#define XFRM_STATE_INSERT(by, _n, _h, _type) \ + { \ + struct xfrm_state *_x = NULL; \ + \ + if (_type != XFRM_DEV_OFFLOAD_PACKET) { \ + hlist_for_each_entry_rcu(_x, _h, by) { \ + if (_x->xso.type == XFRM_DEV_OFFLOAD_PACKET) \ + continue; \ + break; \ + } \ + } \ + \ + if (!_x || _x->xso.type == XFRM_DEV_OFFLOAD_PACKET) \ + /* SAD is empty or consist from HW SAs only */ \ + hlist_add_head_rcu(_n, _h); \ + else \ + hlist_add_before_rcu(_n, &_x->by); \ + } + static void xfrm_hash_transfer(struct hlist_head *list, struct hlist_head *ndsttable, struct hlist_head *nsrctable, @@@ -119,25 -100,23 +119,25 @@@ h = __xfrm_dst_hash(&x->id.daddr, &x->props.saddr, x->props.reqid, x->props.family, nhashmask); - hlist_add_head_rcu(&x->bydst, ndsttable + h); + XFRM_STATE_INSERT(bydst, &x->bydst, ndsttable + h, x->xso.type);
h = __xfrm_src_hash(&x->id.daddr, &x->props.saddr, x->props.family, nhashmask); - hlist_add_head_rcu(&x->bysrc, nsrctable + h); + XFRM_STATE_INSERT(bysrc, &x->bysrc, nsrctable + h, x->xso.type);
if (x->id.spi) { h = __xfrm_spi_hash(&x->id.daddr, x->id.spi, x->id.proto, x->props.family, nhashmask); - hlist_add_head_rcu(&x->byspi, nspitable + h); + XFRM_STATE_INSERT(byspi, &x->byspi, nspitable + h, + x->xso.type); }
if (x->km.seq) { h = __xfrm_seq_hash(x->km.seq, nhashmask); - hlist_add_head_rcu(&x->byseq, nseqtable + h); + XFRM_STATE_INSERT(byseq, &x->byseq, nseqtable + h, + x->xso.type); } } } @@@ -570,8 -549,6 +570,8 @@@ static enum hrtimer_restart xfrm_timer_ int err = 0;
spin_lock(&x->lock); + xfrm_dev_state_update_curlft(x); + if (x->km.state == XFRM_STATE_DEAD) goto out; if (x->km.state == XFRM_STATE_EXPIRED) @@@ -974,49 -951,6 +974,49 @@@ xfrm_init_tempstate(struct xfrm_state * x->props.family = tmpl->encap_family; }
+static struct xfrm_state *__xfrm_state_lookup_all(struct net *net, u32 mark, + const xfrm_address_t *daddr, + __be32 spi, u8 proto, + unsigned short family, + struct xfrm_dev_offload *xdo) +{ + unsigned int h = xfrm_spi_hash(net, daddr, spi, proto, family); + struct xfrm_state *x; + + hlist_for_each_entry_rcu(x, net->xfrm.state_byspi + h, byspi) { +#ifdef CONFIG_XFRM_OFFLOAD + if (xdo->type == XFRM_DEV_OFFLOAD_PACKET) { + if (x->xso.type != XFRM_DEV_OFFLOAD_PACKET) + /* HW states are in the head of list, there is + * no need to iterate further. + */ + break; + + /* Packet offload: both policy and SA should + * have same device. + */ + if (xdo->dev != x->xso.dev) + continue; + } else if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET) + /* Skip HW policy for SW lookups */ + continue; +#endif + if (x->props.family != family || + x->id.spi != spi || + x->id.proto != proto || + !xfrm_addr_equal(&x->id.daddr, daddr, family)) + continue; + + if ((mark & x->mark.m) != x->mark.v) + continue; + if (!xfrm_state_hold_rcu(x)) + continue; + return x; + } + + return NULL; +} + static struct xfrm_state *__xfrm_state_lookup(struct net *net, u32 mark, const xfrm_address_t *daddr, __be32 spi, u8 proto, @@@ -1158,23 -1092,6 +1158,23 @@@ xfrm_state_find(const xfrm_address_t *d rcu_read_lock(); h = xfrm_dst_hash(net, daddr, saddr, tmpl->reqid, encap_family); hlist_for_each_entry_rcu(x, net->xfrm.state_bydst + h, bydst) { +#ifdef CONFIG_XFRM_OFFLOAD + if (pol->xdo.type == XFRM_DEV_OFFLOAD_PACKET) { + if (x->xso.type != XFRM_DEV_OFFLOAD_PACKET) + /* HW states are in the head of list, there is + * no need to iterate further. + */ + break; + + /* Packet offload: both policy and SA should + * have same device. + */ + if (pol->xdo.dev != x->xso.dev) + continue; + } else if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET) + /* Skip HW policy for SW lookups */ + continue; +#endif if (x->props.family == encap_family && x->props.reqid == tmpl->reqid && (mark & x->mark.m) == x->mark.v && @@@ -1192,23 -1109,6 +1192,23 @@@
h_wildcard = xfrm_dst_hash(net, daddr, &saddr_wildcard, tmpl->reqid, encap_family); hlist_for_each_entry_rcu(x, net->xfrm.state_bydst + h_wildcard, bydst) { +#ifdef CONFIG_XFRM_OFFLOAD + if (pol->xdo.type == XFRM_DEV_OFFLOAD_PACKET) { + if (x->xso.type != XFRM_DEV_OFFLOAD_PACKET) + /* HW states are in the head of list, there is + * no need to iterate further. + */ + break; + + /* Packet offload: both policy and SA should + * have same device. + */ + if (pol->xdo.dev != x->xso.dev) + continue; + } else if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET) + /* Skip HW policy for SW lookups */ + continue; +#endif if (x->props.family == encap_family && x->props.reqid == tmpl->reqid && (mark & x->mark.m) == x->mark.v && @@@ -1226,10 -1126,8 +1226,10 @@@ found x = best; if (!x && !error && !acquire_in_progress) { if (tmpl->id.spi && - (x0 = __xfrm_state_lookup(net, mark, daddr, tmpl->id.spi, - tmpl->id.proto, encap_family)) != NULL) { + (x0 = __xfrm_state_lookup_all(net, mark, daddr, + tmpl->id.spi, tmpl->id.proto, + encap_family, + &pol->xdo)) != NULL) { to_put = x0; error = -EEXIST; goto out; @@@ -1263,53 -1161,21 +1263,53 @@@ x = NULL; goto out; } - +#ifdef CONFIG_XFRM_OFFLOAD + if (pol->xdo.type == XFRM_DEV_OFFLOAD_PACKET) { + struct xfrm_dev_offload *xdo = &pol->xdo; + struct xfrm_dev_offload *xso = &x->xso; + + xso->type = XFRM_DEV_OFFLOAD_PACKET; + xso->dir = xdo->dir; + xso->dev = xdo->dev; + xso->real_dev = xdo->real_dev; + netdev_tracker_alloc(xso->dev, &xso->dev_tracker, + GFP_ATOMIC); + error = xso->dev->xfrmdev_ops->xdo_dev_state_add(x); + if (error) { + xso->dir = 0; + netdev_put(xso->dev, &xso->dev_tracker); + xso->dev = NULL; + xso->real_dev = NULL; + xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED; + x->km.state = XFRM_STATE_DEAD; + to_put = x; + x = NULL; + goto out; + } + } +#endif if (km_query(x, tmpl, pol) == 0) { spin_lock_bh(&net->xfrm.xfrm_state_lock); x->km.state = XFRM_STATE_ACQ; list_add(&x->km.all, &net->xfrm.state_all); - hlist_add_head_rcu(&x->bydst, net->xfrm.state_bydst + h); + XFRM_STATE_INSERT(bydst, &x->bydst, + net->xfrm.state_bydst + h, + x->xso.type); h = xfrm_src_hash(net, daddr, saddr, encap_family); - hlist_add_head_rcu(&x->bysrc, net->xfrm.state_bysrc + h); + XFRM_STATE_INSERT(bysrc, &x->bysrc, + net->xfrm.state_bysrc + h, + x->xso.type); if (x->id.spi) { h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, encap_family); - hlist_add_head_rcu(&x->byspi, net->xfrm.state_byspi + h); + XFRM_STATE_INSERT(byspi, &x->byspi, + net->xfrm.state_byspi + h, + x->xso.type); } if (x->km.seq) { h = xfrm_seq_hash(net, x->km.seq); - hlist_add_head_rcu(&x->byseq, net->xfrm.state_byseq + h); + XFRM_STATE_INSERT(byseq, &x->byseq, + net->xfrm.state_byseq + h, + x->xso.type); } x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires; hrtimer_start(&x->mtimer, @@@ -1319,18 -1185,6 +1319,18 @@@ xfrm_hash_grow_check(net, x->bydst.next != NULL); spin_unlock_bh(&net->xfrm.xfrm_state_lock); } else { +#ifdef CONFIG_XFRM_OFFLOAD + struct xfrm_dev_offload *xso = &x->xso; + + if (xso->type == XFRM_DEV_OFFLOAD_PACKET) { + xso->dev->xfrmdev_ops->xdo_dev_state_delete(x); + xso->dir = 0; + netdev_put(xso->dev, &xso->dev_tracker); + xso->dev = NULL; + xso->real_dev = NULL; + xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED; + } +#endif x->km.state = XFRM_STATE_DEAD; to_put = x; x = NULL; @@@ -1426,26 -1280,22 +1426,26 @@@ static void __xfrm_state_insert(struct
h = xfrm_dst_hash(net, &x->id.daddr, &x->props.saddr, x->props.reqid, x->props.family); - hlist_add_head_rcu(&x->bydst, net->xfrm.state_bydst + h); + XFRM_STATE_INSERT(bydst, &x->bydst, net->xfrm.state_bydst + h, + x->xso.type);
h = xfrm_src_hash(net, &x->id.daddr, &x->props.saddr, x->props.family); - hlist_add_head_rcu(&x->bysrc, net->xfrm.state_bysrc + h); + XFRM_STATE_INSERT(bysrc, &x->bysrc, net->xfrm.state_bysrc + h, + x->xso.type);
if (x->id.spi) { h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, x->props.family);
- hlist_add_head_rcu(&x->byspi, net->xfrm.state_byspi + h); + XFRM_STATE_INSERT(byspi, &x->byspi, net->xfrm.state_byspi + h, + x->xso.type); }
if (x->km.seq) { h = xfrm_seq_hash(net, x->km.seq);
- hlist_add_head_rcu(&x->byseq, net->xfrm.state_byseq + h); + XFRM_STATE_INSERT(byseq, &x->byseq, net->xfrm.state_byseq + h, + x->xso.type); }
hrtimer_start(&x->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL_SOFT); @@@ -1559,11 -1409,9 +1559,11 @@@ static struct xfrm_state *__find_acq_co ktime_set(net->xfrm.sysctl_acq_expires, 0), HRTIMER_MODE_REL_SOFT); list_add(&x->km.all, &net->xfrm.state_all); - hlist_add_head_rcu(&x->bydst, net->xfrm.state_bydst + h); + XFRM_STATE_INSERT(bydst, &x->bydst, net->xfrm.state_bydst + h, + x->xso.type); h = xfrm_src_hash(net, daddr, saddr, family); - hlist_add_head_rcu(&x->bysrc, net->xfrm.state_bysrc + h); + XFRM_STATE_INSERT(bysrc, &x->bysrc, net->xfrm.state_bysrc + h, + x->xso.type);
net->xfrm.state_num++;
@@@ -1938,8 -1786,6 +1938,8 @@@ EXPORT_SYMBOL(xfrm_state_update)
int xfrm_state_check_expire(struct xfrm_state *x) { + xfrm_dev_state_update_curlft(x); + if (!x->curlft.use_time) x->curlft.use_time = ktime_get_real_seconds();
@@@ -2171,7 -2017,7 +2171,7 @@@ u32 xfrm_get_acqseq(void } EXPORT_SYMBOL(xfrm_get_acqseq);
-int verify_spi_info(u8 proto, u32 min, u32 max) +int verify_spi_info(u8 proto, u32 min, u32 max, struct netlink_ext_ack *extack) { switch (proto) { case IPPROTO_AH: @@@ -2180,28 -2026,22 +2180,28 @@@
case IPPROTO_COMP: /* IPCOMP spi is 16-bits. */ - if (max >= 0x10000) + if (max >= 0x10000) { + NL_SET_ERR_MSG(extack, "IPCOMP SPI must be <= 65535"); return -EINVAL; + } break;
default: + NL_SET_ERR_MSG(extack, "Invalid protocol, must be one of AH, ESP, IPCOMP"); return -EINVAL; }
- if (min > max) + if (min > max) { + NL_SET_ERR_MSG(extack, "Invalid SPI range: min > max"); return -EINVAL; + }
return 0; } EXPORT_SYMBOL(verify_spi_info);
-int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high) +int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high, + struct netlink_ext_ack *extack) { struct net *net = xs_net(x); unsigned int h; @@@ -2213,10 -2053,8 +2213,10 @@@ u32 mark = x->mark.v & x->mark.m;
spin_lock_bh(&x->lock); - if (x->km.state == XFRM_STATE_DEAD) + if (x->km.state == XFRM_STATE_DEAD) { + NL_SET_ERR_MSG(extack, "Target ACQUIRE is in DEAD state"); goto unlock; + }
err = 0; if (x->id.spi) @@@ -2227,7 -2065,6 +2227,7 @@@ if (minspi == maxspi) { x0 = xfrm_state_lookup(net, mark, &x->id.daddr, minspi, x->id.proto, x->props.family); if (x0) { + NL_SET_ERR_MSG(extack, "Requested SPI is already in use"); xfrm_state_put(x0); goto unlock; } @@@ -2235,7 -2072,7 +2235,7 @@@ } else { u32 spi = 0; for (h = 0; h < high-low+1; h++) { - spi = low + prandom_u32_max(high - low + 1); + spi = get_random_u32_inclusive(low, high); x0 = xfrm_state_lookup(net, mark, &x->id.daddr, htonl(spi), x->id.proto, x->props.family); if (x0 == NULL) { newspi = htonl(spi); @@@ -2248,13 -2085,10 +2248,13 @@@ spin_lock_bh(&net->xfrm.xfrm_state_lock); x->id.spi = newspi; h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, x->props.family); - hlist_add_head_rcu(&x->byspi, net->xfrm.state_byspi + h); + XFRM_STATE_INSERT(byspi, &x->byspi, net->xfrm.state_byspi + h, + x->xso.type); spin_unlock_bh(&net->xfrm.xfrm_state_lock);
err = 0; + } else { + NL_SET_ERR_MSG(extack, "No SPI available in the requested range"); }
unlock:
linux-merge@lists.open-mesh.org