The following commit has been merged in the master branch:
commit 9ea925c806dbb8fee6797f59148daaf7f648832e
Merge: cb69d86550b3f47be50fa5751d31ebbdb71b18ee 35b603f8a78b0bd51566db277c4f7b56b3ff6bac
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Tue Sep 17 07:25:37 2024 +0200
Merge tag 'timers-core-2024-09-16' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull timer updates from Thomas Gleixner:
 "Core:
- Overhaul of posix-timers in preparation of removing the workaround for periodic timers which have signal delivery ignored.
- Remove the historical extra jiffie in msleep()
msleep() adds an extra jiffie to the timeout value to ensure minimal sleep time. The timer wheel has ensured minimal sleep time since the large rewrite to a non-cascading wheel, but the extra jiffie in msleep() remained unnoticed. Remove it (a small worked sketch of the old and new timeout arithmetic follows the shortlog below).
- Make the timer slack handling correct for realtime tasks.
The procfs interface is inconsistent and neither reflects reality nor conforms to the man page. Show the correct 0 slack for realtime tasks and enforce it at the core level instead of having inconsistent individual checks in various timer setup functions (a small userspace sketch of the visible effect follows the shortlog below).
- The usual set of updates and enhancements all over the place.
Drivers:
- Allow the ACPI PM timer to be turned off during suspend
- No new drivers
- The usual updates and enhancements in various drivers"
* tag 'timers-core-2024-09-16' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (43 commits)
  ntp: Make sure RTC is synchronized when time goes backwards
  treewide: Fix wrong singular form of jiffies in comments
  cpu: Use already existing usleep_range()
  timers: Rename next_expiry_recalc() to be unique
  platform/x86:intel/pmc: Fix comment for the pmc_core_acpi_pm_timer_suspend_resume function
  clocksource/drivers/jcore: Use request_percpu_irq()
  clocksource/drivers/cadence-ttc: Add missing clk_disable_unprepare in ttc_setup_clockevent
  clocksource/drivers/asm9260: Add missing clk_disable_unprepare in asm9260_timer_init
  clocksource/drivers/qcom: Add missing iounmap() on errors in msm_dt_timer_init()
  clocksource/drivers/ingenic: Use devm_clk_get_enabled() helpers
  platform/x86:intel/pmc: Enable the ACPI PM Timer to be turned off when suspended
  clocksource: acpi_pm: Add external callback for suspend/resume
  clocksource/drivers/arm_arch_timer: Using for_each_available_child_of_node_scoped()
  dt-bindings: timer: rockchip: Add rk3576 compatible
  timers: Annotate possible non critical data race of next_expiry
  timers: Remove historical extra jiffie for timeout in msleep()
  hrtimer: Use and report correct timerslack values for realtime tasks
  hrtimer: Annotate hrtimer_cpu_base_.*_expiry() for sparse.
  timers: Add sparse annotation for timer_sync_wait_running().
  signal: Replace BUG_ON()s
  ...
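To make the msleep() change above concrete: the old code slept for msecs_to_jiffies(m) + 1 ticks, the new code for msecs_to_jiffies(m) ticks, relying on the timer wheel's at-least-one-tick guarantee. The sketch below is a hypothetical userspace approximation, not kernel code; it assumes HZ is 1000 and models msecs_to_jiffies() as a simple round-up, so the exact numbers are illustrative only.

#include <stdio.h>

#define HZ 1000U	/* assumed tick rate; the real value is a kernel config choice */

/* Rough stand-in for the kernel's msecs_to_jiffies(): round up to full ticks. */
static unsigned long msecs_to_jiffies_approx(unsigned int msecs)
{
	unsigned int msecs_per_jiffy = 1000U / HZ;

	return (msecs + msecs_per_jiffy - 1) / msecs_per_jiffy;
}

int main(void)
{
	unsigned int samples[] = { 1, 10, 20, 100 };

	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		unsigned int m = samples[i];

		printf("msleep(%3u): old timeout %lu jiffies, new timeout %lu jiffies\n",
		       m, msecs_to_jiffies_approx(m) + 1, msecs_to_jiffies_approx(m));
	}
	return 0;
}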
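For the timer slack change, a hedged userspace sketch of what the procfs interface is expected to report after this pull: it prints /proc/self/timerslack_ns before and after switching the calling task to SCHED_FIFO, at which point 0 slack should be shown. Switching to a realtime policy needs CAP_SYS_NICE, and the pre-switch value depends on the task's inherited default, so treat this as an illustration rather than a test case.

#include <errno.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>

/* Print the current task's timer slack as reported by procfs. */
static void print_slack(const char *when)
{
	char buf[64];
	FILE *f = fopen("/proc/self/timerslack_ns", "r");

	if (!f) {
		perror("fopen /proc/self/timerslack_ns");
		return;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("%-12s timerslack_ns = %s", when, buf);
	fclose(f);
}

int main(void)
{
	struct sched_param sp = { .sched_priority = 1 };

	print_slack("SCHED_OTHER:");

	/* Switching to a realtime policy requires CAP_SYS_NICE. */
	if (sched_setscheduler(0, SCHED_FIFO, &sp)) {
		fprintf(stderr, "sched_setscheduler: %s\n", strerror(errno));
		return 1;
	}

	/* With this pull, realtime tasks are expected to report 0 here. */
	print_slack("SCHED_FIFO:");
	return 0;
}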
diff --combined fs/proc/base.c index 1ad51858528f7,afe573e8d8f1d..e7810f3bd522d --- a/fs/proc/base.c +++ b/fs/proc/base.c @@@ -85,7 -85,6 +85,7 @@@ #include <linux/elf.h> #include <linux/pid_namespace.h> #include <linux/user_namespace.h> +#include <linux/fs_parser.h> #include <linux/fs_struct.h> #include <linux/slab.h> #include <linux/sched/autogroup.h> @@@ -118,40 -117,6 +118,40 @@@ static u8 nlink_tid __ro_after_init; static u8 nlink_tgid __ro_after_init;
+enum proc_mem_force {
+	PROC_MEM_FORCE_ALWAYS,
+	PROC_MEM_FORCE_PTRACE,
+	PROC_MEM_FORCE_NEVER
+};
+
+static enum proc_mem_force proc_mem_force_override __ro_after_init =
+	IS_ENABLED(CONFIG_PROC_MEM_NO_FORCE) ? PROC_MEM_FORCE_NEVER :
+	IS_ENABLED(CONFIG_PROC_MEM_FORCE_PTRACE) ? PROC_MEM_FORCE_PTRACE :
+	PROC_MEM_FORCE_ALWAYS;
+
+static const struct constant_table proc_mem_force_table[] __initconst = {
+	{ "always", PROC_MEM_FORCE_ALWAYS },
+	{ "ptrace", PROC_MEM_FORCE_PTRACE },
+	{ "never", PROC_MEM_FORCE_NEVER },
+	{ }
+};
+
+static int __init early_proc_mem_force_override(char *buf)
+{
+	if (!buf)
+		return -EINVAL;
+
+	/*
+	 * lookup_constant() defaults to proc_mem_force_override to preserve
+	 * the initial Kconfig choice in case an invalid param gets passed.
+	 */
+	proc_mem_force_override = lookup_constant(proc_mem_force_table,
+						  buf, proc_mem_force_override);
+
+	return 0;
+}
+early_param("proc_mem.force_override", early_proc_mem_force_override);
+
 struct pid_entry {
 	const char *name;
 	unsigned int len;
@@@ -862,31 -827,12 +862,31 @@@ static int __mem_open(struct inode *ino
 static int mem_open(struct inode *inode, struct file *file)
 {
-	int ret = __mem_open(inode, file, PTRACE_MODE_ATTACH);
-
-	/* OK to pass negative loff_t, we can catch out-of-range */
-	file->f_mode |= FMODE_UNSIGNED_OFFSET;
+	if (WARN_ON_ONCE(!(file->f_op->fop_flags & FOP_UNSIGNED_OFFSET)))
+		return -EINVAL;
+	return __mem_open(inode, file, PTRACE_MODE_ATTACH);
+}

-	return ret;
+static bool proc_mem_foll_force(struct file *file, struct mm_struct *mm)
+{
+	struct task_struct *task;
+	bool ptrace_active = false;
+
+	switch (proc_mem_force_override) {
+	case PROC_MEM_FORCE_NEVER:
+		return false;
+	case PROC_MEM_FORCE_PTRACE:
+		task = get_proc_task(file_inode(file));
+		if (task) {
+			ptrace_active = READ_ONCE(task->ptrace) &&
+					READ_ONCE(task->mm) == mm &&
+					READ_ONCE(task->parent) == current;
+			put_task_struct(task);
+		}
+		return ptrace_active;
+	default:
+		return true;
+	}
 }
static ssize_t mem_rw(struct file *file, char __user *buf, @@@ -909,9 -855,7 +909,9 @@@ if (!mmget_not_zero(mm)) goto free;
-	flags = FOLL_FORCE | (write ? FOLL_WRITE : 0);
+	flags = write ? FOLL_WRITE : 0;
+	if (proc_mem_foll_force(file, mm))
+		flags |= FOLL_FORCE;
while (count > 0) { size_t this_len = min_t(size_t, count, PAGE_SIZE); @@@ -988,7 -932,6 +988,7 @@@ static const struct file_operations pro .write = mem_write, .open = mem_open, .release = mem_release, + .fop_flags = FOP_UNSIGNED_OFFSET, };
static int environ_open(struct inode *inode, struct file *file) @@@ -2333,8 -2276,8 +2333,8 @@@ proc_map_files_instantiate(struct dentr inode->i_op = &proc_map_files_link_inode_operations; inode->i_size = 64;
- d_set_d_op(dentry, &tid_map_files_dentry_operations); - return d_splice_alias(inode, dentry); + return proc_splice_unmountable(inode, dentry, + &tid_map_files_dentry_operations); }
static struct dentry *proc_map_files_lookup(struct inode *dir, @@@ -2513,13 -2456,13 +2513,13 @@@ static void *timers_start(struct seq_fi if (!tp->sighand) return ERR_PTR(-ESRCH);
- return seq_list_start(&tp->task->signal->posix_timers, *pos); + return seq_hlist_start(&tp->task->signal->posix_timers, *pos); }
static void *timers_next(struct seq_file *m, void *v, loff_t *pos) { struct timers_private *tp = m->private; - return seq_list_next(v, &tp->task->signal->posix_timers, pos); + return seq_hlist_next(v, &tp->task->signal->posix_timers, pos); }
static void timers_stop(struct seq_file *m, void *v) @@@ -2548,7 -2491,7 +2548,7 @@@ static int show_timer(struct seq_file * [SIGEV_THREAD] = "thread", };
- timer = list_entry((struct list_head *)v, struct k_itimer, list); + timer = hlist_entry((struct hlist_node *)v, struct k_itimer, list); notify = timer->it_sigev_notify;
seq_printf(m, "ID: %d\n", timer->it_id); @@@ -2626,10 -2569,11 +2626,11 @@@ static ssize_t timerslack_ns_write(stru }
 	task_lock(p);
-	if (slack_ns == 0)
-		p->timer_slack_ns = p->default_timer_slack_ns;
-	else
-		p->timer_slack_ns = slack_ns;
+	if (task_is_realtime(p))
+		slack_ns = 0;
+	else if (slack_ns == 0)
+		slack_ns = p->default_timer_slack_ns;
+	p->timer_slack_ns = slack_ns;
 	task_unlock(p);
out: @@@ -3927,12 -3871,12 +3928,12 @@@ static int proc_task_readdir(struct fil if (!dir_emit_dots(file, ctx)) return 0;
- /* f_version caches the tgid value that the last readdir call couldn't - * return. lseek aka telldir automagically resets f_version to 0. + /* We cache the tgid value that the last readdir call couldn't + * return and lseek resets it to 0. */ ns = proc_pid_ns(inode->i_sb); - tid = (int)file->f_version; - file->f_version = 0; + tid = (int)(intptr_t)file->private_data; + file->private_data = NULL; for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns); task; task = next_tid(task), ctx->pos++) { @@@ -3947,7 -3891,7 +3948,7 @@@ proc_task_instantiate, task, NULL)) { /* returning this tgid failed, save it as the first * pid for the next readir call */ - file->f_version = (u64)tid; + file->private_data = (void *)(intptr_t)tid; put_task_struct(task); break; } @@@ -3972,24 -3916,6 +3973,24 @@@ static int proc_task_getattr(struct mnt return 0; }
+/*
+ * proc_task_readdir() set @file->private_data to a positive integer
+ * value, so casting that to u64 is safe. generic_llseek_cookie() will
+ * set @cookie to 0, so casting to an int is safe. The WARN_ON_ONCE() is
+ * here to catch any unexpected change in behavior either in
+ * proc_task_readdir() or generic_llseek_cookie().
+ */
+static loff_t proc_dir_llseek(struct file *file, loff_t offset, int whence)
+{
+	u64 cookie = (u64)(intptr_t)file->private_data;
+	loff_t off;
+
+	off = generic_llseek_cookie(file, offset, whence, &cookie);
+	WARN_ON_ONCE(cookie > INT_MAX);
+	file->private_data = (void *)(intptr_t)cookie; /* serialized by f_pos_lock */
+	return off;
+}
+
 static const struct inode_operations proc_task_inode_operations = {
 	.lookup = proc_task_lookup,
 	.getattr = proc_task_getattr,
@@@ -4000,7 -3926,7 +4001,7 @@@
 static const struct file_operations proc_task_operations = {
 	.read = generic_read_dir,
 	.iterate_shared = proc_task_readdir,
-	.llseek = generic_file_llseek,
+	.llseek = proc_dir_llseek,
 };
void __init set_proc_pid_nlink(void) diff --combined fs/select.c index 1a4849e2afb97,ad171b7a5c11f..cae82e9e0dcc0 --- a/fs/select.c +++ b/fs/select.c @@@ -77,19 -77,16 +77,16 @@@ u64 select_estimate_accuracy(struct tim { u64 ret; struct timespec64 now; + u64 slack = current->timer_slack_ns;
- /* - * Realtime tasks get a slack of 0 for obvious reasons. - */ - - if (rt_task(current)) + if (slack == 0) return 0;
ktime_get_ts64(&now); now = timespec64_sub(*tv, now); ret = __estimate_accuracy(&now); - if (ret < current->timer_slack_ns) - return current->timer_slack_ns; + if (ret < slack) + return slack; return ret; }
@@@ -840,7 -837,7 +837,7 @@@ SYSCALL_DEFINE1(old_select, struct sel_ struct poll_list { struct poll_list *next; unsigned int len; - struct pollfd entries[]; + struct pollfd entries[] __counted_by(len); };
#define POLLFD_PER_PAGE ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd)) diff --combined kernel/cpu.c index 0c9c5dfc8dddb,5bfc2fdd7da07..d293d52a3e00e --- a/kernel/cpu.c +++ b/kernel/cpu.c @@@ -330,7 -330,7 +330,7 @@@ static bool cpuhp_wait_for_sync_state(u /* Poll for one millisecond */ arch_cpuhp_sync_state_poll(); } else { - usleep_range_state(USEC_PER_MSEC, 2 * USEC_PER_MSEC, TASK_UNINTERRUPTIBLE); + usleep_range(USEC_PER_MSEC, 2 * USEC_PER_MSEC); } sync = atomic_read(st); } @@@ -1808,7 -1808,6 +1808,7 @@@ static int __init parallel_bringup_pars } early_param("cpuhp.parallel", parallel_bringup_parse_param);
+#ifdef CONFIG_HOTPLUG_SMT static inline bool cpuhp_smt_aware(void) { return cpu_smt_max_threads > 1; @@@ -1818,21 -1817,6 +1818,21 @@@ static inline const struct cpumask *cpu { return cpu_primary_thread_mask; } +#else +static inline bool cpuhp_smt_aware(void) +{ + return false; +} +static inline const struct cpumask *cpuhp_get_primary_thread_mask(void) +{ + return cpu_none_mask; +} +#endif + +bool __weak arch_cpuhp_init_parallel_bringup(void) +{ + return true; +}
/* * On architectures which have enabled parallel bringup this invokes all BP @@@ -2705,7 -2689,9 +2705,7 @@@ int cpuhp_smt_disable(enum cpuhp_smt_co return ret; }
-/** - * Check if the core a CPU belongs to is online - */ +/* Check if the core a CPU belongs to is online */ #if !defined(topology_is_core_online) static inline bool topology_is_core_online(unsigned int cpu) { diff --combined kernel/time/hrtimer.c index 836157e09e25d,e834b2bd83df8..12eb40d6290ee --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@@ -1177,7 -1177,7 +1177,7 @@@ static inline ktime_t hrtimer_update_lo /* * CONFIG_TIME_LOW_RES indicates that the system has no way to return * granular time values. For relative timers we add hrtimer_resolution - * (i.e. one jiffie) to prevent short timeouts. + * (i.e. one jiffy) to prevent short timeouts. */ timer->is_rel = mode & HRTIMER_MODE_REL; if (timer->is_rel) @@@ -1351,11 -1351,13 +1351,13 @@@ static void hrtimer_cpu_base_init_expir }
static void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) + __acquires(&base->softirq_expiry_lock) { spin_lock(&base->softirq_expiry_lock); }
static void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) + __releases(&base->softirq_expiry_lock) { spin_unlock(&base->softirq_expiry_lock); } @@@ -1757,7 -1759,7 +1759,7 @@@ static void __hrtimer_run_queues(struc } }
-static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h) +static __latent_entropy void hrtimer_run_softirq(void) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); unsigned long flags; @@@ -2072,14 -2074,9 +2074,9 @@@ long hrtimer_nanosleep(ktime_t rqtp, co struct restart_block *restart; struct hrtimer_sleeper t; int ret = 0; - u64 slack; - - slack = current->timer_slack_ns; - if (rt_task(current)) - slack = 0;
hrtimer_init_sleeper_on_stack(&t, clockid, mode); - hrtimer_set_expires_range_ns(&t.timer, rqtp, slack); + hrtimer_set_expires_range_ns(&t.timer, rqtp, current->timer_slack_ns); ret = do_nanosleep(&t, mode); if (ret != -ERESTART_RESTARTBLOCK) goto out; @@@ -2249,7 -2246,7 +2246,7 @@@ void __init hrtimers_init(void /** * schedule_hrtimeout_range_clock - sleep until timeout * @expires: timeout value (ktime_t) - * @delta: slack in expires timeout (ktime_t) for SCHED_OTHER tasks + * @delta: slack in expires timeout (ktime_t) * @mode: timer mode * @clock_id: timer clock to be used */ @@@ -2276,13 -2273,6 +2273,6 @@@ schedule_hrtimeout_range_clock(ktime_t return -EINTR; }
- /* - * Override any slack passed by the user if under - * rt contraints. - */ - if (rt_task(current)) - delta = 0; - hrtimer_init_sleeper_on_stack(&t, clock_id, mode); hrtimer_set_expires_range_ns(&t.timer, *expires, delta); hrtimer_sleeper_start_expires(&t, mode); @@@ -2302,7 -2292,7 +2292,7 @@@ EXPORT_SYMBOL_GPL(schedule_hrtimeout_ra /** * schedule_hrtimeout_range - sleep until timeout * @expires: timeout value (ktime_t) - * @delta: slack in expires timeout (ktime_t) for SCHED_OTHER tasks + * @delta: slack in expires timeout (ktime_t) * @mode: timer mode * * Make the current task sleep until the given expiry time has diff --combined kernel/time/timer.c index 760bbeb1f331f,2b38f3035a3ee..0fc9d066a7be4 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@@ -365,7 -365,7 +365,7 @@@ static unsigned long round_jiffies_comm rem = j % HZ;
/* - * If the target jiffie is just after a whole second (which can happen + * If the target jiffy is just after a whole second (which can happen * due to delays of the timer irq, long irq off times etc etc) then * we should round down to the whole second, not up. Use 1/4th second * as cutoff for this rounding as an extreme upper bound for this. @@@ -672,7 -672,7 +672,7 @@@ static void enqueue_timer(struct timer_ * Set the next expiry time and kick the CPU so it * can reevaluate the wheel: */ - base->next_expiry = bucket_expiry; + WRITE_ONCE(base->next_expiry, bucket_expiry); base->timers_pending = true; base->next_expiry_recalc = false; trigger_dyntick_cpu(base, timer); @@@ -1561,6 -1561,8 +1561,8 @@@ static inline void timer_base_unlock_ex * the waiter to acquire the lock and make progress. */ static void timer_sync_wait_running(struct timer_base *base) + __releases(&base->lock) __releases(&base->expiry_lock) + __acquires(&base->expiry_lock) __acquires(&base->lock) { if (atomic_read(&base->timer_waiters)) { raw_spin_unlock_irq(&base->lock); @@@ -1898,7 -1900,7 +1900,7 @@@ static int next_pending_bucket(struct t * * Store next expiry time in base->next_expiry. */ - static void next_expiry_recalc(struct timer_base *base) + static void timer_recalc_next_expiry(struct timer_base *base) { unsigned long clk, next, adj; unsigned lvl, offset = 0; @@@ -1928,7 -1930,7 +1930,7 @@@ * bits are zero, we look at the next level as is. If not we * need to advance it by one because that's going to be the * next expiring bucket in that level. base->clk is the next - * expiring jiffie. So in case of: + * expiring jiffy. So in case of: * * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 * 0 0 0 0 0 0 @@@ -1964,7 -1966,7 +1966,7 @@@ clk += adj; }
- base->next_expiry = next; + WRITE_ONCE(base->next_expiry, next); base->next_expiry_recalc = false; base->timers_pending = !(next == base->clk + NEXT_TIMER_MAX_DELTA); } @@@ -1993,7 -1995,7 +1995,7 @@@ static u64 cmp_next_hrtimer_event(u64 b return basem;
/* - * Round up to the next jiffie. High resolution timers are + * Round up to the next jiffy. High resolution timers are * off, so the hrtimers are expired in the tick and we need to * make sure that this tick really expires the timer to avoid * a ping pong of the nohz stop code. @@@ -2007,7 -2009,7 +2009,7 @@@ static unsigned long next_timer_interru unsigned long basej) { if (base->next_expiry_recalc) - next_expiry_recalc(base); + timer_recalc_next_expiry(base);
/* * Move next_expiry for the empty base into the future to prevent an @@@ -2018,7 -2020,7 +2020,7 @@@ * easy comparable to find out which base holds the first pending timer. */ if (!base->timers_pending) - base->next_expiry = basej + NEXT_TIMER_MAX_DELTA; + WRITE_ONCE(base->next_expiry, basej + NEXT_TIMER_MAX_DELTA);
return base->next_expiry; } @@@ -2252,7 -2254,7 +2254,7 @@@ static inline u64 __get_next_timer_inte base_global, &tevt);
/* - * If the next event is only one jiffie ahead there is no need to call + * If the next event is only one jiffy ahead there is no need to call * timer migration hierarchy related functions. The value for the next * global timer in @tevt struct equals then KTIME_MAX. This is also * true, when the timer base is idle. @@@ -2411,7 -2413,7 +2413,7 @@@ static inline void __run_timers(struct * jiffies to avoid endless requeuing to current jiffies. */ base->clk++; - next_expiry_recalc(base); + timer_recalc_next_expiry(base);
while (levels--) expire_timers(base, heads + levels); @@@ -2440,7 -2442,7 +2442,7 @@@ static void run_timer_base(int index /* * This function runs timers and the timer-tq in bottom half context. */ -static __latent_entropy void run_timer_softirq(struct softirq_action *h) +static __latent_entropy void run_timer_softirq(void) { run_timer_base(BASE_LOCAL); if (IS_ENABLED(CONFIG_NO_HZ_COMMON)) { @@@ -2462,8 -2464,40 +2464,40 @@@ static void run_local_timers(void hrtimer_run_queues();
 	for (int i = 0; i < NR_BASES; i++, base++) {
-		/* Raise the softirq only if required. */
-		if (time_after_eq(jiffies, base->next_expiry) ||
+		/*
+		 * Raise the softirq only if required.
+		 *
+		 * timer_base::next_expiry can be written by a remote CPU while
+		 * holding the lock. If this write happens at the same time as
+		 * the lockless local read, sanity checker could complain about
+		 * data corruption.
+		 *
+		 * There are two possible situations where
+		 * timer_base::next_expiry is written by a remote CPU:
+		 *
+		 * 1. Remote CPU expires global timers of this CPU and updates
+		 *    timer_base::next_expiry of BASE_GLOBAL afterwards in
+		 *    next_timer_interrupt() or timer_recalc_next_expiry(). The
+		 *    worst outcome is a superfluous raise of the timer softirq
+		 *    when the not yet updated value is read.
+		 *
+		 * 2. A new first pinned timer is enqueued by a remote CPU
+		 *    and therefore timer_base::next_expiry of BASE_LOCAL is
+		 *    updated. When this update is missed, this isn't a
+		 *    problem, as an IPI is executed nevertheless when the CPU
+		 *    was idle before. When the CPU wasn't idle but the update
+		 *    is missed, then the timer would expire one jiffy late -
+		 *    bad luck.
+		 *
+		 * Those unlikely corner cases where the worst outcome is only a
+		 * one jiffy delay or a superfluous raise of the softirq are
+		 * not as expensive as doing the check always while holding
+		 * the lock.
+		 *
+		 * Possible remote writers are using WRITE_ONCE(). The local
+		 * reader therefore uses READ_ONCE().
+		 */
+		if (time_after_eq(jiffies, READ_ONCE(base->next_expiry)) ||
 		    (i == BASE_DEF && tmigr_requires_handle_remote())) {
 			raise_softirq(TIMER_SOFTIRQ);
 			return;
@@@ -2730,7 -2764,7 +2764,7 @@@ void __init init_timers(void
  */
 void msleep(unsigned int msecs)
 {
-	unsigned long timeout = msecs_to_jiffies(msecs) + 1;
+	unsigned long timeout = msecs_to_jiffies(msecs);
 	while (timeout)
 		timeout = schedule_timeout_uninterruptible(timeout);
@@@ -2744,7 -2778,7 +2778,7 @@@ EXPORT_SYMBOL(msleep
  */
 unsigned long msleep_interruptible(unsigned int msecs)
 {
-	unsigned long timeout = msecs_to_jiffies(msecs) + 1;
+	unsigned long timeout = msecs_to_jiffies(msecs);

 	while (timeout && !signal_pending(current))
 		timeout = schedule_timeout_interruptible(timeout);
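One more note on a change visible in the fs/proc/base.c hunks above, which comes in via the other merge parent rather than the timer pull: writes through /proc/<pid>/mem now pass FOLL_FORCE only when permitted by the new proc_mem.force_override= boot parameter ("always", "ptrace" or "never"; an invalid value falls back to the Kconfig default). Below is a hypothetical userspace sketch of the observable effect, assuming a kernel booted with proc_mem.force_override=never: a write through /proc/self/mem into a read-only private mapping, which historically succeeded because of FOLL_FORCE, is expected to fail.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <unistd.h>

int main(void)
{
	/* A private, read-only anonymous page in our own address space. */
	char *page = mmap(NULL, 4096, PROT_READ,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	int fd = open("/proc/self/mem", O_RDWR);
	ssize_t ret;

	if (page == MAP_FAILED || fd < 0) {
		perror("setup");
		return 1;
	}

	/*
	 * With FOLL_FORCE this write historically succeeded despite PROT_READ;
	 * with proc_mem.force_override=never it is expected to fail.
	 */
	ret = pwrite(fd, "x", 1, (off_t)(uintptr_t)page);
	if (ret < 0)
		perror("pwrite /proc/self/mem");
	else
		printf("wrote %zd byte(s) into a read-only mapping\n", ret);

	close(fd);
	return 0;
}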
linux-merge@lists.open-mesh.org