summaryrefslogtreecommitdiffstats
path: root/arch/powerpc/kernel/syscall_64.c
blob: 79edba3ab3124a2df72c77c07f025b9cacf66a98 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
// SPDX-License-Identifier: GPL-2.0-or-later

#include <linux/err.h>
#include <asm/asm-prototypes.h>
#include <asm/book3s/64/kup-radix.h>
#include <asm/cputime.h>
#include <asm/hw_irq.h>
#include <asm/kprobes.h>
#include <asm/paca.h>
#include <asm/ptrace.h>
#include <asm/reg.h>
#include <asm/signal.h>
#include <asm/switch_to.h>
#include <asm/syscall.h>
#include <asm/time.h>
#include <asm/unistd.h>

typedef long (*syscall_fn)(long, long, long, long, long, long);

/* Has to run notrace because it is entered not completely "reconciled" */
notrace long system_call_exception(long r3, long r4, long r5,
				   long r6, long r7, long r8,
				   unsigned long r0, struct pt_regs *regs)
{
	syscall_fn f;

	if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG))
		BUG_ON(irq_soft_mask_return() != IRQS_ALL_DISABLED);

	trace_hardirqs_off(); /* finish reconciling */

	if (IS_ENABLED(CONFIG_PPC_BOOK3S))
		BUG_ON(!(regs->msr & MSR_RI));
	BUG_ON(!(regs->msr & MSR_PR));
	BUG_ON(!FULL_REGS(regs));
	BUG_ON(regs->softe != IRQS_ENABLED);

	kuap_check_amr();

	account_cpu_user_entry();

#ifdef CONFIG_PPC_SPLPAR
	if (IS_ENABLED(CONFIG_VIRT_CPU_ACCOUNTING_NATIVE) &&
	    firmware_has_feature(FW_FEATURE_SPLPAR)) {
		struct lppaca *lp = local_paca->lppaca_ptr;

		if (unlikely(local_paca->dtl_ridx != be64_to_cpu(lp->dtl_idx)))
			accumulate_stolen_time();
	}
#endif

	/*
	 * This is not required for the syscall exit path, but makes the
	 * stack frame look nicer. If this was initialised in the first stack
	 * frame, or if the unwinder was taught the first stack frame always
	 * returns to user with IRQS_ENABLED, this store could be avoided!
	 */
	regs->softe = IRQS_ENABLED;

	local_irq_enable();

	if (unlikely(current_thread_info()->flags & _TIF_SYSCALL_DOTRACE)) {
		/*
		 * We use the return value of do_syscall_trace_enter() as the
		 * syscall number. If the syscall was rejected for any reason
		 * do_syscall_trace_enter() returns an invalid syscall number
		 * and the test against NR_syscalls will fail and the return
		 * value to be used is in regs->gpr[3].
		 */
		r0 = do_syscall_trace_enter(regs);
		if (unlikely(r0 >= NR_syscalls))
			return regs->gpr[3];
		r3 = regs->gpr[3];
		r4 = regs->gpr[4];
		r5 = regs->gpr[5];
		r6 = regs->gpr[6];
		r7 = regs->gpr[7];
		r8 = regs->gpr[8];

	} else if (unlikely(r0 >= NR_syscalls)) {
		return -ENOSYS;
	}

	/* May be faster to do array_index_nospec? */
	barrier_nospec();

	if (unlikely(is_32bit_task())) {
		f = (void *)compat_sys_call_table[r0];

		r3 &= 0x00000000ffffffffULL;
		r4 &= 0x00000000ffffffffULL;
		r5 &= 0x00000000ffffffffULL;
		r6 &= 0x00000000ffffffffULL;
		r7 &= 0x00000000ffffffffULL;
		r8 &= 0x00000000ffffffffULL;

	} else {
		f = (void *)sys_call_table[r0];
	}

	return f(r3, r4, r5, r6, r7, r8);
}

/*
 * local irqs must be disabled. Returns false if the caller must re-enable
 * them, check for new work, and try again.
 */
static notrace inline bool prep_irq_for_enabled_exit(void)
{
	/* This must be done with RI=1 because tracing may touch vmaps */
	trace_hardirqs_on();

	/* This pattern matches prep_irq_for_idle */
	__hard_EE_RI_disable();
	if (unlikely(lazy_irq_pending_nocheck())) {
		/* Took an interrupt, may have more exit work to do. */
		__hard_RI_enable();
		trace_hardirqs_off();
		local_paca->irq_happened |= PACA_IRQ_HARD_DIS;

		return false;
	}
	local_paca->irq_happened = 0;
	irq_soft_mask_set(IRQS_ENABLED);

	return true;
}

/*
 * This should be called after a syscall returns, with r3 the return value
 * from the syscall. If this function returns non-zero, the system call
 * exit assembly should additionally load all GPR registers and CTR and XER
 * from the interrupt frame.
 *
 * The function graph tracer can not trace the return side of this function,
 * because RI=0 and soft mask state is "unreconciled", so it is marked notrace.
 */
notrace unsigned long syscall_exit_prepare(unsigned long r3,
					   struct pt_regs *regs)
{
	unsigned long *ti_flagsp = &current_thread_info()->flags;
	unsigned long ti_flags;
	unsigned long ret = 0;

	kuap_check_amr();

	regs->result = r3;

	/* Check whether the syscall is issued inside a restartable sequence */
	rseq_syscall(regs);

	ti_flags = *ti_flagsp;

	if (unlikely(r3 >= (unsigned long)-MAX_ERRNO)) {
		if (likely(!(ti_flags & (_TIF_NOERROR | _TIF_RESTOREALL)))) {
			r3 = -r3;
			regs->ccr |= 0x10000000; /* Set SO bit in CR */
		}
	}

	if (unlikely(ti_flags & _TIF_PERSYSCALL_MASK)) {
		if (ti_flags & _TIF_RESTOREALL)
			ret = _TIF_RESTOREALL;
		else
			regs->gpr[3] = r3;
		clear_bits(_TIF_PERSYSCALL_MASK, ti_flagsp);
	} else {
		regs->gpr[3] = r3;
	}

	if (unlikely(ti_flags & _TIF_SYSCALL_DOTRACE)) {
		do_syscall_trace_leave(regs);
		ret |= _TIF_RESTOREALL;
	}

again:
	local_irq_disable();
	ti_flags = READ_ONCE(*ti_flagsp);
	while (unlikely(ti_flags & (_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM))) {
		local_irq_enable();
		if (ti_flags & _TIF_NEED_RESCHED) {
			schedule();
		} else {
			/*
			 * SIGPENDING must restore signal handler function
			 * argument GPRs, and some non-volatiles (e.g., r1).
			 * Restore all for now. This could be made lighter.
			 */
			if (ti_flags & _TIF_SIGPENDING)
				ret |= _TIF_RESTOREALL;
			do_notify_resume(regs, ti_flags);
		}
		local_irq_disable();
		ti_flags = READ_ONCE(*ti_flagsp);
	}

	if (IS_ENABLED(CONFIG_PPC_BOOK3S) && IS_ENABLED(CONFIG_PPC_FPU)) {
		if (IS_ENABLED(CONFIG_PPC_TRANSACTIONAL_MEM) &&
				unlikely((ti_flags & _TIF_RESTORE_TM))) {
			restore_tm_state(regs);
		} else {
			unsigned long mathflags = MSR_FP;

			if (cpu_has_feature(CPU_FTR_VSX))
				mathflags |= MSR_VEC | MSR_VSX;
			else if (cpu_has_feature(CPU_FTR_ALTIVEC))
				mathflags |= MSR_VEC;

			if ((regs->msr & mathflags) != mathflags)
				restore_math(regs);
		}
	}

	if (unlikely(!prep_irq_for_enabled_exit())) {
		local_irq_enable();
		goto again;
	}

#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
	local_paca->tm_scratch = regs->msr;
#endif

	account_cpu_user_exit();

	return ret;
}

#ifdef CONFIG_PPC_BOOK3S /* BOOK3E not yet using this */
notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs, unsigned long msr)
{
#ifdef CONFIG_PPC_BOOK3E
	struct thread_struct *ts = &current->thread;
#endif
	unsigned long *ti_flagsp = &current_thread_info()->flags;
	unsigned long ti_flags;
	unsigned long flags;
	unsigned long ret = 0;

	if (IS_ENABLED(CONFIG_PPC_BOOK3S))
		BUG_ON(!(regs->msr & MSR_RI));
	BUG_ON(!(regs->msr & MSR_PR));
	BUG_ON(!FULL_REGS(regs));
	BUG_ON(regs->softe != IRQS_ENABLED);

	/*
	 * We don't need to restore AMR on the way back to userspace for KUAP.
	 * AMR can only have been unlocked if we interrupted the kernel.
	 */
	kuap_check_amr();

	local_irq_save(flags);

again:
	ti_flags = READ_ONCE(*ti_flagsp);
	while (unlikely(ti_flags & (_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM))) {
		local_irq_enable(); /* returning to user: may enable */
		if (ti_flags & _TIF_NEED_RESCHED) {
			schedule();
		} else {
			if (ti_flags & _TIF_SIGPENDING)
				ret |= _TIF_RESTOREALL;
			do_notify_resume(regs, ti_flags);
		}
		local_irq_disable();
		ti_flags = READ_ONCE(*ti_flagsp);
	}

	if (IS_ENABLED(CONFIG_PPC_BOOK3S) && IS_ENABLED(CONFIG_PPC_FPU)) {
		if (IS_ENABLED(CONFIG_PPC_TRANSACTIONAL_MEM) &&
				unlikely((ti_flags & _TIF_RESTORE_TM))) {
			restore_tm_state(regs);
		} else {
			unsigned long mathflags = MSR_FP;

			if (cpu_has_feature(CPU_FTR_VSX))
				mathflags |= MSR_VEC | MSR_VSX;
			else if (cpu_has_feature(CPU_FTR_ALTIVEC))
				mathflags |= MSR_VEC;

			if ((regs->msr & mathflags) != mathflags)
				restore_math(regs);
		}
	}

	if (unlikely(!prep_irq_for_enabled_exit())) {
		local_irq_enable();
		local_irq_disable();
		goto again;
	}

#ifdef CONFIG_PPC_BOOK3E
	if (unlikely(ts->debug.dbcr0 & DBCR0_IDM)) {
		/*
		 * Check to see if the dbcr0 register is set up to debug.
		 * Use the internal debug mode bit to do this.
		 */
		mtmsr(mfmsr() & ~MSR_DE);
		mtspr(SPRN_DBCR0, ts->debug.dbcr0);
		mtspr(SPRN_DBSR, -1);
	}
#endif

#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
	local_paca->tm_scratch = regs->msr;
#endif

	account_cpu_user_exit();

	return ret;
}

void unrecoverable_exception(struct pt_regs *regs);
void preempt_schedule_irq(void);

notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs, unsigned long msr)
{
	unsigned long *ti_flagsp = &current_thread_info()->flags;
	unsigned long flags;
	unsigned long ret = 0;
	unsigned long amr;

	if (IS_ENABLED(CONFIG_PPC_BOOK3S) && unlikely(!(regs->msr & MSR_RI)))
		unrecoverable_exception(regs);
	BUG_ON(regs->msr & MSR_PR);
	BUG_ON(!FULL_REGS(regs));

	amr = kuap_get_and_check_amr();

	if (unlikely(*ti_flagsp & _TIF_EMULATE_STACK_STORE)) {
		clear_bits(_TIF_EMULATE_STACK_STORE, ti_flagsp);
		ret = 1;
	}

	local_irq_save(flags);

	if (regs->softe == IRQS_ENABLED) {
		/* Returning to a kernel context with local irqs enabled. */
		WARN_ON_ONCE(!(regs->msr & MSR_EE));
again:
		if (IS_ENABLED(CONFIG_PREEMPT)) {
			/* Return to preemptible kernel context */
			if (unlikely(*ti_flagsp & _TIF_NEED_RESCHED)) {
				if (preempt_count() == 0)
					preempt_schedule_irq();
			}
		}

		if (unlikely(!prep_irq_for_enabled_exit())) {
			/*
			 * Can't local_irq_restore to replay if we were in
			 * interrupt context. Must replay directly.
			 */
			if (irqs_disabled_flags(flags)) {
				replay_soft_interrupts();
			} else {
				local_irq_restore(flags);
				local_irq_save(flags);
			}
			/* Took an interrupt, may have more exit work to do. */
			goto again;
		}
	} else {
		/* Returning to a kernel context with local irqs disabled. */
		__hard_EE_RI_disable();
		if (regs->msr & MSR_EE)
			local_paca->irq_happened &= ~PACA_IRQ_HARD_DIS;
	}


#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
	local_paca->tm_scratch = regs->msr;
#endif

	/*
	 * Don't want to mfspr(SPRN_AMR) here, because this comes after mtmsr,
	 * which would cause Read-After-Write stalls. Hence, we take the AMR
	 * value from the check above.
	 */
	kuap_restore_amr(regs, amr);

	return ret;
}
#endif