path: root/sound/core/pcm_compat.c
blob: 2b8aab6fd6cd312365a658c2aefd074a7768df64
/*
 *   32bit -> 64bit ioctl wrapper for PCM API
 *   Copyright (c) by Takashi Iwai <tiwai@suse.de>
 *
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program; if not, write to the Free Software
 *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
 *
 */

/* This file is included from pcm_native.c */

#include <linux/compat.h>

static int snd_pcm_ioctl_delay_compat(struct snd_pcm_substream *substream,
				      s32 __user *src)
{
	snd_pcm_sframes_t delay;
	mm_segment_t fs;
	int err;

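	/*
	 * Added note (assumption, not in the original source):
	 * snd_pcm_delay() apparently stores its result through a __user
	 * pointer, which is why the set_fs() dance via snd_enter_user()
	 * is needed to let it write into this kernel-stack variable.
	 */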
	fs = snd_enter_user();
	err = snd_pcm_delay(substream, &delay);
	snd_leave_user(fs);
	if (err < 0)
		return err;
	if (put_user(delay, src))
		return -EFAULT;
	return err;
}

static int snd_pcm_ioctl_rewind_compat(struct snd_pcm_substream *substream,
				       u32 __user *src)
{
	snd_pcm_uframes_t frames;
	int err;

	if (get_user(frames, src))
		return -EFAULT;
	if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK)
		err = snd_pcm_playback_rewind(substream, frames);
	else
		err = snd_pcm_capture_rewind(substream, frames);
	if (put_user(err, src))
		return -EFAULT;
	return err < 0 ? err : 0;
}

static int snd_pcm_ioctl_forward_compat(struct snd_pcm_substream *substream,
				       u32 __user *src)
{
	snd_pcm_uframes_t frames;
	int err;

	if (get_user(frames, src))
		return -EFAULT;
	if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK)
		err = snd_pcm_playback_forward(substream, frames);
	else
		err = snd_pcm_capture_forward(substream, frames);
	if (put_user(err, src))
		return -EFAULT;
	return err < 0 ? err : 0;
}

struct snd_pcm_hw_params32 {
	u32 flags;
	struct snd_mask masks[SNDRV_PCM_HW_PARAM_LAST_MASK - SNDRV_PCM_HW_PARAM_FIRST_MASK + 1]; /* this must be identical */
	struct snd_mask mres[5];	/* reserved masks */
	struct snd_interval intervals[SNDRV_PCM_HW_PARAM_LAST_INTERVAL - SNDRV_PCM_HW_PARAM_FIRST_INTERVAL + 1];
	struct snd_interval ires[9];	/* reserved intervals */
	u32 rmask;
	u32 cmask;
	u32 info;
	u32 msbits;
	u32 rate_num;
	u32 rate_den;
	u32 fifo_size;
	unsigned char reserved[64];
};
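
/*
 * Added note (grounded in the "copy all" comment in the hw_params
 * handler below): this layout is assumed to match the native
 * struct snd_pcm_hw_params field for field; only fifo_size differs,
 * being u32 here but snd_pcm_uframes_t (unsigned long) natively, so
 * the handler bulk-copies the struct and fixes up fifo_size separately.
 */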

struct snd_pcm_sw_params32 {
	s32 tstamp_mode;
	u32 period_step;
	u32 sleep_min;
	u32 avail_min;
	u32 xfer_align;
	u32 start_threshold;
	u32 stop_threshold;
	u32 silence_threshold;
	u32 silence_size;
	u32 boundary;
	unsigned char reserved[64];
};

/* recalculate the boundary within 32bit */
static snd_pcm_uframes_t recalculate_boundary(struct snd_pcm_runtime *runtime)
{
	snd_pcm_uframes_t boundary;

	if (! runtime->buffer_size)
		return 0;
	boundary = runtime->buffer_size;
	while (boundary * 2 <= 0x7fffffffUL - runtime->buffer_size)
		boundary *= 2;
	return boundary;
}
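
/*
 * Worked example (illustrative): with buffer_size = 1024 frames, the
 * loop doubles 1024 while another doubling still fits below 0x7fffffff,
 * ending at 0x40000000 -- the largest power-of-two multiple of the
 * buffer size representable in the 32bit position counters.
 */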

static int snd_pcm_ioctl_sw_params_compat(struct snd_pcm_substream *substream,
					  struct snd_pcm_sw_params32 __user *src)
{
	struct snd_pcm_sw_params params;
	snd_pcm_uframes_t boundary;
	int err;

	memset(&params, 0, sizeof(params));
	if (get_user(params.tstamp_mode, &src->tstamp_mode) ||
	    get_user(params.period_step, &src->period_step) ||
	    get_user(params.sleep_min, &src->sleep_min) ||
	    get_user(params.avail_min, &src->avail_min) ||
	    get_user(params.xfer_align, &src->xfer_align) ||
	    get_user(params.start_threshold, &src->start_threshold) ||
	    get_user(params.stop_threshold, &src->stop_threshold) ||
	    get_user(params.silence_threshold, &src->silence_threshold) ||
	    get_user(params.silence_size, &src->silence_size))
		return -EFAULT;
	/*
	 * Check the silence_size parameter.  The native boundary is 64bit
	 * wide, so silence_size has to be compared against the 32bit
	 * boundary that the 32bit application sees; if it reaches that
	 * boundary, map it to the native one.
	 */
	boundary = recalculate_boundary(substream->runtime);
	if (boundary && params.silence_size >= boundary)
		params.silence_size = substream->runtime->boundary;
	err = snd_pcm_sw_params(substream, &params);
	if (err < 0)
		return err;
	if (boundary && put_user(boundary, &src->boundary))
		return -EFAULT;
	return err;
}

struct snd_pcm_channel_info32 {
	u32 channel;
	u32 offset;
	u32 first;
	u32 step;
};

static int snd_pcm_ioctl_channel_info_compat(struct snd_pcm_substream *substream,
					     struct snd_pcm_channel_info32 __user *src)
{
	struct snd_pcm_channel_info info;
	int err;

	if (get_user(info.channel, &src->channel) ||
	    get_user(info.offset, &src->offset) ||
	    get_user(info.first, &src->first) ||
	    get_user(info.step, &src->step))
		return -EFAULT;
	err = snd_pcm_channel_info(substream, &info);
	if (err < 0)
		return err;
	if (put_user(info.channel, &src->channel) ||
	    put_user(info.offset, &src->offset) ||
	    put_user(info.first, &src->first) ||
	    put_user(info.step, &src->step))
		return -EFAULT;
	return err;
}

struct snd_pcm_status32 {
	s32 state;
	struct compat_timespec trigger_tstamp;
	struct compat_timespec tstamp;
	u32 appl_ptr;
	u32 hw_ptr;
	s32 delay;
	u32 avail;
	u32 avail_max;
	u32 overrange;
	s32 suspended_state;
	unsigned char reserved[60];
} __attribute__((packed));


static int snd_pcm_status_user_compat(struct snd_pcm_substream *substream,
				      struct snd_pcm_status32 __user *src)
{
	struct snd_pcm_status status;
	int err;

	err = snd_pcm_status(substream, &status);
	if (err < 0)
		return err;

	if (put_user(status.state, &src->state) ||
	    put_user(status.trigger_tstamp.tv_sec, &src->trigger_tstamp.tv_sec) ||
	    put_user(status.trigger_tstamp.tv_nsec, &src->trigger_tstamp.tv_nsec) ||
	    put_user(status.tstamp.tv_sec, &src->tstamp.tv_sec) ||
	    put_user(status.tstamp.tv_nsec, &src->tstamp.tv_nsec) ||
	    put_user(status.appl_ptr, &src->appl_ptr) ||
	    put_user(status.hw_ptr, &src->hw_ptr) ||
	    put_user(status.delay, &src->delay) ||
	    put_user(status.avail, &src->avail) ||
	    put_user(status.avail_max, &src->avail_max) ||
	    put_user(status.overrange, &src->overrange) ||
	    put_user(status.suspended_state, &src->suspended_state))
		return -EFAULT;

	return err;
}

/* both for HW_PARAMS and HW_REFINE */
static int snd_pcm_ioctl_hw_params_compat(struct snd_pcm_substream *substream,
					  int refine, 
					  struct snd_pcm_hw_params32 __user *data32)
{
	struct snd_pcm_hw_params *data;
	struct snd_pcm_runtime *runtime;
	int err;

	if (! (runtime = substream->runtime))
		return -ENOTTY;

	data = kmalloc(sizeof(*data), GFP_KERNEL);
	if (data == NULL)
		return -ENOMEM;
	/* only fifo_size is different, so just copy all */
	if (copy_from_user(data, data32, sizeof(*data32))) {
		err = -EFAULT;
		goto error;
	}
	if (refine)
		err = snd_pcm_hw_refine(substream, data);
	else
		err = snd_pcm_hw_params(substream, data);
	if (err < 0)
		goto error;
	if (copy_to_user(data32, data, sizeof(*data32)) ||
	    put_user(data->fifo_size, &data32->fifo_size)) {
		err = -EFAULT;
		goto error;
	}

	if (! refine) {
		unsigned int new_boundary = recalculate_boundary(runtime);
		if (new_boundary)
			runtime->boundary = new_boundary;
	}
 error:
	kfree(data);
	return err;
}


/* snd_xferi needs remapping of the buffer pointer */
struct snd_xferi32 {
	s32 result;
	u32 buf;
	u32 frames;
};

static int snd_pcm_ioctl_xferi_compat(struct snd_pcm_substream *substream,
				      int dir, struct snd_xferi32 __user *data32)
{
	compat_caddr_t buf;
	u32 frames;
	int err;

	if (! substream->runtime)
		return -ENOTTY;
	if (substream->stream != dir)
		return -EINVAL;
	if (substream->runtime->status->state == SNDRV_PCM_STATE_OPEN)
		return -EBADFD;

	if (get_user(buf, &data32->buf) ||
	    get_user(frames, &data32->frames))
		return -EFAULT;

	if (dir == SNDRV_PCM_STREAM_PLAYBACK)
		err = snd_pcm_lib_write(substream, compat_ptr(buf), frames);
	else
		err = snd_pcm_lib_read(substream, compat_ptr(buf), frames);
	if (err < 0)
		return err;
	/* copy the result */
	if (put_user(err, &data32->result))
		return -EFAULT;
	return 0;
}


/* snd_xfern needs remapping of bufs */
struct snd_xfern32 {
	s32 result;
	u32 bufs;  /* this is void **; */
	u32 frames;
};

/*
 * The xfern ioctl needs to copy (up to) 128 pointers onto the stack.
 * Although we could pass the copied pointers on through f_op->ioctl, the
 * ioctl handler there would expand the same 128 pointers on the stack
 * again, so it is better to handle the function (calling pcm_readv/writev)
 * directly in this handler.
 */
static int snd_pcm_ioctl_xfern_compat(struct snd_pcm_substream *substream,
				      int dir, struct snd_xfern32 __user *data32)
{
	compat_caddr_t buf;
	compat_caddr_t __user *bufptr;
	u32 frames;
	void __user **bufs;
	int err, ch, i;

	if (! substream->runtime)
		return -ENOTTY;
	if (substream->stream != dir)
		return -EINVAL;

	if ((ch = substream->runtime->channels) > 128)
		return -EINVAL;
	if (get_user(buf, &data32->bufs) ||
	    get_user(frames, &data32->frames))
		return -EFAULT;
	bufptr = compat_ptr(buf);
	bufs = kmalloc(sizeof(void __user *) * ch, GFP_KERNEL);
	if (bufs == NULL)
		return -ENOMEM;
	for (i = 0; i < ch; i++) {
		u32 ptr;
		if (get_user(ptr, bufptr)) {
			kfree(bufs);
			return -EFAULT;
		}
		bufs[i] = compat_ptr(ptr);	/* was bufs[ch]: out-of-bounds store, loop index is i */
		bufptr++;
	}
	if (dir == SNDRV_PCM_STREAM_PLAYBACK)
		err = snd_pcm_lib_writev(substream, bufs, frames);
	else
		err = snd_pcm_lib_readv(substream, bufs, frames);
	if (err >= 0) {
		if (put_user(err, &data32->result))
			err = -EFAULT;
	}
	kfree(bufs);
	return err;
}


struct snd_pcm_mmap_status32 {
	s32 state;
	s32 pad1;
	u32 hw_ptr;
	struct compat_timespec tstamp;
	s32 suspended_state;
} __attribute__((packed));

struct snd_pcm_mmap_control32 {
	u32 appl_ptr;
	u32 avail_min;
};
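
/*
 * Added note (assumption): natively, snd_pcm_mmap_status/control carry
 * snd_pcm_uframes_t (unsigned long) members, so their size and layout
 * differ on a 64bit kernel; the sync_ptr handler below therefore copies
 * each field by hand rather than the structs wholesale.
 */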

struct snd_pcm_sync_ptr32 {
	u32 flags;
	union {
		struct snd_pcm_mmap_status32 status;
		unsigned char reserved[64];
	} s;
	union {
		struct snd_pcm_mmap_control32 control;
		unsigned char reserved[64];
	} c;
} __attribute__((packed));

static int snd_pcm_ioctl_sync_ptr_compat(struct snd_pcm_substream *substream,
					 struct snd_pcm_sync_ptr32 __user *src)
{
	struct snd_pcm_runtime *runtime = substream->runtime;
	volatile struct snd_pcm_mmap_status *status;
	volatile struct snd_pcm_mmap_control *control;
	u32 sflags;
	struct snd_pcm_mmap_control scontrol;
	struct snd_pcm_mmap_status sstatus;
	snd_pcm_uframes_t boundary;
	int err;

	snd_assert(runtime, return -EINVAL);

	if (get_user(sflags, &src->flags) ||
	    get_user(scontrol.appl_ptr, &src->c.control.appl_ptr) ||
	    get_user(scontrol.avail_min, &src->c.control.avail_min))
		return -EFAULT;
	if (sflags & SNDRV_PCM_SYNC_PTR_HWSYNC) {
		err = snd_pcm_hwsync(substream);
		if (err < 0)
			return err;
	}
	status = runtime->status;
	control = runtime->control;
	boundary = recalculate_boundary(runtime);
	if (! boundary)
		boundary = 0x7fffffff;
	snd_pcm_stream_lock_irq(substream);
	/* FIXME: we should consider the boundary for the sync from app */
	if (!(sflags & SNDRV_PCM_SYNC_PTR_APPL))
		control->appl_ptr = scontrol.appl_ptr;
	else
		scontrol.appl_ptr = control->appl_ptr % boundary;
	if (!(sflags & SNDRV_PCM_SYNC_PTR_AVAIL_MIN))
		control->avail_min = scontrol.avail_min;
	else
		scontrol.avail_min = control->avail_min;
	sstatus.state = status->state;
	sstatus.hw_ptr = status->hw_ptr % boundary;
	sstatus.tstamp = status->tstamp;
	sstatus.suspended_state = status->suspended_state;
	snd_pcm_stream_unlock_irq(substream);
	if (put_user(sstatus.state, &src->s.status.state) ||
	    put_user(sstatus.hw_ptr, &src->s.status.hw_ptr) ||
	    put_user(sstatus.tstamp.tv_sec, &src->s.status.tstamp.tv_sec) ||
	    put_user(sstatus.tstamp.tv_nsec, &src->s.status.tstamp.tv_nsec) ||
	    put_user(sstatus.suspended_state, &src->s.status.suspended_state) ||
	    put_user(scontrol.appl_ptr, &src->c.control.appl_ptr) ||
	    put_user(scontrol.avail_min, &src->c.control.avail_min))
		return -EFAULT;

	return 0;
}


/* 32bit ioctl command numbers, encoding the sizes of the 32bit structs */
enum {
	SNDRV_PCM_IOCTL_HW_REFINE32 = _IOWR('A', 0x10, struct snd_pcm_hw_params32),
	SNDRV_PCM_IOCTL_HW_PARAMS32 = _IOWR('A', 0x11, struct snd_pcm_hw_params32),
	SNDRV_PCM_IOCTL_SW_PARAMS32 = _IOWR('A', 0x13, struct snd_pcm_sw_params32),
	SNDRV_PCM_IOCTL_STATUS32 = _IOR('A', 0x20, struct snd_pcm_status32),
	SNDRV_PCM_IOCTL_DELAY32 = _IOR('A', 0x21, s32),
	SNDRV_PCM_IOCTL_CHANNEL_INFO32 = _IOR('A', 0x32, struct snd_pcm_channel_info32),
	SNDRV_PCM_IOCTL_REWIND32 = _IOW('A', 0x46, u32),
	SNDRV_PCM_IOCTL_FORWARD32 = _IOW('A', 0x49, u32),
	SNDRV_PCM_IOCTL_WRITEI_FRAMES32 = _IOW('A', 0x50, struct snd_xferi32),
	SNDRV_PCM_IOCTL_READI_FRAMES32 = _IOR('A', 0x51, struct snd_xferi32),
	SNDRV_PCM_IOCTL_WRITEN_FRAMES32 = _IOW('A', 0x52, struct snd_xfern32),
	SNDRV_PCM_IOCTL_READN_FRAMES32 = _IOR('A', 0x53, struct snd_xfern32),
	SNDRV_PCM_IOCTL_SYNC_PTR32 = _IOWR('A', 0x23, struct snd_pcm_sync_ptr32),

};

static long snd_pcm_ioctl_compat(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct snd_pcm_file *pcm_file;
	struct snd_pcm_substream *substream;
	void __user *argp = compat_ptr(arg);

	pcm_file = file->private_data;
	if (! pcm_file)
		return -ENOTTY;
	substream = pcm_file->substream;
	if (! substream)
		return -ENOTTY;

	/*
	 * When the PCM device is used in 32bit mode, we have to disable
	 * mmap of the PCM status/control records because of the size
	 * incompatibility.
	 */
	substream->no_mmap_ctrl = 1;
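
	/*
	 * Added note (inference): the records embed unsigned long members,
	 * hence the layout mismatch; 32bit userspace is expected to fall
	 * back to the SNDRV_PCM_IOCTL_SYNC_PTR32 ioctl handled below.
	 */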

	switch (cmd) {
	case SNDRV_PCM_IOCTL_PVERSION:
	case SNDRV_PCM_IOCTL_INFO:
	case SNDRV_PCM_IOCTL_TSTAMP:
	case SNDRV_PCM_IOCTL_HWSYNC:
	case SNDRV_PCM_IOCTL_PREPARE:
	case SNDRV_PCM_IOCTL_RESET:
	case SNDRV_PCM_IOCTL_START:
	case SNDRV_PCM_IOCTL_DROP:
	case SNDRV_PCM_IOCTL_DRAIN:
	case SNDRV_PCM_IOCTL_PAUSE:
	case SNDRV_PCM_IOCTL_HW_FREE:
	case SNDRV_PCM_IOCTL_RESUME:
	case SNDRV_PCM_IOCTL_XRUN:
	case SNDRV_PCM_IOCTL_LINK:
	case SNDRV_PCM_IOCTL_UNLINK:
		if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK)
			return snd_pcm_playback_ioctl1(file, substream, cmd, argp);
		else
			return snd_pcm_capture_ioctl1(file, substream, cmd, argp);
	case SNDRV_PCM_IOCTL_HW_REFINE32:
		return snd_pcm_ioctl_hw_params_compat(substream, 1, argp);
	case SNDRV_PCM_IOCTL_HW_PARAMS32:
		return snd_pcm_ioctl_hw_params_compat(substream, 0, argp);
	case SNDRV_PCM_IOCTL_SW_PARAMS32:
		return snd_pcm_ioctl_sw_params_compat(substream, argp);
	case SNDRV_PCM_IOCTL_STATUS32:
		return snd_pcm_status_user_compat(substream, argp);
	case SNDRV_PCM_IOCTL_SYNC_PTR32:
		return snd_pcm_ioctl_sync_ptr_compat(substream, argp);
	case SNDRV_PCM_IOCTL_CHANNEL_INFO32:
		return snd_pcm_ioctl_channel_info_compat(substream, argp);
	case SNDRV_PCM_IOCTL_WRITEI_FRAMES32:
		return snd_pcm_ioctl_xferi_compat(substream, SNDRV_PCM_STREAM_PLAYBACK, argp);
	case SNDRV_PCM_IOCTL_READI_FRAMES32:
		return snd_pcm_ioctl_xferi_compat(substream, SNDRV_PCM_STREAM_CAPTURE, argp);
	case SNDRV_PCM_IOCTL_WRITEN_FRAMES32:
		return snd_pcm_ioctl_xfern_compat(substream, SNDRV_PCM_STREAM_PLAYBACK, argp);
	case SNDRV_PCM_IOCTL_READN_FRAMES32:
		return snd_pcm_ioctl_xfern_compat(substream, SNDRV_PCM_STREAM_CAPTURE, argp);
	case SNDRV_PCM_IOCTL_DELAY32:
		return snd_pcm_ioctl_delay_compat(substream, argp);
	case SNDRV_PCM_IOCTL_REWIND32:
		return snd_pcm_ioctl_rewind_compat(substream, argp);
	case SNDRV_PCM_IOCTL_FORWARD32:
		return snd_pcm_ioctl_forward_compat(substream, argp);
	}

	return -ENOIOCTLCMD;
}