fs/orangefs/waitqueue.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357

/*
 * (C) 2001 Clemson University and The University of Chicago
 * (C) 2011 Omnibond Systems
 *
 * Changes by Acxiom Corporation to implement generic service_operation()
 * function, Copyright Acxiom Corporation, 2005.
 *
 * See COPYING in top-level directory.
 */

/*
 *  In-kernel waitqueue operations.
 */

#include "protocol.h"
#include "orangefs-kernel.h"
#include "orangefs-bufmap.h"

/* Forward declarations of the static helpers defined later in this file. */
static int wait_for_matching_downcall(struct orangefs_kernel_op_s *, long, bool);
static void orangefs_clean_up_interrupted_operation(struct orangefs_kernel_op_s *);

/*
 * Walk the list of operations that are present in the request queue and
 * mark each of them as purged.
 *
 * NOTE: This is called from the device close after client-core has
 * guaranteed that no new operations could appear on the list since the
 * client-core is anyway going to exit.
 *
 * The whole walk runs with orangefs_request_list_lock held.
 */
void purge_waiting_ops(void)
{
	struct orangefs_kernel_op_s *op, *next;

	spin_lock(&orangefs_request_list_lock);
	/*
	 * set_op_state_purged() may unlink the op from the request list
	 * rather than merely flag it, so the successor has to be fetched
	 * before the loop body runs: use the _safe iterator.
	 */
	list_for_each_entry_safe(op, next, &orangefs_request_list, list) {
		gossip_debug(GOSSIP_WAIT_DEBUG,
			     "pvfs2-client-core: purging op tag %llu %s\n",
			     llu(op->tag),
			     get_opname_string(op));
		set_op_state_purged(op);
		/*
		 * NOTE(review): if set_op_state_purged() can also release
		 * the op, this debug print reads it after release --
		 * confirm against the macro's definition.
		 */
		gossip_debug(GOSSIP_DEV_DEBUG,
			     "%s: op:%s: op_state:%d: process:%s:\n",
			     __func__,
			     get_opname_string(op),
			     op->op_state,
			     current->comm);
	}
	spin_unlock(&orangefs_request_list_lock);
}

/*
 * Submits an ORANGEFS operation and waits for it to complete.
 *
 * @op:      the operation to queue; op->upcall.tgid/pid are filled in
 *           from the current task before queueing.
 * @op_name: name used only in log/debug messages.
 * @flags:   bitmask; the bits examined here are
 *           ORANGEFS_OP_NO_MUTEX      - do not take orangefs_request_mutex,
 *           ORANGEFS_OP_INTERRUPTIBLE - use interruptible (rather than
 *                                       killable) waits,
 *           ORANGEFS_OP_PRIORITY      - queue at the head of the request
 *                                       list instead of the tail.
 *
 * Note op->downcall.status will contain the status of the operation (in
 * errno format), whether provided by pvfs2-client or a result of failure to
 * service the operation.  If the caller wishes to distinguish, then
 * op->op_state can be checked to see if it was serviced or not.
 *
 * An attempt that ends in -EAGAIN is requeued here unless the op uses the
 * shared memory buffer, in which case -EAGAIN is handed back to the caller.
 *
 * Returns contents of op->downcall.status for convenience.
 */
int service_operation(struct orangefs_kernel_op_s *op,
		      const char *op_name,
		      int flags)
{
	/* wait indefinitely unless client-core turns out not to be running */
	long timeout = MAX_SCHEDULE_TIMEOUT;
	int ret = 0;

	op->upcall.tgid = current->tgid;
	op->upcall.pid = current->pid;

retry_servicing:
	op->downcall.status = 0;
	gossip_debug(GOSSIP_WAIT_DEBUG,
		     "%s: %s op:%p: process:%s: pid:%d:\n",
		     __func__,
		     op_name,
		     op,
		     current->comm,
		     current->pid);

	/*
	 * If ORANGEFS_OP_NO_MUTEX was set in flags, we need to avoid
	 * acquiring the request_mutex because we're servicing a
	 * high priority remount operation and the request_mutex is
	 * already taken.
	 */
	if (!(flags & ORANGEFS_OP_NO_MUTEX)) {
		if (flags & ORANGEFS_OP_INTERRUPTIBLE)
			ret = mutex_lock_interruptible(&orangefs_request_mutex);
		else
			ret = mutex_lock_killable(&orangefs_request_mutex);
		/*
		 * check to see if we were interrupted while waiting for
		 * mutex; nothing has been queued yet, so just report it.
		 */
		if (ret < 0) {
			op->downcall.status = ret;
			gossip_debug(GOSSIP_WAIT_DEBUG,
				     "%s: service_operation interrupted.\n",
				     __func__);
			return ret;
		}
	}

	/*
	 * queue up the operation: the list lock is taken first, then the
	 * per-op lock while the state and list linkage are changed.
	 */
	spin_lock(&orangefs_request_list_lock);
	spin_lock(&op->lock);
	set_op_state_waiting(op);
	gossip_debug(GOSSIP_DEV_DEBUG,
		     "%s: op:%s: op_state:%d: process:%s:\n",
		     __func__,
		     get_opname_string(op),
		     op->op_state,
		     current->comm);
	/* add high priority remount op to the front of the line. */
	if (flags & ORANGEFS_OP_PRIORITY)
		list_add(&op->list, &orangefs_request_list);
	else
		list_add_tail(&op->list, &orangefs_request_list);
	spin_unlock(&op->lock);
	wake_up_interruptible(&orangefs_request_list_waitq);
	/* with no client-core around, bound the wait instead of blocking */
	if (!__is_daemon_in_service()) {
		gossip_debug(GOSSIP_WAIT_DEBUG,
			     "%s:client core is NOT in service.\n",
			     __func__);
		timeout = op_timeout_secs * HZ;
	}
	spin_unlock(&orangefs_request_list_lock);

	if (!(flags & ORANGEFS_OP_NO_MUTEX))
		mutex_unlock(&orangefs_request_mutex);

	/* returns with op->lock held, whatever the outcome */
	ret = wait_for_matching_downcall(op, timeout,
					 flags & ORANGEFS_OP_INTERRUPTIBLE);

	gossip_debug(GOSSIP_WAIT_DEBUG,
		     "%s: wait_for_matching_downcall returned %d for %p\n",
		     __func__,
		     ret,
		     op);

	/* got matching downcall; make sure status is in errno format */
	if (!ret) {
		spin_unlock(&op->lock);
		op->downcall.status =
		    orangefs_normalize_to_errno(op->downcall.status);
		ret = op->downcall.status;
		goto out;
	}

	/* failed to get matching downcall */
	if (ret == -ETIMEDOUT) {
		gossip_err("%s: %s -- wait timed out; aborting attempt.\n",
			   __func__,
			   op_name);
	}

	/*
	 * remove a waiting op from the request list or
	 * remove an in-progress op from the in-progress list.
	 * This also drops op->lock.
	 */
	orangefs_clean_up_interrupted_operation(op);

	op->downcall.status = ret;
	/* retry if operation has not been serviced and if requested */
	if (ret == -EAGAIN) {
		op->attempts++;
		/* retries never wait longer than the configured timeout */
		timeout = op_timeout_secs * HZ;
		gossip_debug(GOSSIP_WAIT_DEBUG,
			     "orangefs: tag %llu (%s)"
			     " -- operation to be retried (%d attempt)\n",
			     llu(op->tag),
			     op_name,
			     op->attempts);

		/*
		 * io ops (ops that use the shared memory buffer) have
		 * to be returned to their caller for a retry. Other ops
		 * can just be recycled here.
		 */
		if (!op->uses_shared_memory)
			goto retry_servicing;
	}

out:
	gossip_debug(GOSSIP_WAIT_DEBUG,
		     "%s: %s returning: %d for %p.\n",
		     __func__,
		     op_name,
		     ret,
		     op);
	return ret;
}

/*
 * Turn an in-progress op into a cancel request for itself and queue it.
 *
 * This can get called on an I/O op if it had a bad service_operation.
 *
 * The op's upcall and downcall are wiped and rebuilt as an
 * ORANGEFS_VFS_OP_CANCEL carrying the op's previous tag, the op is given
 * a new tag, and it is placed at the head of the request list.
 *
 * Returns false if the op is not in progress (nothing is touched), or if
 * client-core is not in service (the op has already been rewritten by
 * then, but is not queued).  Returns true once the cancel is queued.
 */
bool orangefs_cancel_op_in_progress(struct orangefs_kernel_op_s *op)
{
	u64 cancelled_tag = op->tag;
	bool queued = false;

	if (!op_state_in_progress(op))
		return false;

	/* remember the buffer slot before the upcall is wiped */
	op->slot_to_free = op->upcall.req.io.buf_index;

	memset(&op->upcall, 0, sizeof(op->upcall));
	op->upcall.type = ORANGEFS_VFS_OP_CANCEL;
	op->upcall.req.cancel.op_tag = cancelled_tag;

	memset(&op->downcall, 0, sizeof(op->downcall));
	op->downcall.type = ORANGEFS_VFS_OP_INVALID;
	op->downcall.status = -1;

	orangefs_new_tag(op);

	spin_lock(&orangefs_request_list_lock);
	/* orangefs_request_list_lock is enough of a barrier here */
	if (__is_daemon_in_service()) {
		spin_lock(&op->lock);
		set_op_state_waiting(op);
		gossip_debug(GOSSIP_DEV_DEBUG,
			     "%s: op:%s: op_state:%d: process:%s:\n",
			     __func__,
			     get_opname_string(op),
			     op->op_state,
			     current->comm);
		list_add(&op->list, &orangefs_request_list);
		spin_unlock(&op->lock);
		queued = true;
	}
	spin_unlock(&orangefs_request_list_lock);

	if (!queued)
		return false;

	gossip_debug(GOSSIP_WAIT_DEBUG,
		     "Attempting ORANGEFS operation cancellation of tag %llu\n",
		     llu(cancelled_tag));
	return true;
}

/*
 * Mark an op as "given up" and detach it from whichever list holds it.
 *
 * What has to be undone depends on the state the op was in when the
 * interruption was detected.
 *
 * Called with op->lock held; the lock is dropped on every path.  The
 * op's completion is re-initialised before returning.
 */
static void
	orangefs_clean_up_interrupted_operation(struct orangefs_kernel_op_s *op)
{
	/*
	 * Flag the op first: list manipulation code elsewhere will
	 * ignore ops that have been given up upon.
	 */
	op->op_state |= OP_VFS_STATE_GIVEN_UP;

	if (list_empty(&op->list)) {
		/*
		 * On neither list: caught copying to/from daemon.  Wait
		 * for the op's completion to be signalled before going on.
		 */
		BUG_ON(op_state_serviced(op));
		spin_unlock(&op->lock);
		wait_for_completion(&op->waitq);
		goto rearm;
	}

	if (op_state_waiting(op)) {
		/*
		 * upcall hasn't been read; remove op from upcall request
		 * list.
		 */
		spin_unlock(&op->lock);
		spin_lock(&orangefs_request_list_lock);
		list_del_init(&op->list);
		spin_unlock(&orangefs_request_list_lock);
		gossip_debug(GOSSIP_WAIT_DEBUG,
			     "Interrupted: Removed op %p from request_list\n",
			     op);
		goto rearm;
	}

	if (op_state_in_progress(op)) {
		/* op must be removed from the in progress htable */
		spin_unlock(&op->lock);
		spin_lock(&orangefs_htable_ops_in_progress_lock);
		list_del_init(&op->list);
		spin_unlock(&orangefs_htable_ops_in_progress_lock);
		gossip_debug(GOSSIP_WAIT_DEBUG,
			     "Interrupted: Removed op %p"
			     " from htable_ops_in_progress\n",
			     op);
		goto rearm;
	}

	/* still linked somewhere, but in no state handled above */
	spin_unlock(&op->lock);
	gossip_err("interrupted operation is in a weird state 0x%x\n",
		   op->op_state);

rearm:
	reinit_completion(&op->waitq);
}

/*
 * Block on op->waitq until the op completes, the wait is interrupted,
 * or @timeout (in jiffies) runs out, then classify the outcome.
 *
 * @interruptible selects the interruptible wait; otherwise the killable
 * one is used.
 *
 * Returns 0 if the op was serviced, otherwise a negative errno:
 *   -EINTR     the wait itself returned an error (we were interrupted),
 *   -EAGAIN    the op was purged and has been attempted fewer than
 *              ORANGEFS_PURGE_RETRY_COUNT times; caller should requeue,
 *   -EIO       the op was purged and the retry budget is used up,
 *   -ETIMEDOUT none of the above, i.e. the wait timed out.
 *
 * Always returns with op->lock taken.  On failure the op is not removed
 * from any list here; service_operation() follows up with
 * orangefs_clean_up_interrupted_operation() for that.
 */
static int wait_for_matching_downcall(struct orangefs_kernel_op_s *op,
				      long timeout,
				      bool interruptible)
{
	long remaining;

	/*
	 * There's a "schedule_timeout" inside of these wait
	 * primitives, during which the op is out of the hands of the
	 * user process that needs something done and is being
	 * manipulated by the client-core process.
	 */
	remaining = interruptible ?
		wait_for_completion_interruptible_timeout(&op->waitq,
							  timeout) :
		wait_for_completion_killable_timeout(&op->waitq, timeout);

	spin_lock(&op->lock);

	/* a serviced op wins over whatever the wait primitive reported */
	if (op_state_serviced(op))
		return 0;

	if (unlikely(remaining < 0)) {
		gossip_debug(GOSSIP_WAIT_DEBUG,
			     "%s: operation interrupted, tag %llu, %p\n",
			     __func__,
			     llu(op->tag),
			     op);
		return -EINTR;
	}

	if (!op_state_purged(op)) {
		/* must have timed out, then... */
		gossip_debug(GOSSIP_WAIT_DEBUG,
			     "%s: operation timed out, tag %llu, %p, %d)\n",
			     __func__,
			     llu(op->tag),
			     op,
			     op->attempts);
		return -ETIMEDOUT;
	}

	gossip_debug(GOSSIP_WAIT_DEBUG,
		     "%s: operation purged, tag %llu, %p, %d\n",
		     __func__,
		     llu(op->tag),
		     op,
		     op->attempts);

	/* purged: ask for a requeue until the retry budget is exhausted */
	if (op->attempts < ORANGEFS_PURGE_RETRY_COUNT)
		return -EAGAIN;

	return -EIO;
}