Package: zfs-linux / 0.7.12-2+deb10u2

01937958ce85b1cd8942dbaf9a3f9768c5b02a0a.patch
From 01937958ce85b1cd8942dbaf9a3f9768c5b02a0a Mon Sep 17 00:00:00 2001
From: Matthew Ahrens <mahrens@delphix.com>
Date: Thu, 31 May 2018 10:29:12 -0700
Subject: [PATCH] OpenZFS 9577 - remove zfs_dbuf_evict_key tsd

The zfs_dbuf_evict_key TSD (thread-specific data) is not necessary -
we can instead pass a flag down in a few places to prevent recursive
dbuf eviction. Making this change has 3 benefits:

1. The code semantics are easier to understand.
2. On Linux, performance is improved, because creating/removing
   TSD values (by setting to NULL vs non-NULL) is expensive, and
   we do it very often.
3. According to Nexenta, the current semantics can cause a
   deadlock when concurrently calling dmu_objset_evict_dbufs()
   (which is rare today, but they are working on a "parallel
   unmount" change that triggers this more easily):

Porting Notes:
* Minor conflict with OpenZFS 9337 which has not yet been ported.

Authored by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Serapheim Dimitropoulos <serapheim.dimitro@delphix.com>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed-by: George Melikov <mail@gmelikov.ru>
Ported-by: Brian Behlendorf <behlendorf1@llnl.gov>

OpenZFS-issue: https://illumos.org/issues/9577
OpenZFS-commit: https://github.com/openzfs/openzfs/pull/645
External-issue: DLPX-58547
Closes #7602
---
 include/sys/dbuf.h      |  4 +--
 include/sys/dnode.h     |  4 +--
 module/zfs/dbuf.c       | 69 ++++++++++++++---------------------------
 module/zfs/dnode.c      |  7 +++--
 module/zfs/dnode_sync.c | 17 ++++++++--
 5 files changed, 46 insertions(+), 55 deletions(-)

diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h
index 127acad33c7..e007e97644e 100644
--- a/include/sys/dbuf.h
+++ b/include/sys/dbuf.h
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
@@ -294,7 +294,7 @@ boolean_t dbuf_try_add_ref(dmu_buf_t *db, objset_t *os, uint64_t obj,
 uint64_t dbuf_refcount(dmu_buf_impl_t *db);
 
 void dbuf_rele(dmu_buf_impl_t *db, void *tag);
-void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag);
+void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting);
 
 dmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level,
     uint64_t blkid);
diff --git a/include/sys/dnode.h b/include/sys/dnode.h
index 1e77e0a32ec..4e006df5535 100644
--- a/include/sys/dnode.h
+++ b/include/sys/dnode.h
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
@@ -339,7 +339,7 @@ int dnode_hold_impl(struct objset *dd, uint64_t object, int flag, int dn_slots,
     void *ref, dnode_t **dnp);
 boolean_t dnode_add_ref(dnode_t *dn, void *ref);
 void dnode_rele(dnode_t *dn, void *ref);
-void dnode_rele_and_unlock(dnode_t *dn, void *tag);
+void dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting);
 void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx);
 void dnode_sync(dnode_t *dn, dmu_tx_t *tx);
 void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 62b77bb0a1d..9694ce78b19 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -72,8 +72,6 @@ static void __dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh,
 	void *tag, dmu_buf_impl_t **dbp, int depth);
 static int __dbuf_hold_impl(struct dbuf_hold_impl_data *dh);
 
-uint_t zfs_dbuf_evict_key;
-
 static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
 
@@ -505,14 +503,6 @@ dbuf_evict_one(void)
 	dmu_buf_impl_t *db;
 	ASSERT(!MUTEX_HELD(&dbuf_evict_lock));
 
-	/*
-	 * Set the thread's tsd to indicate that it's processing evictions.
-	 * Once a thread stops evicting from the dbuf cache it will
-	 * reset its tsd to NULL.
-	 */
-	ASSERT3P(tsd_get(zfs_dbuf_evict_key), ==, NULL);
-	(void) tsd_set(zfs_dbuf_evict_key, (void *)B_TRUE);
-
 	db = multilist_sublist_tail(mls);
 	while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) {
 		db = multilist_sublist_prev(mls, db);
@@ -530,7 +520,6 @@ dbuf_evict_one(void)
 	} else {
 		multilist_sublist_unlock(mls);
 	}
-	(void) tsd_set(zfs_dbuf_evict_key, NULL);
 }
 
 /*
@@ -583,29 +572,6 @@ dbuf_evict_thread(void)
 static void
 dbuf_evict_notify(void)
 {
-
-	/*
-	 * We use thread specific data to track when a thread has
-	 * started processing evictions. This allows us to avoid deeply
-	 * nested stacks that would have a call flow similar to this:
-	 *
-	 * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify()
-	 *	^						|
-	 *	|						|
-	 *	+-----dbuf_destroy()<--dbuf_evict_one()<--------+
-	 *
-	 * The dbuf_eviction_thread will always have its tsd set until
-	 * that thread exits. All other threads will only set their tsd
-	 * if they are participating in the eviction process. This only
-	 * happens if the eviction thread is unable to process evictions
-	 * fast enough. To keep the dbuf cache size in check, other threads
-	 * can evict from the dbuf cache directly. Those threads will set
-	 * their tsd values so that we ensure that they only evict one dbuf
-	 * from the dbuf cache.
-	 */
-	if (tsd_get(zfs_dbuf_evict_key) != NULL)
-		return;
-
 	/*
 	 * We check if we should evict without holding the dbuf_evict_lock,
 	 * because it's OK to occasionally make the wrong decision here,
@@ -681,7 +647,6 @@ dbuf_init(void)
 	    dbuf_cache_multilist_index_func);
 	zfs_refcount_create(&dbuf_cache_size);
 
-	tsd_create(&zfs_dbuf_evict_key, NULL);
 	dbuf_evict_thread_exit = B_FALSE;
 	mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL);
@@ -718,7 +683,6 @@ dbuf_fini(void)
 		cv_wait(&dbuf_evict_cv, &dbuf_evict_lock);
 	}
 	mutex_exit(&dbuf_evict_lock);
-	tsd_destroy(&zfs_dbuf_evict_key);
 
 	mutex_destroy(&dbuf_evict_lock);
 	cv_destroy(&dbuf_evict_cv);
@@ -1004,7 +968,7 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
 		db->db_state = DB_UNCACHED;
 	}
 	cv_broadcast(&db->db_changed);
-	dbuf_rele_and_unlock(db, NULL);
+	dbuf_rele_and_unlock(db, NULL, B_FALSE);
 }
 
 static int
@@ -2178,7 +2142,8 @@ dbuf_destroy(dmu_buf_impl_t *db)
 		 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
 		 * release any lock.
 		 */
-		dnode_rele(dn, db);
+		mutex_enter(&dn->dn_mtx);
+		dnode_rele_and_unlock(dn, db, B_TRUE);
 		db->db_dnode_handle = NULL;
 
 		dbuf_hash_remove(db);
@@ -2204,8 +2169,10 @@ dbuf_destroy(dmu_buf_impl_t *db)
 	 * If this dbuf is referenced from an indirect dbuf,
 	 * decrement the ref count on the indirect dbuf.
 	 */
-	if (parent && parent != dndb)
-		dbuf_rele(parent, db);
+	if (parent && parent != dndb) {
+		mutex_enter(&parent->db_mtx);
+		dbuf_rele_and_unlock(parent, db, B_TRUE);
+	}
 }
 
 /*
@@ -2912,7 +2879,7 @@ void
 dbuf_rele(dmu_buf_impl_t *db, void *tag)
 {
 	mutex_enter(&db->db_mtx);
-	dbuf_rele_and_unlock(db, tag);
+	dbuf_rele_and_unlock(db, tag, B_FALSE);
 }
 
 void
@@ -2923,10 +2890,19 @@ dmu_buf_rele(dmu_buf_t *db, void *tag)
 
 /*
  * dbuf_rele() for an already-locked dbuf.  This is necessary to allow
- * db_dirtycnt and db_holds to be updated atomically.
+ * db_dirtycnt and db_holds to be updated atomically.  The 'evicting'
+ * argument should be set if we are already in the dbuf-evicting code
+ * path, in which case we don't want to recursively evict.  This allows us to
+ * avoid deeply nested stacks that would have a call flow similar to this:
+ *
+ * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify()
+ *	^						|
+ *	|						|
+ *	+-----dbuf_destroy()<--dbuf_evict_one()<--------+
+ *
  */
 void
-dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
+dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting)
 {
 	int64_t holds;
 
@@ -3021,7 +2997,8 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
 				    db->db.db_size, db);
 				mutex_exit(&db->db_mtx);
 
-				dbuf_evict_notify();
+				if (!evicting)
+					dbuf_evict_notify();
 			}
 
 			if (do_arc_evict)
@@ -3314,7 +3291,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 		kmem_free(dr, sizeof (dbuf_dirty_record_t));
 		ASSERT(db->db_dirtycnt > 0);
 		db->db_dirtycnt -= 1;
-		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
+		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE);
 		return;
 	}
 
@@ -3670,7 +3647,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
 	ASSERT(db->db_dirtycnt > 0);
 	db->db_dirtycnt -= 1;
 	db->db_data_pending = NULL;
-	dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg);
+	dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE);
 }
 
 static void
diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c
index 989a8ec7f69..b80aeb8b1f8 100644
--- a/module/zfs/dnode.c
+++ b/module/zfs/dnode.c
@@ -1533,11 +1533,11 @@ void
 dnode_rele(dnode_t *dn, void *tag)
 {
 	mutex_enter(&dn->dn_mtx);
-	dnode_rele_and_unlock(dn, tag);
+	dnode_rele_and_unlock(dn, tag, B_FALSE);
 }
 
 void
-dnode_rele_and_unlock(dnode_t *dn, void *tag)
+dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting)
 {
 	uint64_t refs;
 	/* Get while the hold prevents the dnode from moving. */
@@ -1568,7 +1568,8 @@ dnode_rele_and_unlock(dnode_t *dn, void *tag)
 		 * that the handle has zero references, but that will be
 		 * asserted anyway when the handle gets destroyed.
 		 */
-		dbuf_rele(db, dnh);
+		mutex_enter(&db->db_mtx);
+		dbuf_rele_and_unlock(db, dnh, evicting);
 	}
 }
 
diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c
index 2febb520630..ee86c13b1b2 100644
--- a/module/zfs/dnode_sync.c
+++ b/module/zfs/dnode_sync.c
@@ -21,7 +21,7 @@
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
@@ -429,6 +429,19 @@ dnode_evict_dbufs(dnode_t *dn)
 			avl_insert_here(&dn->dn_dbufs, db_marker, db,
 			    AVL_BEFORE);
 
+			/*
+			 * We need to use the "marker" dbuf rather than
+			 * simply getting the next dbuf, because
+			 * dbuf_destroy() may actually remove multiple dbufs.
+			 * It can call itself recursively on the parent dbuf,
+			 * which may also be removed from dn_dbufs.  The code
+			 * flow would look like:
+			 *
+			 * dbuf_destroy():
+			 *   dnode_rele_and_unlock(parent_dbuf, evicting=TRUE):
+			 *	if (!cacheable || pending_evict)
+			 *	  dbuf_destroy()
+			 */
 			dbuf_destroy(db);
 
 			db_next = AVL_NEXT(&dn->dn_dbufs, db_marker);
@@ -489,7 +502,7 @@ dnode_undirty_dbufs(list_t *list)
 			list_destroy(&dr->dt.di.dr_children);
 		}
 		kmem_free(dr, sizeof (dbuf_dirty_record_t));
-		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
+		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE);
 	}
 }