1 |
commit 6424babfd68dd8a83d9c60a5242d27038856599f |
2 |
Author: Jerry Hoemann <jerry.hoemann@hp.com> |
3 |
Date: Wed Oct 29 14:49:57 2014 -0700 |
4 |
|
5 |
fsnotify: next_i is freed during fsnotify_unmount_inodes. |
6 |
|
7 |
During file system stress testing on 3.10 and 3.12 based kernels, the |
8 |
umount command occasionally hung in fsnotify_unmount_inodes in the |
9 |
section of code: |
10 |
|
11 |
spin_lock(&inode->i_lock); |
12 |
if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) { |
13 |
spin_unlock(&inode->i_lock); |
14 |
continue; |
15 |
} |
16 |
|
17 |
As this section of code holds the global inode_sb_list_lock, eventually |
18 |
the system hangs trying to acquire the lock. |
19 |
|
20 |
Multiple crash dumps showed: |
21 |
|
22 |
The inode->i_state == 0x60 and i_count == 0 and i_sb_list would point |
23 |
back at itself. As this is not the value of list upon entry to the |
24 |
function, the kernel never exits the loop. |
25 |
|
26 |
To help narrow down the problem, the call to list_del_init in |
27 |
inode_sb_list_del was changed to list_del. This poisons the pointers in |
28 |
the i_sb_list and causes the kernel to panic if it traverses a freed |
29 |
inode. |
30 |
|
31 |
Subsequent stress testing panicked in fsnotify_unmount_inodes at the |
32 |
bottom of the list_for_each_entry_safe loop showing next_i had become |
33 |
free. |
34 |
|
35 |
We believe the root cause of the problem is that next_i is being freed |
36 |
during the window of time that the list_for_each_entry_safe loop |
37 |
temporarily releases inode_sb_list_lock to call fsnotify and |
38 |
fsnotify_inode_delete. |
39 |
|
40 |
The code in fsnotify_unmount_inodes attempts to prevent the freeing of |
41 |
inode and next_i by calling __iget. However, the code doesn't do the |
42 |
__iget call on next_i: |
43 |
|
44 |
if i_count == 0 or |
45 |
if i_state & (I_FREEING | I_WILL_FREE) |
46 |
|
47 |
The patch addresses this issue by advancing next_i in the above two cases |
48 |
until we either find a next_i which we can __iget or we reach the end of |
49 |
the list. This makes the handling of next_i more closely match the |
50 |
handling of the variable "inode." |
51 |
|
52 |
The time to reproduce the hang is highly variable (from hours to days). We |
53 |
ran the stress test on a 3.10 kernel with the proposed patch for a week |
54 |
without failure. |
55 |
|
56 |
During list_for_each_entry_safe, next_i is becoming free causing |
57 |
the loop to never terminate. Advance next_i in those cases where |
58 |
__iget is not done. |
59 |
|
60 |
Signed-off-by: Jerry Hoemann <jerry.hoemann@hp.com> |
61 |
Cc: Jeff Kirsher <jeffrey.t.kirsher@intel.com> |
62 |
Cc: Ken Helias <kenhelias@firemail.de> |
63 |
Signed-off-by: Andrew Morton <akpm@linux-foundation.org> |
64 |
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> |
65 |
--- |
66 |
fs/notify/inode_mark.c | 17 +++++++++++------ |
67 |
1 file changed, 11 insertions(+), 6 deletions(-) |
68 |
|
69 |
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c |
70 |
index 9ce0622..e849714 100644 |
71 |
--- a/fs/notify/inode_mark.c |
72 |
+++ b/fs/notify/inode_mark.c |
73 |
@@ -288,20 +288,25 @@ void fsnotify_unmount_inodes(struct list_head *list) |
74 |
spin_unlock(&inode->i_lock); |
75 |
|
76 |
/* In case the dropping of a reference would nuke next_i. */ |
77 |
- if ((&next_i->i_sb_list != list) && |
78 |
- atomic_read(&next_i->i_count)) { |
79 |
+ while (&next_i->i_sb_list != list) { |
80 |
spin_lock(&next_i->i_lock); |
81 |
- if (!(next_i->i_state & (I_FREEING | I_WILL_FREE))) { |
82 |
+ if (!(next_i->i_state & (I_FREEING | I_WILL_FREE)) && |
83 |
+ atomic_read(&next_i->i_count)) { |
84 |
__iget(next_i); |
85 |
need_iput = next_i; |
86 |
+ spin_unlock(&next_i->i_lock); |
87 |
+ break; |
88 |
} |
89 |
spin_unlock(&next_i->i_lock); |
90 |
+ next_i = list_entry(next_i->i_sb_list.next, |
91 |
+ struct inode, i_sb_list); |
92 |
} |
93 |
|
94 |
/* |
95 |
- * We can safely drop inode_sb_list_lock here because we hold |
96 |
- * references on both inode and next_i. Also no new inodes |
97 |
- * will be added since the umount has begun. |
98 |
+ * We can safely drop inode_sb_list_lock here because either |
99 |
+ * we actually hold references on both inode and next_i or |
100 |
+ * end of list. Also no new inodes will be added since the |
101 |
+ * umount has begun. |
102 |
*/ |
103 |
spin_unlock(&inode_sb_list_lock); |
104 |
|