1 |
From 9449ad33be8480f538b11a593e2dda2fb33ca06d Mon Sep 17 00:00:00 2001 |
2 |
From: Junxiao Bi <junxiao.bi@oracle.com> |
3 |
Date: Thu, 29 Jul 2021 14:53:41 -0700 |
4 |
Subject: ocfs2: issue zeroout to EOF blocks |
5 |
|
6 |
From: Junxiao Bi <junxiao.bi@oracle.com> |
7 |
|
8 |
commit 9449ad33be8480f538b11a593e2dda2fb33ca06d upstream. |
9 |
|
10 |
For punch holes in EOF blocks, fallocate used buffer write to zero the |
11 |
EOF blocks in last cluster. But since ->writepage will ignore EOF |
12 |
pages, those zeros will not be flushed. |
13 |
|
14 |
This "looks" ok as commit 6bba4471f0cc ("ocfs2: fix data corruption by |
15 |
fallocate") will zero the EOF blocks when extending the file size, but it |
16 |
isn't. The problem happened on those EOF pages, before writeback, those |
17 |
pages had DIRTY flag set and all buffer_head in them also had DIRTY flag |
18 |
set, when writeback was run by write_cache_pages(), DIRTY flag on the page |
19 |
was cleared, but DIRTY flag on the buffer_head was not. |
20 |
|
21 |
When next write happened to those EOF pages, since buffer_head already |
22 |
had DIRTY flag set, it would not mark page DIRTY again. That made |
23 |
writeback ignore them forever. That will cause data corruption. Even |
24 |
directio write can't work because it will fail when trying to drop pages |
25 |
caches before direct io, as it found the buffer_head for those pages |
26 |
still had DIRTY flag set, then it will fall back to buffer io mode. |
27 |
|
28 |
To make a summary of the issue, as writeback ignores EOF pages, once any |
29 |
EOF page is generated, any write to it will only go to the page cache, |
30 |
it will never be flushed to disk even if the file size extends and that page is |
31 |
not an EOF page any more. The fix is to avoid zeroing EOF blocks with buffer |
32 |
write. |
33 |
|
34 |
The following code snippet from qemu-img could trigger the corruption. |
35 |
|
36 |
656 open("6b3711ae-3306-4bdd-823c-cf1c0060a095.conv.2", O_RDWR|O_DIRECT|O_CLOEXEC) = 11 |
37 |
... |
38 |
660 fallocate(11, FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE, 2275868672, 327680 <unfinished ...> |
39 |
660 fallocate(11, 0, 2275868672, 327680) = 0 |
40 |
658 pwrite64(11, " |
41 |
|
42 |
Link: https://lkml.kernel.org/r/20210722054923.24389-2-junxiao.bi@oracle.com |
43 |
Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com> |
44 |
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com> |
45 |
Cc: Mark Fasheh <mark@fasheh.com> |
46 |
Cc: Joel Becker <jlbec@evilplan.org> |
47 |
Cc: Changwei Ge <gechangwei@live.cn> |
48 |
Cc: Gang He <ghe@suse.com> |
49 |
Cc: Jun Piao <piaojun@huawei.com> |
50 |
Cc: <stable@vger.kernel.org> |
51 |
Signed-off-by: Andrew Morton <akpm@linux-foundation.org> |
52 |
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> |
53 |
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> |
54 |
--- |
55 |
fs/ocfs2/file.c | 99 +++++++++++++++++++++++++++++++++----------------------- |
56 |
1 file changed, 60 insertions(+), 39 deletions(-) |
57 |
|
58 |
--- a/fs/ocfs2/file.c |
59 |
+++ b/fs/ocfs2/file.c |
60 |
@@ -1529,6 +1529,45 @@ static void ocfs2_truncate_cluster_pages |
61 |
} |
62 |
} |
63 |
|
64 |
+/* |
65 |
+ * zero out partial blocks of one cluster. |
66 |
+ * |
67 |
+ * start: file offset where zero starts, will be made upper block aligned. |
68 |
+ * len: it will be trimmed to the end of current cluster if "start + len" |
69 |
+ * is bigger than it. |
70 |
+ */ |
71 |
+static int ocfs2_zeroout_partial_cluster(struct inode *inode, |
72 |
+ u64 start, u64 len) |
73 |
+{ |
74 |
+ int ret; |
75 |
+ u64 start_block, end_block, nr_blocks; |
76 |
+ u64 p_block, offset; |
77 |
+ u32 cluster, p_cluster, nr_clusters; |
78 |
+ struct super_block *sb = inode->i_sb; |
79 |
+ u64 end = ocfs2_align_bytes_to_clusters(sb, start); |
80 |
+ |
81 |
+ if (start + len < end) |
82 |
+ end = start + len; |
83 |
+ |
84 |
+ start_block = ocfs2_blocks_for_bytes(sb, start); |
85 |
+ end_block = ocfs2_blocks_for_bytes(sb, end); |
86 |
+ nr_blocks = end_block - start_block; |
87 |
+ if (!nr_blocks) |
88 |
+ return 0; |
89 |
+ |
90 |
+ cluster = ocfs2_bytes_to_clusters(sb, start); |
91 |
+ ret = ocfs2_get_clusters(inode, cluster, &p_cluster, |
92 |
+ &nr_clusters, NULL); |
93 |
+ if (ret) |
94 |
+ return ret; |
95 |
+ if (!p_cluster) |
96 |
+ return 0; |
97 |
+ |
98 |
+ offset = start_block - ocfs2_clusters_to_blocks(sb, cluster); |
99 |
+ p_block = ocfs2_clusters_to_blocks(sb, p_cluster) + offset; |
100 |
+ return sb_issue_zeroout(sb, p_block, nr_blocks, GFP_NOFS); |
101 |
+} |
102 |
+ |
103 |
static int ocfs2_zero_partial_clusters(struct inode *inode, |
104 |
u64 start, u64 len) |
105 |
{ |
106 |
@@ -1538,6 +1577,7 @@ static int ocfs2_zero_partial_clusters(s |
107 |
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
108 |
unsigned int csize = osb->s_clustersize; |
109 |
handle_t *handle; |
110 |
+ loff_t isize = i_size_read(inode); |
111 |
|
112 |
/* |
113 |
* The "start" and "end" values are NOT necessarily part of |
114 |
@@ -1558,6 +1598,26 @@ static int ocfs2_zero_partial_clusters(s |
115 |
if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0) |
116 |
goto out; |
117 |
|
118 |
+ /* No page cache for EOF blocks, issue zero out to disk. */ |
119 |
+ if (end > isize) { |
120 |
+ /* |
121 |
+ * zeroout eof blocks in last cluster starting from |
122 |
+ * "isize" even "start" > "isize" because it is |
123 |
+ * complicated to zeroout just at "start" as "start" |
124 |
+ * may be not aligned with block size, buffer write |
125 |
+ * would be required to do that, but out of eof buffer |
126 |
+ * write is not supported. |
127 |
+ */ |
128 |
+ ret = ocfs2_zeroout_partial_cluster(inode, isize, |
129 |
+ end - isize); |
130 |
+ if (ret) { |
131 |
+ mlog_errno(ret); |
132 |
+ goto out; |
133 |
+ } |
134 |
+ if (start >= isize) |
135 |
+ goto out; |
136 |
+ end = isize; |
137 |
+ } |
138 |
handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); |
139 |
if (IS_ERR(handle)) { |
140 |
ret = PTR_ERR(handle); |
141 |
@@ -1856,45 +1916,6 @@ out: |
142 |
} |
143 |
|
144 |
/* |
145 |
- * zero out partial blocks of one cluster. |
146 |
- * |
147 |
- * start: file offset where zero starts, will be made upper block aligned. |
148 |
- * len: it will be trimmed to the end of current cluster if "start + len" |
149 |
- * is bigger than it. |
150 |
- */ |
151 |
-static int ocfs2_zeroout_partial_cluster(struct inode *inode, |
152 |
- u64 start, u64 len) |
153 |
-{ |
154 |
- int ret; |
155 |
- u64 start_block, end_block, nr_blocks; |
156 |
- u64 p_block, offset; |
157 |
- u32 cluster, p_cluster, nr_clusters; |
158 |
- struct super_block *sb = inode->i_sb; |
159 |
- u64 end = ocfs2_align_bytes_to_clusters(sb, start); |
160 |
- |
161 |
- if (start + len < end) |
162 |
- end = start + len; |
163 |
- |
164 |
- start_block = ocfs2_blocks_for_bytes(sb, start); |
165 |
- end_block = ocfs2_blocks_for_bytes(sb, end); |
166 |
- nr_blocks = end_block - start_block; |
167 |
- if (!nr_blocks) |
168 |
- return 0; |
169 |
- |
170 |
- cluster = ocfs2_bytes_to_clusters(sb, start); |
171 |
- ret = ocfs2_get_clusters(inode, cluster, &p_cluster, |
172 |
- &nr_clusters, NULL); |
173 |
- if (ret) |
174 |
- return ret; |
175 |
- if (!p_cluster) |
176 |
- return 0; |
177 |
- |
178 |
- offset = start_block - ocfs2_clusters_to_blocks(sb, cluster); |
179 |
- p_block = ocfs2_clusters_to_blocks(sb, p_cluster) + offset; |
180 |
- return sb_issue_zeroout(sb, p_block, nr_blocks, GFP_NOFS); |
181 |
-} |
182 |
- |
183 |
-/* |
184 |
* Parts of this function taken from xfs_change_file_space() |
185 |
*/ |
186 |
static int __ocfs2_change_file_space(struct file *file, struct inode *inode, |