current/SOURCES/ocfs2-issue-zeroout-to-eof-blocks.patch

From 9449ad33be8480f538b11a593e2dda2fb33ca06d Mon Sep 17 00:00:00 2001
From: Junxiao Bi <junxiao.bi@oracle.com>
Date: Thu, 29 Jul 2021 14:53:41 -0700
Subject: ocfs2: issue zeroout to EOF blocks

From: Junxiao Bi <junxiao.bi@oracle.com>

commit 9449ad33be8480f538b11a593e2dda2fb33ca06d upstream.

For punch holes in EOF blocks, fallocate used buffer write to zero the
EOF blocks in last cluster.  But since ->writepage will ignore EOF
pages, those zeros will not be flushed.

This "looks" ok as commit 6bba4471f0cc ("ocfs2: fix data corruption by
fallocate") will zero the EOF blocks when extend the file size, but it
isn't.  The problem happened on those EOF pages, before writeback, those
pages had DIRTY flag set and all buffer_head in them also had DIRTY flag
set, when writeback run by write_cache_pages(), DIRTY flag on the page
was cleared, but DIRTY flag on the buffer_head not.

When next write happened to those EOF pages, since buffer_head already
had DIRTY flag set, it would not mark page DIRTY again.  That made
writeback ignore them forever.  That will cause data corruption.  Even
directio write can't work because it will fail when trying to drop pages
caches before direct io, as it found the buffer_head for those pages
still had DIRTY flag set, then it will fall back to buffer io mode.

To make a summary of the issue, as writeback ingores EOF pages, once any
EOF page is generated, any write to it will only go to the page cache,
it will never be flushed to disk even file size extends and that page is
not EOF page any more.  The fix is to avoid zero EOF blocks with buffer
write.

The following code snippet from qemu-img could trigger the corruption.

  656   open("6b3711ae-3306-4bdd-823c-cf1c0060a095.conv.2", O_RDWR|O_DIRECT|O_CLOEXEC) = 11
  ...
  660   fallocate(11, FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE, 2275868672, 327680 <unfinished ...>
  660   fallocate(11, 0, 2275868672, 327680) = 0
  658   pwrite64(11, "

Link: https://lkml.kernel.org/r/20210722054923.24389-2-junxiao.bi@oracle.com
Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Changwei Ge <gechangwei@live.cn>
Cc: Gang He <ghe@suse.com>
Cc: Jun Piao <piaojun@huawei.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ocfs2/file.c |   99 +++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 60 insertions(+), 39 deletions(-)

--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1529,6 +1529,45 @@ static void ocfs2_truncate_cluster_pages
        }
 }
 
+/*
+ * zero out partial blocks of one cluster.
+ *
+ * start: file offset where zero starts, will be made upper block aligned.
+ * len: it will be trimmed to the end of current cluster if "start + len"
+ *      is bigger than it.
+ */
+static int ocfs2_zeroout_partial_cluster(struct inode *inode,
+                                       u64 start, u64 len)
+{
+       int ret;
+       u64 start_block, end_block, nr_blocks;
+       u64 p_block, offset;
+       u32 cluster, p_cluster, nr_clusters;
+       struct super_block *sb = inode->i_sb;
+       u64 end = ocfs2_align_bytes_to_clusters(sb, start);
+
+       if (start + len < end)
+               end = start + len;
+
+       start_block = ocfs2_blocks_for_bytes(sb, start);
+       end_block = ocfs2_blocks_for_bytes(sb, end);
+       nr_blocks = end_block - start_block;
+       if (!nr_blocks)
+               return 0;
+
+       cluster = ocfs2_bytes_to_clusters(sb, start);
+       ret = ocfs2_get_clusters(inode, cluster, &p_cluster,
+                               &nr_clusters, NULL);
+       if (ret)
+               return ret;
+       if (!p_cluster)
+               return 0;
+
+       offset = start_block - ocfs2_clusters_to_blocks(sb, cluster);
+       p_block = ocfs2_clusters_to_blocks(sb, p_cluster) + offset;
+       return sb_issue_zeroout(sb, p_block, nr_blocks, GFP_NOFS);
+}
+
 static int ocfs2_zero_partial_clusters(struct inode *inode,
                                       u64 start, u64 len)
 {
@@ -1538,6 +1577,7 @@ static int ocfs2_zero_partial_clusters(s
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        unsigned int csize = osb->s_clustersize;
        handle_t *handle;
+       loff_t isize = i_size_read(inode);
 
        /*
         * The "start" and "end" values are NOT necessarily part of
@@ -1558,6 +1598,26 @@ static int ocfs2_zero_partial_clusters(s
        if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0)
                goto out;
 
+       /* No page cache for EOF blocks, issue zero out to disk. */
+       if (end > isize) {
+               /*
+                * zeroout eof blocks in last cluster starting from
+                * "isize" even "start" > "isize" because it is
+                * complicated to zeroout just at "start" as "start"
+                * may be not aligned with block size, buffer write
+                * would be required to do that, but out of eof buffer
+                * write is not supported.
+                */
+               ret = ocfs2_zeroout_partial_cluster(inode, isize,
+                                       end - isize);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+               if (start >= isize)
+                       goto out;
+               end = isize;
+       }
        handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
        if (IS_ERR(handle)) {
                ret = PTR_ERR(handle);
@@ -1856,45 +1916,6 @@ out:
 }
 
 /*
- * zero out partial blocks of one cluster.
- *
- * start: file offset where zero starts, will be made upper block aligned.
- * len: it will be trimmed to the end of current cluster if "start + len"
- *      is bigger than it.
- */
-static int ocfs2_zeroout_partial_cluster(struct inode *inode,
-                                       u64 start, u64 len)
-{
-       int ret;
-       u64 start_block, end_block, nr_blocks;
-       u64 p_block, offset;
-       u32 cluster, p_cluster, nr_clusters;
-       struct super_block *sb = inode->i_sb;
-       u64 end = ocfs2_align_bytes_to_clusters(sb, start);
-
-       if (start + len < end)
-               end = start + len;
-
-       start_block = ocfs2_blocks_for_bytes(sb, start);
-       end_block = ocfs2_blocks_for_bytes(sb, end);
-       nr_blocks = end_block - start_block;
-       if (!nr_blocks)
-               return 0;
-
-       cluster = ocfs2_bytes_to_clusters(sb, start);
-       ret = ocfs2_get_clusters(inode, cluster, &p_cluster,
-                               &nr_clusters, NULL);
-       if (ret)
-               return ret;
-       if (!p_cluster)
-               return 0;
-
-       offset = start_block - ocfs2_clusters_to_blocks(sb, cluster);
-       p_block = ocfs2_clusters_to_blocks(sb, p_cluster) + offset;
-       return sb_issue_zeroout(sb, p_block, nr_blocks, GFP_NOFS);
-}
-
-/*
  * Parts of this function taken from xfs_change_file_space()
  */
 static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1	From 9449ad33be8480f538b11a593e2dda2fb33ca06d Mon Sep 17 00:00:00 2001
2	From: Junxiao Bi <junxiao.bi@oracle.com>
3	Date: Thu, 29 Jul 2021 14:53:41 -0700
4	Subject: ocfs2: issue zeroout to EOF blocks
5
6	From: Junxiao Bi <junxiao.bi@oracle.com>
7
8	commit 9449ad33be8480f538b11a593e2dda2fb33ca06d upstream.
9
10	For punch holes in EOF blocks, fallocate used buffer write to zero the
11	EOF blocks in last cluster. But since ->writepage will ignore EOF
12	pages, those zeros will not be flushed.
13
14	This "looks" ok as commit 6bba4471f0cc ("ocfs2: fix data corruption by
15	fallocate") will zero the EOF blocks when extend the file size, but it
16	isn't. The problem happened on those EOF pages, before writeback, those
17	pages had DIRTY flag set and all buffer_head in them also had DIRTY flag
18	set, when writeback run by write_cache_pages(), DIRTY flag on the page
19	was cleared, but DIRTY flag on the buffer_head not.
20
21	When next write happened to those EOF pages, since buffer_head already
22	had DIRTY flag set, it would not mark page DIRTY again. That made
23	writeback ignore them forever. That will cause data corruption. Even
24	directio write can't work because it will fail when trying to drop pages
25	caches before direct io, as it found the buffer_head for those pages
26	still had DIRTY flag set, then it will fall back to buffer io mode.
27
28	To make a summary of the issue, as writeback ingores EOF pages, once any
29	EOF page is generated, any write to it will only go to the page cache,
30	it will never be flushed to disk even file size extends and that page is
31	not EOF page any more. The fix is to avoid zero EOF blocks with buffer
32	write.
33
34	The following code snippet from qemu-img could trigger the corruption.
35
36	656 open("6b3711ae-3306-4bdd-823c-cf1c0060a095.conv.2", O_RDWR\|O_DIRECT\|O_CLOEXEC) = 11
37	...
38	660 fallocate(11, FALLOC_FL_KEEP_SIZE\|FALLOC_FL_PUNCH_HOLE, 2275868672, 327680 <unfinished ...>
39	660 fallocate(11, 0, 2275868672, 327680) = 0
40	658 pwrite64(11, "
41
42	Link: https://lkml.kernel.org/r/20210722054923.24389-2-junxiao.bi@oracle.com
43	Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
44	Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
45	Cc: Mark Fasheh <mark@fasheh.com>
46	Cc: Joel Becker <jlbec@evilplan.org>
47	Cc: Changwei Ge <gechangwei@live.cn>
48	Cc: Gang He <ghe@suse.com>
49	Cc: Jun Piao <piaojun@huawei.com>
50	Cc: <stable@vger.kernel.org>
51	Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
52	Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
53	Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
54	---
55	fs/ocfs2/file.c \| 99 +++++++++++++++++++++++++++++++++-----------------------
56	1 file changed, 60 insertions(+), 39 deletions(-)
57
58	--- a/fs/ocfs2/file.c
59	+++ b/fs/ocfs2/file.c
60	@@ -1529,6 +1529,45 @@ static void ocfs2_truncate_cluster_pages
61	}
62	}
63
64	+/*
65	+ * zero out partial blocks of one cluster.
66	+ *
67	+ * start: file offset where zero starts, will be made upper block aligned.
68	+ * len: it will be trimmed to the end of current cluster if "start + len"
69	+ * is bigger than it.
70	+ */
71	+static int ocfs2_zeroout_partial_cluster(struct inode *inode,
72	+ u64 start, u64 len)
73	+{
74	+ int ret;
75	+ u64 start_block, end_block, nr_blocks;
76	+ u64 p_block, offset;
77	+ u32 cluster, p_cluster, nr_clusters;
78	+ struct super_block *sb = inode->i_sb;
79	+ u64 end = ocfs2_align_bytes_to_clusters(sb, start);
80	+
81	+ if (start + len < end)
82	+ end = start + len;
83	+
84	+ start_block = ocfs2_blocks_for_bytes(sb, start);
85	+ end_block = ocfs2_blocks_for_bytes(sb, end);
86	+ nr_blocks = end_block - start_block;
87	+ if (!nr_blocks)
88	+ return 0;
89	+
90	+ cluster = ocfs2_bytes_to_clusters(sb, start);
91	+ ret = ocfs2_get_clusters(inode, cluster, &p_cluster,
92	+ &nr_clusters, NULL);
93	+ if (ret)
94	+ return ret;
95	+ if (!p_cluster)
96	+ return 0;
97	+
98	+ offset = start_block - ocfs2_clusters_to_blocks(sb, cluster);
99	+ p_block = ocfs2_clusters_to_blocks(sb, p_cluster) + offset;
100	+ return sb_issue_zeroout(sb, p_block, nr_blocks, GFP_NOFS);
101	+}
102	+
103	static int ocfs2_zero_partial_clusters(struct inode *inode,
104	u64 start, u64 len)
105	{
106	@@ -1538,6 +1577,7 @@ static int ocfs2_zero_partial_clusters(s
107	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
108	unsigned int csize = osb->s_clustersize;
109	handle_t *handle;
110	+ loff_t isize = i_size_read(inode);
111
112	/*
113	* The "start" and "end" values are NOT necessarily part of
114	@@ -1558,6 +1598,26 @@ static int ocfs2_zero_partial_clusters(s
115	if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0)
116	goto out;
117
118	+ /* No page cache for EOF blocks, issue zero out to disk. */
119	+ if (end > isize) {
120	+ /*
121	+ * zeroout eof blocks in last cluster starting from
122	+ * "isize" even "start" > "isize" because it is
123	+ * complicated to zeroout just at "start" as "start"
124	+ * may be not aligned with block size, buffer write
125	+ * would be required to do that, but out of eof buffer
126	+ * write is not supported.
127	+ */
128	+ ret = ocfs2_zeroout_partial_cluster(inode, isize,
129	+ end - isize);
130	+ if (ret) {
131	+ mlog_errno(ret);
132	+ goto out;
133	+ }
134	+ if (start >= isize)
135	+ goto out;
136	+ end = isize;
137	+ }
138	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
139	if (IS_ERR(handle)) {
140	ret = PTR_ERR(handle);
141	@@ -1856,45 +1916,6 @@ out:
142	}
143
144	/*
145	- * zero out partial blocks of one cluster.
146	- *
147	- * start: file offset where zero starts, will be made upper block aligned.
148	- * len: it will be trimmed to the end of current cluster if "start + len"
149	- * is bigger than it.
150	- */
151	-static int ocfs2_zeroout_partial_cluster(struct inode *inode,
152	- u64 start, u64 len)
153	-{
154	- int ret;
155	- u64 start_block, end_block, nr_blocks;
156	- u64 p_block, offset;
157	- u32 cluster, p_cluster, nr_clusters;
158	- struct super_block *sb = inode->i_sb;
159	- u64 end = ocfs2_align_bytes_to_clusters(sb, start);
160	-
161	- if (start + len < end)
162	- end = start + len;
163	-
164	- start_block = ocfs2_blocks_for_bytes(sb, start);
165	- end_block = ocfs2_blocks_for_bytes(sb, end);
166	- nr_blocks = end_block - start_block;
167	- if (!nr_blocks)
168	- return 0;
169	-
170	- cluster = ocfs2_bytes_to_clusters(sb, start);
171	- ret = ocfs2_get_clusters(inode, cluster, &p_cluster,
172	- &nr_clusters, NULL);
173	- if (ret)
174	- return ret;
175	- if (!p_cluster)
176	- return 0;
177	-
178	- offset = start_block - ocfs2_clusters_to_blocks(sb, cluster);
179	- p_block = ocfs2_clusters_to_blocks(sb, p_cluster) + offset;
180	- return sb_issue_zeroout(sb, p_block, nr_blocks, GFP_NOFS);
181	-}
182	-
183	-/*
184	* Parts of this function taken from xfs_change_file_space()
185	*/
186	static int __ocfs2_change_file_space(struct file file, struct inode inode,