Line data Source code
1 : /*
2 : * Copyright (C) 2011, 2012 STRATO. All rights reserved.
3 : *
4 : * This program is free software; you can redistribute it and/or
5 : * modify it under the terms of the GNU General Public
6 : * License v2 as published by the Free Software Foundation.
7 : *
8 : * This program is distributed in the hope that it will be useful,
9 : * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 : * General Public License for more details.
12 : *
13 : * You should have received a copy of the GNU General Public
14 : * License along with this program; if not, write to the
15 : * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 : * Boston, MA 02111-1307, USA.
17 : */
18 :
19 : #include <linux/blkdev.h>
20 : #include <linux/ratelimit.h>
21 : #include "ctree.h"
22 : #include "volumes.h"
23 : #include "disk-io.h"
24 : #include "ordered-data.h"
25 : #include "transaction.h"
26 : #include "backref.h"
27 : #include "extent_io.h"
28 : #include "dev-replace.h"
29 : #include "check-integrity.h"
30 : #include "rcu-string.h"
31 : #include "raid56.h"
32 :
33 : /*
34 : * This is only the first step towards a full-featured scrub. It reads all
35 : * extents and super blocks and verifies their checksums. In case a bad checksum
36 : * is found or an extent cannot be read, good data will be written back if
37 : * any can be found.
38 : *
39 : * Future enhancements:
40 : * - In case an unrepairable extent is encountered, track which files are
41 : * affected and report them
42 : * - track and record media errors, throw out bad devices
43 : * - add a mode to also read unallocated space
44 : */
45 :
46 : struct scrub_block;
47 : struct scrub_ctx;
48 :
49 : /*
50 : * the following three values only influence the performance.
51 : * The last one configures the number of parallel and outstanding I/O
52 : * operations. The first two values configure an upper limit for the number
53 : * of (dynamically allocated) pages that are added to a bio.
54 : */
55 : #define SCRUB_PAGES_PER_RD_BIO 32 /* 128k per bio */
56 : #define SCRUB_PAGES_PER_WR_BIO 32 /* 128k per bio */
57 : #define SCRUB_BIOS_PER_SCTX 64 /* 8MB per device in flight */
58 :
59 : /*
60 : * the following value times PAGE_SIZE needs to be large enough to match the
61 : * largest node/leaf/sector size that shall be supported.
62 : * Values larger than BTRFS_STRIPE_LEN are not supported.
63 : */
64 : #define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */
65 :
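/*
 * A sizing sketch for the limits above, assuming 4 KiB pages
 * (PAGE_SIZE == 4096): 32 pages per bio give the 128k per bio noted
 * above, 64 bios per scrub context give 8MB in flight per device, and
 * 16 pages per block give the 64k maximum node/leaf/sector size.
 */
#include <assert.h>

#define ASSUMED_PAGE_SIZE 4096

static_assert(32 * ASSUMED_PAGE_SIZE == 128 * 1024,
	      "one read/write bio carries 128 KiB");
static_assert(64 * 32 * ASSUMED_PAGE_SIZE == 8 * 1024 * 1024,
	      "one scrub context keeps up to 8 MiB in flight per device");
static_assert(16 * ASSUMED_PAGE_SIZE == 64 * 1024,
	      "largest supported node/leaf/sector is 64 KiB");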
66 : struct scrub_page {
67 : struct scrub_block *sblock;
68 : struct page *page;
69 : struct btrfs_device *dev;
70 : u64 flags; /* extent flags */
71 : u64 generation;
72 : u64 logical;
73 : u64 physical;
74 : u64 physical_for_dev_replace;
75 : atomic_t ref_count;
76 : struct {
77 : unsigned int mirror_num:8;
78 : unsigned int have_csum:1;
79 : unsigned int io_error:1;
80 : };
81 : u8 csum[BTRFS_CSUM_SIZE];
82 : };
83 :
84 : struct scrub_bio {
85 : int index;
86 : struct scrub_ctx *sctx;
87 : struct btrfs_device *dev;
88 : struct bio *bio;
89 : int err;
90 : u64 logical;
91 : u64 physical;
92 : #if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
93 : struct scrub_page *pagev[SCRUB_PAGES_PER_WR_BIO];
94 : #else
95 : struct scrub_page *pagev[SCRUB_PAGES_PER_RD_BIO];
96 : #endif
97 : int page_count;
98 : int next_free;
99 : struct btrfs_work work;
100 : };
101 :
102 : struct scrub_block {
103 : struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
104 : int page_count;
105 : atomic_t outstanding_pages;
106 : atomic_t ref_count; /* free mem on transition to zero */
107 : struct scrub_ctx *sctx;
108 : struct {
109 : unsigned int header_error:1;
110 : unsigned int checksum_error:1;
111 : unsigned int no_io_error_seen:1;
112 : unsigned int generation_error:1; /* also sets header_error */
113 : };
114 : };
115 :
116 : struct scrub_wr_ctx {
117 : struct scrub_bio *wr_curr_bio;
118 : struct btrfs_device *tgtdev;
119 : int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
120 : atomic_t flush_all_writes;
121 : struct mutex wr_lock;
122 : };
123 :
124 : struct scrub_ctx {
125 : struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX];
126 : struct btrfs_root *dev_root;
127 : int first_free;
128 : int curr;
129 : atomic_t bios_in_flight;
130 : atomic_t workers_pending;
131 : spinlock_t list_lock;
132 : wait_queue_head_t list_wait;
133 : u16 csum_size;
134 : struct list_head csum_list;
135 : atomic_t cancel_req;
136 : int readonly;
137 : int pages_per_rd_bio;
138 : u32 sectorsize;
139 : u32 nodesize;
140 : u32 leafsize;
141 :
142 : int is_dev_replace;
143 : struct scrub_wr_ctx wr_ctx;
144 :
145 : /*
146 : * statistics
147 : */
148 : struct btrfs_scrub_progress stat;
149 : spinlock_t stat_lock;
150 : };
151 :
152 : struct scrub_fixup_nodatasum {
153 : struct scrub_ctx *sctx;
154 : struct btrfs_device *dev;
155 : u64 logical;
156 : struct btrfs_root *root;
157 : struct btrfs_work work;
158 : int mirror_num;
159 : };
160 :
161 : struct scrub_nocow_inode {
162 : u64 inum;
163 : u64 offset;
164 : u64 root;
165 : struct list_head list;
166 : };
167 :
168 : struct scrub_copy_nocow_ctx {
169 : struct scrub_ctx *sctx;
170 : u64 logical;
171 : u64 len;
172 : int mirror_num;
173 : u64 physical_for_dev_replace;
174 : struct list_head inodes;
175 : struct btrfs_work work;
176 : };
177 :
178 : struct scrub_warning {
179 : struct btrfs_path *path;
180 : u64 extent_item_size;
181 : char *scratch_buf;
182 : char *msg_buf;
183 : const char *errstr;
184 : sector_t sector;
185 : u64 logical;
186 : struct btrfs_device *dev;
187 : int msg_bufsize;
188 : int scratch_bufsize;
189 : };
190 :
191 :
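/*
 * Rough ownership picture implied by the structures above: a scrub_block
 * groups up to SCRUB_MAX_PAGES_PER_BLOCK scrub_pages, a scrub_bio batches
 * page pointers for a single device, and both scrub_page and scrub_block
 * are reference counted and freed on the transition to zero. A minimal
 * userspace sketch of that get/put pattern, using C11 atomics:
 */
#include <stdatomic.h>
#include <stdlib.h>

struct refcounted {
	atomic_int ref_count;
	/* payload would live here */
};

static void object_get(struct refcounted *obj)
{
	atomic_fetch_add(&obj->ref_count, 1);
}

static void object_put(struct refcounted *obj)
{
	/* free the memory on the transition to zero, like scrub_block_put() */
	if (atomic_fetch_sub(&obj->ref_count, 1) == 1)
		free(obj);
}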
192 : static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
193 : static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
194 : static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
195 : static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
196 : static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
197 : static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
198 : struct btrfs_fs_info *fs_info,
199 : struct scrub_block *original_sblock,
200 : u64 length, u64 logical,
201 : struct scrub_block *sblocks_for_recheck);
202 : static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
203 : struct scrub_block *sblock, int is_metadata,
204 : int have_csum, u8 *csum, u64 generation,
205 : u16 csum_size);
206 : static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
207 : struct scrub_block *sblock,
208 : int is_metadata, int have_csum,
209 : const u8 *csum, u64 generation,
210 : u16 csum_size);
211 : static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
212 : struct scrub_block *sblock_good,
213 : int force_write);
214 : static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
215 : struct scrub_block *sblock_good,
216 : int page_num, int force_write);
217 : static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
218 : static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
219 : int page_num);
220 : static int scrub_checksum_data(struct scrub_block *sblock);
221 : static int scrub_checksum_tree_block(struct scrub_block *sblock);
222 : static int scrub_checksum_super(struct scrub_block *sblock);
223 : static void scrub_block_get(struct scrub_block *sblock);
224 : static void scrub_block_put(struct scrub_block *sblock);
225 : static void scrub_page_get(struct scrub_page *spage);
226 : static void scrub_page_put(struct scrub_page *spage);
227 : static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
228 : struct scrub_page *spage);
229 : static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
230 : u64 physical, struct btrfs_device *dev, u64 flags,
231 : u64 gen, int mirror_num, u8 *csum, int force,
232 : u64 physical_for_dev_replace);
233 : static void scrub_bio_end_io(struct bio *bio, int err);
234 : static void scrub_bio_end_io_worker(struct btrfs_work *work);
235 : static void scrub_block_complete(struct scrub_block *sblock);
236 : static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
237 : u64 extent_logical, u64 extent_len,
238 : u64 *extent_physical,
239 : struct btrfs_device **extent_dev,
240 : int *extent_mirror_num);
241 : static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
242 : struct scrub_wr_ctx *wr_ctx,
243 : struct btrfs_fs_info *fs_info,
244 : struct btrfs_device *dev,
245 : int is_dev_replace);
246 : static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx);
247 : static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
248 : struct scrub_page *spage);
249 : static void scrub_wr_submit(struct scrub_ctx *sctx);
250 : static void scrub_wr_bio_end_io(struct bio *bio, int err);
251 : static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
252 : static int write_page_nocow(struct scrub_ctx *sctx,
253 : u64 physical_for_dev_replace, struct page *page);
254 : static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
255 : struct scrub_copy_nocow_ctx *ctx);
256 : static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
257 : int mirror_num, u64 physical_for_dev_replace);
258 : static void copy_nocow_pages_worker(struct btrfs_work *work);
259 : static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
260 : static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
261 :
262 :
263 : static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
264 : {
265 178996 : atomic_inc(&sctx->bios_in_flight);
266 : }
267 :
268 178988 : static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
269 : {
270 178988 : atomic_dec(&sctx->bios_in_flight);
271 179001 : wake_up(&sctx->list_wait);
272 178985 : }
273 :
274 245 : static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
275 : {
276 492 : while (atomic_read(&fs_info->scrub_pause_req)) {
277 2 : mutex_unlock(&fs_info->scrub_lock);
278 8 : wait_event(fs_info->scrub_pause_wait,
279 : atomic_read(&fs_info->scrub_pause_req) == 0);
280 2 : mutex_lock(&fs_info->scrub_lock);
281 : }
282 245 : }
283 :
284 113 : static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
285 : {
286 113 : atomic_inc(&fs_info->scrubs_paused);
287 113 : wake_up(&fs_info->scrub_pause_wait);
288 :
289 113 : mutex_lock(&fs_info->scrub_lock);
290 113 : __scrub_blocked_if_needed(fs_info);
291 : atomic_dec(&fs_info->scrubs_paused);
292 113 : mutex_unlock(&fs_info->scrub_lock);
293 :
294 113 : wake_up(&fs_info->scrub_pause_wait);
295 113 : }
296 :
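/*
 * A userspace analogue of the pause handshake above, assuming POSIX
 * threads: a worker announces itself as paused, parks while a pause is
 * requested, and wakes the waiters again when it resumes, roughly like
 * scrub_blocked_if_needed(). Illustration only.
 */
#include <pthread.h>

struct pause_ctl {
	pthread_mutex_t lock;
	pthread_cond_t cond;
	int pause_req;	/* set by whoever needs the workers quiesced */
	int paused;	/* number of workers currently parked */
};

static void blocked_if_needed(struct pause_ctl *c)
{
	pthread_mutex_lock(&c->lock);
	c->paused++;
	pthread_cond_broadcast(&c->cond);	/* like wake_up(&scrub_pause_wait) */
	while (c->pause_req)
		pthread_cond_wait(&c->cond, &c->lock);
	c->paused--;
	pthread_cond_broadcast(&c->cond);
	pthread_mutex_unlock(&c->lock);
}

static void request_pause(struct pause_ctl *c, int nr_workers)
{
	pthread_mutex_lock(&c->lock);
	c->pause_req = 1;
	while (c->paused < nr_workers)		/* wait until every worker parked */
		pthread_cond_wait(&c->cond, &c->lock);
	pthread_mutex_unlock(&c->lock);
	/* clearing pause_req and broadcasting again would resume the workers */
}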
297 : /*
298 : * used for workers that require transaction commits (i.e., for the
299 : * NOCOW case)
300 : */
301 1104 : static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
302 : {
303 1104 : struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
304 :
305 : /*
306 : * increment scrubs_running to prevent cancel requests from
307 : * completing as long as a worker is running. we must also
308 : * increment scrubs_paused to prevent deadlocking on pause
309 : * requests used for transaction commits (as the worker uses a
310 : * transaction context). it is safe to regard the worker
311 : * as paused for all practical matters. effectively, we only
312 : * prevent cancellation requests from completing.
313 : */
314 1104 : mutex_lock(&fs_info->scrub_lock);
315 1104 : atomic_inc(&fs_info->scrubs_running);
316 1104 : atomic_inc(&fs_info->scrubs_paused);
317 1104 : mutex_unlock(&fs_info->scrub_lock);
318 :
319 : /*
320 : * checking the @scrubs_running == @scrubs_paused condition
321 : * inside wait_event() is not an atomic operation,
322 : * which means we may inc/dec @scrubs_running/@scrubs_paused
323 : * at any time. Wake up @scrub_pause_wait as often as we
324 : * can so that transaction commits are blocked as little as possible.
325 : */
326 1104 : wake_up(&fs_info->scrub_pause_wait);
327 :
328 1104 : atomic_inc(&sctx->workers_pending);
329 1104 : }
330 :
331 : /* used for workers that require transaction commits */
332 1104 : static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
333 : {
334 1104 : struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
335 :
336 : /*
337 : * see scrub_pending_trans_workers_inc() why we're pretending
338 : * to be paused in the scrub counters
339 : */
340 1104 : mutex_lock(&fs_info->scrub_lock);
341 1104 : atomic_dec(&fs_info->scrubs_running);
342 1104 : atomic_dec(&fs_info->scrubs_paused);
343 1104 : mutex_unlock(&fs_info->scrub_lock);
344 1104 : atomic_dec(&sctx->workers_pending);
345 1104 : wake_up(&fs_info->scrub_pause_wait);
346 1104 : wake_up(&sctx->list_wait);
347 1104 : }
348 :
349 290492 : static void scrub_free_csums(struct scrub_ctx *sctx)
350 : {
351 939356 : while (!list_empty(&sctx->csum_list)) {
352 : struct btrfs_ordered_sum *sum;
353 33940 : sum = list_first_entry(&sctx->csum_list,
354 : struct btrfs_ordered_sum, list);
355 33940 : list_del(&sum->list);
356 33940 : kfree(sum);
357 : }
358 290492 : }
359 :
360 19 : static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
361 : {
362 : int i;
363 :
364 19 : if (!sctx)
365 19 : return;
366 :
367 19 : scrub_free_wr_ctx(&sctx->wr_ctx);
368 :
369 : /* this can happen when scrub is cancelled */
370 19 : if (sctx->curr != -1) {
371 0 : struct scrub_bio *sbio = sctx->bios[sctx->curr];
372 :
373 0 : for (i = 0; i < sbio->page_count; i++) {
374 0 : WARN_ON(!sbio->pagev[i]->page);
375 0 : scrub_block_put(sbio->pagev[i]->sblock);
376 : }
377 0 : bio_put(sbio->bio);
378 : }
379 :
380 1216 : for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
381 1216 : struct scrub_bio *sbio = sctx->bios[i];
382 :
383 1216 : if (!sbio)
384 : break;
385 1216 : kfree(sbio);
386 : }
387 :
388 19 : scrub_free_csums(sctx);
389 19 : kfree(sctx);
390 : }
391 :
392 : static noinline_for_stack
393 19 : struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
394 : {
395 : struct scrub_ctx *sctx;
396 : int i;
397 19 : struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
398 : int pages_per_rd_bio;
399 : int ret;
400 :
401 : /*
402 : * the setting of pages_per_rd_bio is correct for scrub but might
403 : * be wrong for the dev_replace code where we might read from
404 : * different devices in the initial huge bios. However, that
405 : * code is able to correctly handle the case when adding a page
406 : * to a bio fails.
407 : */
408 19 : if (dev->bdev)
409 19 : pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO,
410 : bio_get_nr_vecs(dev->bdev));
411 : else
412 : pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
413 19 : sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
414 19 : if (!sctx)
415 : goto nomem;
416 19 : sctx->is_dev_replace = is_dev_replace;
417 19 : sctx->pages_per_rd_bio = pages_per_rd_bio;
418 19 : sctx->curr = -1;
419 19 : sctx->dev_root = dev->dev_root;
420 1216 : for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
421 : struct scrub_bio *sbio;
422 :
423 1216 : sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
424 1216 : if (!sbio)
425 : goto nomem;
426 1216 : sctx->bios[i] = sbio;
427 :
428 1216 : sbio->index = i;
429 1216 : sbio->sctx = sctx;
430 1216 : sbio->page_count = 0;
431 1216 : btrfs_init_work(&sbio->work, btrfs_scrub_helper,
432 : scrub_bio_end_io_worker, NULL, NULL);
433 :
434 1216 : if (i != SCRUB_BIOS_PER_SCTX - 1)
435 1197 : sctx->bios[i]->next_free = i + 1;
436 : else
437 19 : sctx->bios[i]->next_free = -1;
438 : }
439 19 : sctx->first_free = 0;
440 19 : sctx->nodesize = dev->dev_root->nodesize;
441 19 : sctx->leafsize = dev->dev_root->leafsize;
442 19 : sctx->sectorsize = dev->dev_root->sectorsize;
443 : atomic_set(&sctx->bios_in_flight, 0);
444 : atomic_set(&sctx->workers_pending, 0);
445 : atomic_set(&sctx->cancel_req, 0);
446 38 : sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
447 19 : INIT_LIST_HEAD(&sctx->csum_list);
448 :
449 19 : spin_lock_init(&sctx->list_lock);
450 19 : spin_lock_init(&sctx->stat_lock);
451 19 : init_waitqueue_head(&sctx->list_wait);
452 :
453 19 : ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info,
454 : fs_info->dev_replace.tgtdev, is_dev_replace);
455 19 : if (ret) {
456 0 : scrub_free_ctx(sctx);
457 0 : return ERR_PTR(ret);
458 : }
459 : return sctx;
460 :
461 : nomem:
462 0 : scrub_free_ctx(sctx);
463 : return ERR_PTR(-ENOMEM);
464 : }
465 :
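/*
 * The bios[] array above is threaded into a free list by index:
 * next_free chains the entries, -1 terminates the chain, and
 * first_free points at the head. A minimal standalone sketch of
 * that allocator, with a hypothetical fixed-size pool:
 */
#define POOL_SIZE 64

struct pool {
	int next_free[POOL_SIZE];
	int first_free;
};

static void pool_init(struct pool *p)
{
	for (int i = 0; i < POOL_SIZE; i++)
		p->next_free[i] = (i == POOL_SIZE - 1) ? -1 : i + 1;
	p->first_free = 0;
}

static int pool_alloc(struct pool *p)
{
	int idx = p->first_free;

	if (idx != -1)
		p->first_free = p->next_free[idx];
	return idx;			/* -1 means the pool is exhausted */
}

static void pool_free(struct pool *p, int idx)
{
	p->next_free[idx] = p->first_free;
	p->first_free = idx;
}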
466 0 : static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
467 : void *warn_ctx)
468 : {
469 : u64 isize;
470 : u32 nlink;
471 : int ret;
472 : int i;
473 : struct extent_buffer *eb;
474 : struct btrfs_inode_item *inode_item;
475 : struct scrub_warning *swarn = warn_ctx;
476 0 : struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
477 : struct inode_fs_paths *ipath = NULL;
478 : struct btrfs_root *local_root;
479 : struct btrfs_key root_key;
480 :
481 0 : root_key.objectid = root;
482 0 : root_key.type = BTRFS_ROOT_ITEM_KEY;
483 0 : root_key.offset = (u64)-1;
484 : local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
485 0 : if (IS_ERR(local_root)) {
486 0 : ret = PTR_ERR(local_root);
487 0 : goto err;
488 : }
489 :
490 0 : ret = inode_item_info(inum, 0, local_root, swarn->path);
491 0 : if (ret) {
492 0 : btrfs_release_path(swarn->path);
493 0 : goto err;
494 : }
495 :
496 0 : eb = swarn->path->nodes[0];
497 0 : inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
498 : struct btrfs_inode_item);
499 : isize = btrfs_inode_size(eb, inode_item);
500 : nlink = btrfs_inode_nlink(eb, inode_item);
501 0 : btrfs_release_path(swarn->path);
502 :
503 0 : ipath = init_ipath(4096, local_root, swarn->path);
504 0 : if (IS_ERR(ipath)) {
505 0 : ret = PTR_ERR(ipath);
506 : ipath = NULL;
507 0 : goto err;
508 : }
509 0 : ret = paths_from_inode(inum, ipath);
510 :
511 0 : if (ret < 0)
512 : goto err;
513 :
514 : /*
515 : * we deliberately ignore the fact that ipath might have been too small to
516 : * hold all of the paths here
517 : */
518 0 : for (i = 0; i < ipath->fspath->elem_cnt; ++i)
519 0 : printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
520 : "%s, sector %llu, root %llu, inode %llu, offset %llu, "
521 : "length %llu, links %u (path: %s)\n", swarn->errstr,
522 : swarn->logical, rcu_str_deref(swarn->dev->name),
523 : (unsigned long long)swarn->sector, root, inum, offset,
524 : min(isize - offset, (u64)PAGE_SIZE), nlink,
525 : (char *)(unsigned long)ipath->fspath->val[i]);
526 :
527 0 : free_ipath(ipath);
528 0 : return 0;
529 :
530 : err:
531 0 : printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
532 : "%s, sector %llu, root %llu, inode %llu, offset %llu: path "
533 : "resolving failed with ret=%d\n", swarn->errstr,
534 : swarn->logical, rcu_str_deref(swarn->dev->name),
535 : (unsigned long long)swarn->sector, root, inum, offset, ret);
536 :
537 0 : free_ipath(ipath);
538 0 : return 0;
539 : }
540 :
541 0 : static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
542 : {
543 : struct btrfs_device *dev;
544 : struct btrfs_fs_info *fs_info;
545 : struct btrfs_path *path;
546 : struct btrfs_key found_key;
547 : struct extent_buffer *eb;
548 : struct btrfs_extent_item *ei;
549 : struct scrub_warning swarn;
550 0 : unsigned long ptr = 0;
551 : u64 extent_item_pos;
552 0 : u64 flags = 0;
553 : u64 ref_root;
554 : u32 item_size;
555 : u8 ref_level;
556 : const int bufsize = 4096;
557 : int ret;
558 :
559 0 : WARN_ON(sblock->page_count < 1);
560 0 : dev = sblock->pagev[0]->dev;
561 0 : fs_info = sblock->sctx->dev_root->fs_info;
562 :
563 0 : path = btrfs_alloc_path();
564 :
565 0 : swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
566 0 : swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
567 0 : swarn.sector = (sblock->pagev[0]->physical) >> 9;
568 0 : swarn.logical = sblock->pagev[0]->logical;
569 0 : swarn.errstr = errstr;
570 0 : swarn.dev = NULL;
571 0 : swarn.msg_bufsize = bufsize;
572 0 : swarn.scratch_bufsize = bufsize;
573 :
574 0 : if (!path || !swarn.scratch_buf || !swarn.msg_buf)
575 : goto out;
576 :
577 0 : ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
578 : &flags);
579 0 : if (ret < 0)
580 : goto out;
581 :
582 0 : extent_item_pos = swarn.logical - found_key.objectid;
583 0 : swarn.extent_item_size = found_key.offset;
584 :
585 0 : eb = path->nodes[0];
586 0 : ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
587 0 : item_size = btrfs_item_size_nr(eb, path->slots[0]);
588 :
589 0 : if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
590 : do {
591 0 : ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
592 : item_size, &ref_root,
593 : &ref_level);
594 0 : printk_in_rcu(KERN_WARNING
595 : "BTRFS: %s at logical %llu on dev %s, "
596 : "sector %llu: metadata %s (level %d) in tree "
597 : "%llu\n", errstr, swarn.logical,
598 : rcu_str_deref(dev->name),
599 : (unsigned long long)swarn.sector,
600 : ref_level ? "node" : "leaf",
601 : ret < 0 ? -1 : ref_level,
602 : ret < 0 ? -1 : ref_root);
603 0 : } while (ret != 1);
604 0 : btrfs_release_path(path);
605 : } else {
606 0 : btrfs_release_path(path);
607 0 : swarn.path = path;
608 0 : swarn.dev = dev;
609 0 : iterate_extent_inodes(fs_info, found_key.objectid,
610 : extent_item_pos, 1,
611 : scrub_print_warning_inode, &swarn);
612 : }
613 :
614 : out:
615 0 : btrfs_free_path(path);
616 0 : kfree(swarn.scratch_buf);
617 0 : kfree(swarn.msg_buf);
618 0 : }
619 :
620 0 : static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
621 : {
622 : struct page *page = NULL;
623 : unsigned long index;
624 : struct scrub_fixup_nodatasum *fixup = fixup_ctx;
625 : int ret;
626 : int corrected = 0;
627 : struct btrfs_key key;
628 : struct inode *inode = NULL;
629 : struct btrfs_fs_info *fs_info;
630 0 : u64 end = offset + PAGE_SIZE - 1;
631 : struct btrfs_root *local_root;
632 : int srcu_index;
633 :
634 0 : key.objectid = root;
635 0 : key.type = BTRFS_ROOT_ITEM_KEY;
636 0 : key.offset = (u64)-1;
637 :
638 0 : fs_info = fixup->root->fs_info;
639 0 : srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
640 :
641 : local_root = btrfs_read_fs_root_no_name(fs_info, &key);
642 0 : if (IS_ERR(local_root)) {
643 : srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
644 0 : return PTR_ERR(local_root);
645 : }
646 :
647 0 : key.type = BTRFS_INODE_ITEM_KEY;
648 0 : key.objectid = inum;
649 0 : key.offset = 0;
650 0 : inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
651 : srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
652 0 : if (IS_ERR(inode))
653 0 : return PTR_ERR(inode);
654 :
655 0 : index = offset >> PAGE_CACHE_SHIFT;
656 :
657 0 : page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
658 0 : if (!page) {
659 : ret = -ENOMEM;
660 : goto out;
661 : }
662 :
663 0 : if (PageUptodate(page)) {
664 0 : if (PageDirty(page)) {
665 : /*
666 : * we need to write the data to the defective sector. the
667 : * data that was in that sector is not in memory,
668 : * because the page was modified. we must not write the
669 : * modified page to that sector.
670 : *
671 : * TODO: what could be done here: wait for the delalloc
672 : * runner to write out that page (might involve
673 : * COW) and see whether the sector is still
674 : * referenced afterwards.
675 : *
676 : * For the time being, we'll treat this error as
677 : * uncorrectable, although there is a chance that a
678 : * later scrub will find the bad sector again and that
679 : * there will be no dirty page in memory then.
680 : */
681 : ret = -EIO;
682 : goto out;
683 : }
684 0 : fs_info = BTRFS_I(inode)->root->fs_info;
685 0 : ret = repair_io_failure(fs_info, offset, PAGE_SIZE,
686 : fixup->logical, page,
687 : fixup->mirror_num);
688 0 : unlock_page(page);
689 0 : corrected = !ret;
690 : } else {
691 : /*
692 : * we need to get good data first. the general readpage path
693 : * will call repair_io_failure for us, we just have to make
694 : * sure we read the bad mirror.
695 : */
696 0 : ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
697 : EXTENT_DAMAGED, GFP_NOFS);
698 0 : if (ret) {
699 : /* set_extent_bits should give proper error */
700 0 : WARN_ON(ret > 0);
701 0 : if (ret > 0)
702 : ret = -EFAULT;
703 : goto out;
704 : }
705 :
706 0 : ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
707 : btrfs_get_extent,
708 : fixup->mirror_num);
709 : wait_on_page_locked(page);
710 :
711 0 : corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
712 : end, EXTENT_DAMAGED, 0, NULL);
713 0 : if (!corrected)
714 0 : clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
715 : EXTENT_DAMAGED, GFP_NOFS);
716 : }
717 :
718 : out:
719 0 : if (page)
720 0 : put_page(page);
721 :
722 0 : iput(inode);
723 :
724 0 : if (ret < 0)
725 : return ret;
726 :
727 0 : if (ret == 0 && corrected) {
728 : /*
729 : * we only need to call readpage for one of the inodes belonging
730 : * to this extent. so make iterate_extent_inodes stop
731 : */
732 : return 1;
733 : }
734 :
735 0 : return -EIO;
736 : }
737 :
738 0 : static void scrub_fixup_nodatasum(struct btrfs_work *work)
739 : {
740 : int ret;
741 : struct scrub_fixup_nodatasum *fixup;
742 : struct scrub_ctx *sctx;
743 : struct btrfs_trans_handle *trans = NULL;
744 : struct btrfs_path *path;
745 : int uncorrectable = 0;
746 :
747 0 : fixup = container_of(work, struct scrub_fixup_nodatasum, work);
748 0 : sctx = fixup->sctx;
749 :
750 0 : path = btrfs_alloc_path();
751 0 : if (!path) {
752 : spin_lock(&sctx->stat_lock);
753 0 : ++sctx->stat.malloc_errors;
754 : spin_unlock(&sctx->stat_lock);
755 : uncorrectable = 1;
756 0 : goto out;
757 : }
758 :
759 0 : trans = btrfs_join_transaction(fixup->root);
760 0 : if (IS_ERR(trans)) {
761 : uncorrectable = 1;
762 : goto out;
763 : }
764 :
765 : /*
766 : * the idea is to trigger a regular read through the standard path. we
767 : * read a page from the (failed) logical address by specifying the
768 : * corresponding copynum of the failed sector. thus, that readpage is
769 : * expected to fail.
770 : * that is the point where on-the-fly error correction will kick in
771 : * (once it's finished) and rewrite the failed sector if a good copy
772 : * can be found.
773 : */
774 0 : ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
775 : path, scrub_fixup_readpage,
776 : fixup);
777 0 : if (ret < 0) {
778 : uncorrectable = 1;
779 : goto out;
780 : }
781 0 : WARN_ON(ret != 1);
782 :
783 : spin_lock(&sctx->stat_lock);
784 0 : ++sctx->stat.corrected_errors;
785 : spin_unlock(&sctx->stat_lock);
786 :
787 : out:
788 0 : if (trans && !IS_ERR(trans))
789 0 : btrfs_end_transaction(trans, fixup->root);
790 0 : if (uncorrectable) {
791 : spin_lock(&sctx->stat_lock);
792 0 : ++sctx->stat.uncorrectable_errors;
793 : spin_unlock(&sctx->stat_lock);
794 0 : btrfs_dev_replace_stats_inc(
795 0 : &sctx->dev_root->fs_info->dev_replace.
796 : num_uncorrectable_read_errors);
797 0 : printk_ratelimited_in_rcu(KERN_ERR "BTRFS: "
798 : "unable to fixup (nodatasum) error at logical %llu on dev %s\n",
799 : fixup->logical, rcu_str_deref(fixup->dev->name));
800 : }
801 :
802 0 : btrfs_free_path(path);
803 0 : kfree(fixup);
804 :
805 0 : scrub_pending_trans_workers_dec(sctx);
806 0 : }
807 :
808 : /*
809 : * scrub_handle_errored_block gets called when either verification of the
810 : * pages failed or the bio failed to read, e.g. with EIO. In the latter
811 : * case, this function handles all pages in the bio, even though only one
812 : * may be bad.
813 : * The goal of this function is to repair the errored block by using the
814 : * contents of one of the mirrors.
815 : */
816 0 : static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
817 : {
818 0 : struct scrub_ctx *sctx = sblock_to_check->sctx;
819 : struct btrfs_device *dev;
820 : struct btrfs_fs_info *fs_info;
821 : u64 length;
822 : u64 logical;
823 : u64 generation;
824 : unsigned int failed_mirror_index;
825 : unsigned int is_metadata;
826 : unsigned int have_csum;
827 : u8 *csum;
828 : struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
829 : struct scrub_block *sblock_bad;
830 : int ret;
831 : int mirror_index;
832 : int page_num;
833 : int success;
834 : static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
835 : DEFAULT_RATELIMIT_BURST);
836 :
837 0 : BUG_ON(sblock_to_check->page_count < 1);
838 0 : fs_info = sctx->dev_root->fs_info;
839 0 : if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
840 : /*
841 : * if we find an error in a super block, we just report it.
842 : * It will be rewritten with the next transaction commit
843 : * anyway
844 : */
845 : spin_lock(&sctx->stat_lock);
846 0 : ++sctx->stat.super_errors;
847 : spin_unlock(&sctx->stat_lock);
848 0 : return 0;
849 : }
850 0 : length = sblock_to_check->page_count * PAGE_SIZE;
851 0 : logical = sblock_to_check->pagev[0]->logical;
852 0 : generation = sblock_to_check->pagev[0]->generation;
853 0 : BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
854 0 : failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
855 0 : is_metadata = !(sblock_to_check->pagev[0]->flags &
856 : BTRFS_EXTENT_FLAG_DATA);
857 0 : have_csum = sblock_to_check->pagev[0]->have_csum;
858 0 : csum = sblock_to_check->pagev[0]->csum;
859 0 : dev = sblock_to_check->pagev[0]->dev;
860 :
861 0 : if (sctx->is_dev_replace && !is_metadata && !have_csum) {
862 : sblocks_for_recheck = NULL;
863 : goto nodatasum_case;
864 : }
865 :
866 : /*
867 : * read all mirrors one after the other. This includes
868 : * re-reading the extent or metadata block that failed (that was
869 : * the cause that this fixup code is called) another time,
870 : * page by page this time in order to know which pages
871 : * caused I/O errors and which ones are good (for all mirrors).
872 : * The goal is to handle the situation when more than one
873 : * mirror contains I/O errors, but the errors do not
874 : * overlap, i.e. the data can be repaired by selecting the
875 : * pages from those mirrors without I/O error on the
876 : * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
877 : * would be that mirror #1 has an I/O error on the first page,
878 : * the second page is good, and mirror #2 has an I/O error on
879 : * the second page, but the first page is good.
880 : * Then the first page of the first mirror can be repaired by
881 : * taking the first page of the second mirror, and the
882 : * second page of the second mirror can be repaired by
883 : * copying the contents of the 2nd page of the 1st mirror.
884 : * One more note: if the pages of one mirror contain I/O
885 : * errors, the checksum cannot be verified. In order to get
886 : * the best data for repairing, the first attempt is to find
887 : * a mirror without I/O errors and with a validated checksum.
888 : * Only if this is not possible, the pages are picked from
889 : * mirrors with I/O errors without considering the checksum.
890 : * If the latter is the case, at the end, the checksum of the
891 : * repaired area is verified in order to correctly maintain
892 : * the statistics.
893 : */
894 :
895 0 : sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS *
896 : sizeof(*sblocks_for_recheck),
897 : GFP_NOFS);
898 0 : if (!sblocks_for_recheck) {
899 : spin_lock(&sctx->stat_lock);
900 0 : sctx->stat.malloc_errors++;
901 0 : sctx->stat.read_errors++;
902 0 : sctx->stat.uncorrectable_errors++;
903 : spin_unlock(&sctx->stat_lock);
904 0 : btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
905 0 : goto out;
906 : }
907 :
908 : /* setup the context, map the logical blocks and alloc the pages */
909 0 : ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length,
910 : logical, sblocks_for_recheck);
911 0 : if (ret) {
912 : spin_lock(&sctx->stat_lock);
913 0 : sctx->stat.read_errors++;
914 0 : sctx->stat.uncorrectable_errors++;
915 : spin_unlock(&sctx->stat_lock);
916 0 : btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
917 0 : goto out;
918 : }
919 0 : BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
920 0 : sblock_bad = sblocks_for_recheck + failed_mirror_index;
921 :
922 : /* build and submit the bios for the failed mirror, check checksums */
923 0 : scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
924 0 : csum, generation, sctx->csum_size);
925 :
926 0 : if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
927 : sblock_bad->no_io_error_seen) {
928 : /*
929 : * the error disappeared after reading page by page, or
930 : * the area was part of a huge bio and other parts of the
931 : * bio caused I/O errors, or the block layer merged several
932 : * read requests into one and the error is caused by a
933 : * different bio (usually one of the two latter cases is
934 : * the cause)
935 : */
936 : spin_lock(&sctx->stat_lock);
937 0 : sctx->stat.unverified_errors++;
938 : spin_unlock(&sctx->stat_lock);
939 :
940 0 : if (sctx->is_dev_replace)
941 0 : scrub_write_block_to_dev_replace(sblock_bad);
942 : goto out;
943 : }
944 :
945 0 : if (!sblock_bad->no_io_error_seen) {
946 : spin_lock(&sctx->stat_lock);
947 0 : sctx->stat.read_errors++;
948 : spin_unlock(&sctx->stat_lock);
949 0 : if (__ratelimit(&_rs))
950 0 : scrub_print_warning("i/o error", sblock_to_check);
951 0 : btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
952 0 : } else if (sblock_bad->checksum_error) {
953 : spin_lock(&sctx->stat_lock);
954 0 : sctx->stat.csum_errors++;
955 : spin_unlock(&sctx->stat_lock);
956 0 : if (__ratelimit(&_rs))
957 0 : scrub_print_warning("checksum error", sblock_to_check);
958 0 : btrfs_dev_stat_inc_and_print(dev,
959 : BTRFS_DEV_STAT_CORRUPTION_ERRS);
960 0 : } else if (sblock_bad->header_error) {
961 : spin_lock(&sctx->stat_lock);
962 0 : sctx->stat.verify_errors++;
963 : spin_unlock(&sctx->stat_lock);
964 0 : if (__ratelimit(&_rs))
965 0 : scrub_print_warning("checksum/header error",
966 : sblock_to_check);
967 0 : if (sblock_bad->generation_error)
968 0 : btrfs_dev_stat_inc_and_print(dev,
969 : BTRFS_DEV_STAT_GENERATION_ERRS);
970 : else
971 0 : btrfs_dev_stat_inc_and_print(dev,
972 : BTRFS_DEV_STAT_CORRUPTION_ERRS);
973 : }
974 :
975 0 : if (sctx->readonly) {
976 : ASSERT(!sctx->is_dev_replace);
977 : goto out;
978 : }
979 :
980 0 : if (!is_metadata && !have_csum) {
981 : struct scrub_fixup_nodatasum *fixup_nodatasum;
982 :
983 : nodatasum_case:
984 0 : WARN_ON(sctx->is_dev_replace);
985 :
986 : /*
987 : * !is_metadata and !have_csum, this means that the data
988 : * might not be COW'ed, that it might be modified
989 : * concurrently. The general strategy to work on the
990 : * commit root does not help in the case when COW is not
991 : * used.
992 : */
993 0 : fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
994 0 : if (!fixup_nodatasum)
995 : goto did_not_correct_error;
996 0 : fixup_nodatasum->sctx = sctx;
997 0 : fixup_nodatasum->dev = dev;
998 0 : fixup_nodatasum->logical = logical;
999 0 : fixup_nodatasum->root = fs_info->extent_root;
1000 0 : fixup_nodatasum->mirror_num = failed_mirror_index + 1;
1001 0 : scrub_pending_trans_workers_inc(sctx);
1002 0 : btrfs_init_work(&fixup_nodatasum->work, btrfs_scrub_helper,
1003 : scrub_fixup_nodatasum, NULL, NULL);
1004 0 : btrfs_queue_work(fs_info->scrub_workers,
1005 : &fixup_nodatasum->work);
1006 0 : goto out;
1007 : }
1008 :
1009 : /*
1010 : * now build and submit the bios for the other mirrors, check
1011 : * checksums.
1012 : * First try to pick the mirror which is completely without I/O
1013 : * errors and also does not have a checksum error.
1014 : * If one is found, and if a checksum is present, the full block
1015 : * that is known to contain an error is rewritten. Afterwards
1016 : * the block is known to be corrected.
1017 : * If a mirror is found which is completely correct, and no
1018 : * checksum is present, only those pages are rewritten that had
1019 : * an I/O error in the block to be repaired, since it cannot be
1020 : * determined, which copy of the other pages is better (and it
1021 : * could happen otherwise that a correct page would be
1022 : * overwritten by a bad one).
1023 : */
1024 0 : for (mirror_index = 0;
1025 0 : mirror_index < BTRFS_MAX_MIRRORS &&
1026 0 : sblocks_for_recheck[mirror_index].page_count > 0;
1027 0 : mirror_index++) {
1028 : struct scrub_block *sblock_other;
1029 :
1030 0 : if (mirror_index == failed_mirror_index)
1031 0 : continue;
1032 : sblock_other = sblocks_for_recheck + mirror_index;
1033 :
1034 : /* build and submit the bios, check checksums */
1035 0 : scrub_recheck_block(fs_info, sblock_other, is_metadata,
1036 : have_csum, csum, generation,
1037 0 : sctx->csum_size);
1038 :
1039 0 : if (!sblock_other->header_error &&
1040 0 : !sblock_other->checksum_error &&
1041 : sblock_other->no_io_error_seen) {
1042 0 : if (sctx->is_dev_replace) {
1043 0 : scrub_write_block_to_dev_replace(sblock_other);
1044 : } else {
1045 0 : int force_write = is_metadata || have_csum;
1046 :
1047 0 : ret = scrub_repair_block_from_good_copy(
1048 : sblock_bad, sblock_other,
1049 : force_write);
1050 : }
1051 0 : if (0 == ret)
1052 : goto corrected_error;
1053 : }
1054 : }
1055 :
1056 : /*
1057 : * for dev_replace, pick good pages and write to the target device.
1058 : */
1059 0 : if (sctx->is_dev_replace) {
1060 : success = 1;
1061 0 : for (page_num = 0; page_num < sblock_bad->page_count;
1062 0 : page_num++) {
1063 : int sub_success;
1064 :
1065 : sub_success = 0;
1066 0 : for (mirror_index = 0;
1067 0 : mirror_index < BTRFS_MAX_MIRRORS &&
1068 0 : sblocks_for_recheck[mirror_index].page_count > 0;
1069 0 : mirror_index++) {
1070 : struct scrub_block *sblock_other =
1071 : sblocks_for_recheck + mirror_index;
1072 0 : struct scrub_page *page_other =
1073 : sblock_other->pagev[page_num];
1074 :
1075 0 : if (!page_other->io_error) {
1076 0 : ret = scrub_write_page_to_dev_replace(
1077 : sblock_other, page_num);
1078 0 : if (ret == 0) {
1079 : /* succeeded for this page */
1080 : sub_success = 1;
1081 : break;
1082 : } else {
1083 0 : btrfs_dev_replace_stats_inc(
1084 0 : &sctx->dev_root->
1085 : fs_info->dev_replace.
1086 : num_write_errors);
1087 : }
1088 : }
1089 : }
1090 :
1091 0 : if (!sub_success) {
1092 : /*
1093 : * did not find a mirror to fetch the page
1094 : * from. scrub_write_page_to_dev_replace()
1095 : * handles this case (page->io_error), by
1096 : * filling the block with zeros before
1097 : * submitting the write request
1098 : */
1099 : success = 0;
1100 0 : ret = scrub_write_page_to_dev_replace(
1101 : sblock_bad, page_num);
1102 0 : if (ret)
1103 0 : btrfs_dev_replace_stats_inc(
1104 0 : &sctx->dev_root->fs_info->
1105 : dev_replace.num_write_errors);
1106 : }
1107 : }
1108 :
1109 : goto out;
1110 : }
1111 :
1112 : /*
1113 : * for regular scrub, repair those pages that are errored.
1114 : * In case of I/O errors in the area that is supposed to be
1115 : * repaired, continue by picking good copies of those pages.
1116 : * Select the good pages from mirrors to rewrite bad pages from
1117 : * the area to fix. Afterwards verify the checksum of the block
1118 : * that is supposed to be repaired. This verification step is
1119 : * only done for the purpose of statistic counting and for the
1120 : * final scrub report, whether errors remain.
1121 : * A perfect algorithm could make use of the checksum and try
1122 : * all possible combinations of pages from the different mirrors
1123 : * until the checksum verification succeeds. For example, when
1124 : * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
1125 : * of mirror #2 is readable but the final checksum test fails,
1126 : * then the 2nd page of mirror #3 could be tried, to see whether
1127 : * the final checksum now succeeds. But this would be a rare
1128 : * exception and is therefore not implemented. At least it is
1129 : * ensured that the good copy is not overwritten.
1130 : * A more useful improvement would be to pick the sectors
1131 : * without I/O error based on sector sizes (512 bytes on legacy
1132 : * disks) instead of on PAGE_SIZE. Then maybe 512 bytes of one
1133 : * mirror could be repaired by taking 512 bytes of a different
1134 : * mirror, even if other 512-byte sectors in the same PAGE_SIZE
1135 : * area are unreadable.
1136 : */
1137 :
1138 : /* can only fix I/O errors from here on */
1139 0 : if (sblock_bad->no_io_error_seen)
1140 : goto did_not_correct_error;
1141 :
1142 : success = 1;
1143 0 : for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
1144 0 : struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1145 :
1146 0 : if (!page_bad->io_error)
1147 0 : continue;
1148 :
1149 0 : for (mirror_index = 0;
1150 0 : mirror_index < BTRFS_MAX_MIRRORS &&
1151 0 : sblocks_for_recheck[mirror_index].page_count > 0;
1152 0 : mirror_index++) {
1153 : struct scrub_block *sblock_other = sblocks_for_recheck +
1154 : mirror_index;
1155 0 : struct scrub_page *page_other = sblock_other->pagev[
1156 : page_num];
1157 :
1158 0 : if (!page_other->io_error) {
1159 0 : ret = scrub_repair_page_from_good_copy(
1160 : sblock_bad, sblock_other, page_num, 0);
1161 0 : if (0 == ret) {
1162 0 : page_bad->io_error = 0;
1163 0 : break; /* succeeded for this page */
1164 : }
1165 : }
1166 : }
1167 :
1168 0 : if (page_bad->io_error) {
1169 : /* did not find a mirror to copy the page from */
1170 : success = 0;
1171 : }
1172 : }
1173 :
1174 0 : if (success) {
1175 0 : if (is_metadata || have_csum) {
1176 : /*
1177 : * need to verify the checksum now that all
1178 : * sectors on disk are repaired (the write
1179 : * request for data to be repaired is on its way).
1180 : * Just be lazy and use scrub_recheck_block()
1181 : * which re-reads the data before the checksum
1182 : * is verified, but most likely the data comes out
1183 : * of the page cache.
1184 : */
1185 0 : scrub_recheck_block(fs_info, sblock_bad,
1186 : is_metadata, have_csum, csum,
1187 0 : generation, sctx->csum_size);
1188 0 : if (!sblock_bad->header_error &&
1189 0 : !sblock_bad->checksum_error &&
1190 : sblock_bad->no_io_error_seen)
1191 : goto corrected_error;
1192 : else
1193 : goto did_not_correct_error;
1194 : } else {
1195 : corrected_error:
1196 : spin_lock(&sctx->stat_lock);
1197 0 : sctx->stat.corrected_errors++;
1198 : spin_unlock(&sctx->stat_lock);
1199 0 : printk_ratelimited_in_rcu(KERN_ERR
1200 : "BTRFS: fixed up error at logical %llu on dev %s\n",
1201 : logical, rcu_str_deref(dev->name));
1202 : }
1203 : } else {
1204 : did_not_correct_error:
1205 : spin_lock(&sctx->stat_lock);
1206 0 : sctx->stat.uncorrectable_errors++;
1207 : spin_unlock(&sctx->stat_lock);
1208 0 : printk_ratelimited_in_rcu(KERN_ERR
1209 : "BTRFS: unable to fixup (regular) error at logical %llu on dev %s\n",
1210 : logical, rcu_str_deref(dev->name));
1211 : }
1212 :
1213 : out:
1214 0 : if (sblocks_for_recheck) {
1215 0 : for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
1216 0 : mirror_index++) {
1217 0 : struct scrub_block *sblock = sblocks_for_recheck +
1218 : mirror_index;
1219 : int page_index;
1220 :
1221 0 : for (page_index = 0; page_index < sblock->page_count;
1222 0 : page_index++) {
1223 0 : sblock->pagev[page_index]->sblock = NULL;
1224 0 : scrub_page_put(sblock->pagev[page_index]);
1225 : }
1226 : }
1227 0 : kfree(sblocks_for_recheck);
1228 : }
1229 :
1230 : return 0;
1231 : }
1232 :
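/*
 * A condensed sketch of the page-wise repair selection above, with
 * hypothetical types: for every errored page of the bad block, look
 * for any mirror whose copy of that page read cleanly; the repair
 * fails only for pages that no mirror can provide.
 */
#include <stdbool.h>

#define MAX_MIRRORS_SKETCH	3
#define PAGES_PER_BLOCK_SKETCH	16

struct mirror_state {
	int page_count;
	bool io_error[PAGES_PER_BLOCK_SKETCH];
};

static bool pick_repair_sources(const struct mirror_state *mirrors,
				int nr_mirrors, const bool *page_bad,
				int page_count, int *source_mirror)
{
	bool success = true;

	for (int page = 0; page < page_count; page++) {
		source_mirror[page] = -1;
		if (!page_bad[page])
			continue;
		for (int m = 0; m < nr_mirrors; m++) {
			if (page < mirrors[m].page_count &&
			    !mirrors[m].io_error[page]) {
				source_mirror[page] = m;
				break;
			}
		}
		if (source_mirror[page] == -1)
			success = false;	/* no mirror has a good copy */
	}
	return success;
}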
1233 0 : static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
1234 : struct btrfs_fs_info *fs_info,
1235 : struct scrub_block *original_sblock,
1236 : u64 length, u64 logical,
1237 : struct scrub_block *sblocks_for_recheck)
1238 : {
1239 : int page_index;
1240 : int mirror_index;
1241 : int ret;
1242 :
1243 : /*
1244 : * note: the two members ref_count and outstanding_pages
1245 : * are not used (and not set) in the blocks that are used for
1246 : * the recheck procedure
1247 : */
1248 :
1249 : page_index = 0;
1250 0 : while (length > 0) {
1251 0 : u64 sublen = min_t(u64, length, PAGE_SIZE);
1252 0 : u64 mapped_length = sublen;
1253 0 : struct btrfs_bio *bbio = NULL;
1254 :
1255 : /*
1256 : * with a length of PAGE_SIZE, each returned stripe
1257 : * represents one mirror
1258 : */
1259 0 : ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical,
1260 : &mapped_length, &bbio, 0);
1261 0 : if (ret || !bbio || mapped_length < sublen) {
1262 0 : kfree(bbio);
1263 0 : return -EIO;
1264 : }
1265 :
1266 0 : BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
1267 0 : for (mirror_index = 0; mirror_index < (int)bbio->num_stripes;
1268 0 : mirror_index++) {
1269 : struct scrub_block *sblock;
1270 : struct scrub_page *page;
1271 :
1272 0 : if (mirror_index >= BTRFS_MAX_MIRRORS)
1273 0 : continue;
1274 :
1275 0 : sblock = sblocks_for_recheck + mirror_index;
1276 0 : sblock->sctx = sctx;
1277 0 : page = kzalloc(sizeof(*page), GFP_NOFS);
1278 0 : if (!page) {
1279 : leave_nomem:
1280 : spin_lock(&sctx->stat_lock);
1281 0 : sctx->stat.malloc_errors++;
1282 : spin_unlock(&sctx->stat_lock);
1283 0 : kfree(bbio);
1284 0 : return -ENOMEM;
1285 : }
1286 : scrub_page_get(page);
1287 0 : sblock->pagev[page_index] = page;
1288 0 : page->logical = logical;
1289 0 : page->physical = bbio->stripes[mirror_index].physical;
1290 0 : BUG_ON(page_index >= original_sblock->page_count);
1291 0 : page->physical_for_dev_replace =
1292 0 : original_sblock->pagev[page_index]->
1293 : physical_for_dev_replace;
1294 : /* for missing devices, dev->bdev is NULL */
1295 0 : page->dev = bbio->stripes[mirror_index].dev;
1296 0 : page->mirror_num = mirror_index + 1;
1297 0 : sblock->page_count++;
1298 0 : page->page = alloc_page(GFP_NOFS);
1299 0 : if (!page->page)
1300 : goto leave_nomem;
1301 : }
1302 0 : kfree(bbio);
1303 0 : length -= sublen;
1304 0 : logical += sublen;
1305 0 : page_index++;
1306 : }
1307 :
1308 : return 0;
1309 : }
1310 :
1311 : /*
1312 : * this function will check the on disk data for checksum errors, header
1313 : * errors and read I/O errors. If any I/O errors happen, the exact pages
1314 : * which are errored are marked as being bad. The goal is to enable scrub
1315 : * to take those pages that are not errored from all the mirrors so that
1316 : * the pages that are errored in the just handled mirror can be repaired.
1317 : */
1318 0 : static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1319 : struct scrub_block *sblock, int is_metadata,
1320 : int have_csum, u8 *csum, u64 generation,
1321 : u16 csum_size)
1322 : {
1323 : int page_num;
1324 :
1325 0 : sblock->no_io_error_seen = 1;
1326 0 : sblock->header_error = 0;
1327 0 : sblock->checksum_error = 0;
1328 :
1329 0 : for (page_num = 0; page_num < sblock->page_count; page_num++) {
1330 : struct bio *bio;
1331 0 : struct scrub_page *page = sblock->pagev[page_num];
1332 :
1333 0 : if (page->dev->bdev == NULL) {
1334 0 : page->io_error = 1;
1335 0 : sblock->no_io_error_seen = 0;
1336 0 : continue;
1337 : }
1338 :
1339 0 : WARN_ON(!page->page);
1340 0 : bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
1341 0 : if (!bio) {
1342 0 : page->io_error = 1;
1343 0 : sblock->no_io_error_seen = 0;
1344 0 : continue;
1345 : }
1346 0 : bio->bi_bdev = page->dev->bdev;
1347 0 : bio->bi_iter.bi_sector = page->physical >> 9;
1348 :
1349 0 : bio_add_page(bio, page->page, PAGE_SIZE, 0);
1350 0 : if (btrfsic_submit_bio_wait(READ, bio))
1351 0 : sblock->no_io_error_seen = 0;
1352 :
1353 0 : bio_put(bio);
1354 : }
1355 :
1356 0 : if (sblock->no_io_error_seen)
1357 0 : scrub_recheck_block_checksum(fs_info, sblock, is_metadata,
1358 : have_csum, csum, generation,
1359 : csum_size);
1360 :
1361 0 : return;
1362 : }
1363 :
1364 0 : static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
1365 : struct scrub_block *sblock,
1366 : int is_metadata, int have_csum,
1367 : const u8 *csum, u64 generation,
1368 : u16 csum_size)
1369 : {
1370 : int page_num;
1371 : u8 calculated_csum[BTRFS_CSUM_SIZE];
1372 : u32 crc = ~(u32)0;
1373 : void *mapped_buffer;
1374 :
1375 0 : WARN_ON(!sblock->pagev[0]->page);
1376 0 : if (is_metadata) {
1377 : struct btrfs_header *h;
1378 :
1379 0 : mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
1380 : h = (struct btrfs_header *)mapped_buffer;
1381 :
1382 0 : if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h) ||
1383 0 : memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||
1384 0 : memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1385 : BTRFS_UUID_SIZE)) {
1386 0 : sblock->header_error = 1;
1387 0 : } else if (generation != btrfs_stack_header_generation(h)) {
1388 0 : sblock->header_error = 1;
1389 0 : sblock->generation_error = 1;
1390 : }
1391 0 : csum = h->csum;
1392 : } else {
1393 0 : if (!have_csum)
1394 0 : return;
1395 :
1396 0 : mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
1397 : }
1398 :
1399 : for (page_num = 0;;) {
1400 0 : if (page_num == 0 && is_metadata)
1401 0 : crc = btrfs_csum_data(
1402 : ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE,
1403 : crc, PAGE_SIZE - BTRFS_CSUM_SIZE);
1404 : else
1405 0 : crc = btrfs_csum_data(mapped_buffer, crc, PAGE_SIZE);
1406 :
1407 : kunmap_atomic(mapped_buffer);
1408 0 : page_num++;
1409 0 : if (page_num >= sblock->page_count)
1410 : break;
1411 0 : WARN_ON(!sblock->pagev[page_num]->page);
1412 :
1413 0 : mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page);
1414 0 : }
1415 :
1416 0 : btrfs_csum_final(crc, calculated_csum);
1417 0 : if (memcmp(calculated_csum, csum, csum_size))
1418 0 : sblock->checksum_error = 1;
1419 : }
1420 :
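/*
 * A bitwise CRC-32C sketch following the same convention as above: seed
 * with ~0, feed the payload (for metadata, skip the embedded checksum
 * area at the start of the header), and invert at the end as
 * btrfs_csum_final() does. Illustration only; the kernel uses the
 * optimized btrfs_csum_data() helper.
 */
#include <stddef.h>
#include <stdint.h>

#define CSUM_AREA_SIZE 32	/* corresponds to BTRFS_CSUM_SIZE */

static uint32_t crc32c_update(uint32_t crc, const void *buf, size_t len)
{
	const uint8_t *p = buf;

	while (len--) {
		crc ^= *p++;
		for (int bit = 0; bit < 8; bit++)
			crc = (crc >> 1) ^ (0x82F63B78 & -(crc & 1));
	}
	return crc;
}

static uint32_t block_csum(const void *block, size_t size, int is_metadata)
{
	const uint8_t *data = block;
	uint32_t crc = ~(uint32_t)0;

	if (is_metadata) {
		data += CSUM_AREA_SIZE;		/* checksum does not cover itself */
		size -= CSUM_AREA_SIZE;
	}
	crc = crc32c_update(crc, data, size);
	return ~crc;
}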
1421 0 : static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1422 : struct scrub_block *sblock_good,
1423 : int force_write)
1424 : {
1425 : int page_num;
1426 : int ret = 0;
1427 :
1428 0 : for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
1429 : int ret_sub;
1430 :
1431 0 : ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
1432 : sblock_good,
1433 : page_num,
1434 : force_write);
1435 0 : if (ret_sub)
1436 : ret = ret_sub;
1437 : }
1438 :
1439 0 : return ret;
1440 : }
1441 :
1442 0 : static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1443 : struct scrub_block *sblock_good,
1444 : int page_num, int force_write)
1445 : {
1446 0 : struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1447 0 : struct scrub_page *page_good = sblock_good->pagev[page_num];
1448 :
1449 0 : BUG_ON(page_bad->page == NULL);
1450 0 : BUG_ON(page_good->page == NULL);
1451 0 : if (force_write || sblock_bad->header_error ||
1452 0 : sblock_bad->checksum_error || page_bad->io_error) {
1453 : struct bio *bio;
1454 : int ret;
1455 :
1456 0 : if (!page_bad->dev->bdev) {
1457 0 : printk_ratelimited(KERN_WARNING "BTRFS: "
1458 : "scrub_repair_page_from_good_copy(bdev == NULL) "
1459 : "is unexpected!\n");
1460 : return -EIO;
1461 : }
1462 :
1463 0 : bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
1464 0 : if (!bio)
1465 : return -EIO;
1466 0 : bio->bi_bdev = page_bad->dev->bdev;
1467 0 : bio->bi_iter.bi_sector = page_bad->physical >> 9;
1468 :
1469 0 : ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
1470 0 : if (PAGE_SIZE != ret) {
1471 0 : bio_put(bio);
1472 0 : return -EIO;
1473 : }
1474 :
1475 0 : if (btrfsic_submit_bio_wait(WRITE, bio)) {
1476 0 : btrfs_dev_stat_inc_and_print(page_bad->dev,
1477 : BTRFS_DEV_STAT_WRITE_ERRS);
1478 0 : btrfs_dev_replace_stats_inc(
1479 0 : &sblock_bad->sctx->dev_root->fs_info->
1480 : dev_replace.num_write_errors);
1481 0 : bio_put(bio);
1482 0 : return -EIO;
1483 : }
1484 0 : bio_put(bio);
1485 : }
1486 :
1487 : return 0;
1488 : }
1489 :
1490 1433167 : static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1491 : {
1492 : int page_num;
1493 :
1494 2872847 : for (page_num = 0; page_num < sblock->page_count; page_num++) {
1495 : int ret;
1496 :
1497 1439680 : ret = scrub_write_page_to_dev_replace(sblock, page_num);
1498 1439680 : if (ret)
1499 0 : btrfs_dev_replace_stats_inc(
1500 0 : &sblock->sctx->dev_root->fs_info->dev_replace.
1501 : num_write_errors);
1502 : }
1503 1433167 : }
1504 :
1505 1439680 : static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
1506 : int page_num)
1507 : {
1508 1439680 : struct scrub_page *spage = sblock->pagev[page_num];
1509 :
1510 1439680 : BUG_ON(spage->page == NULL);
1511 1439680 : if (spage->io_error) {
1512 : void *mapped_buffer = kmap_atomic(spage->page);
1513 :
1514 0 : memset(mapped_buffer, 0, PAGE_CACHE_SIZE);
1515 : flush_dcache_page(spage->page);
1516 : kunmap_atomic(mapped_buffer);
1517 : }
1518 1439680 : return scrub_add_page_to_wr_bio(sblock->sctx, spage);
1519 : }
1520 :
1521 1439680 : static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
1522 : struct scrub_page *spage)
1523 : {
1524 : struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1525 : struct scrub_bio *sbio;
1526 : int ret;
1527 :
1528 1439680 : mutex_lock(&wr_ctx->wr_lock);
1529 : again:
1530 1440709 : if (!wr_ctx->wr_curr_bio) {
1531 45829 : wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio),
1532 : GFP_NOFS);
1533 45829 : if (!wr_ctx->wr_curr_bio) {
1534 0 : mutex_unlock(&wr_ctx->wr_lock);
1535 0 : return -ENOMEM;
1536 : }
1537 45829 : wr_ctx->wr_curr_bio->sctx = sctx;
1538 45829 : wr_ctx->wr_curr_bio->page_count = 0;
1539 : }
1540 1440709 : sbio = wr_ctx->wr_curr_bio;
1541 1440709 : if (sbio->page_count == 0) {
1542 : struct bio *bio;
1543 :
1544 45829 : sbio->physical = spage->physical_for_dev_replace;
1545 45829 : sbio->logical = spage->logical;
1546 45829 : sbio->dev = wr_ctx->tgtdev;
1547 45829 : bio = sbio->bio;
1548 45829 : if (!bio) {
1549 45829 : bio = btrfs_io_bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio);
1550 45829 : if (!bio) {
1551 0 : mutex_unlock(&wr_ctx->wr_lock);
1552 0 : return -ENOMEM;
1553 : }
1554 45829 : sbio->bio = bio;
1555 : }
1556 :
1557 45829 : bio->bi_private = sbio;
1558 45829 : bio->bi_end_io = scrub_wr_bio_end_io;
1559 45829 : bio->bi_bdev = sbio->dev->bdev;
1560 45829 : bio->bi_iter.bi_sector = sbio->physical >> 9;
1561 45829 : sbio->err = 0;
1562 2789760 : } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1563 2789066 : spage->physical_for_dev_replace ||
1564 1394186 : sbio->logical + sbio->page_count * PAGE_SIZE !=
1565 1394186 : spage->logical) {
1566 1029 : scrub_wr_submit(sctx);
1567 1029 : goto again;
1568 : }
1569 :
1570 1439680 : ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1571 1439680 : if (ret != PAGE_SIZE) {
1572 0 : if (sbio->page_count < 1) {
1573 0 : bio_put(sbio->bio);
1574 0 : sbio->bio = NULL;
1575 0 : mutex_unlock(&wr_ctx->wr_lock);
1576 0 : return -EIO;
1577 : }
1578 0 : scrub_wr_submit(sctx);
1579 0 : goto again;
1580 : }
1581 :
1582 1439680 : sbio->pagev[sbio->page_count] = spage;
1583 : scrub_page_get(spage);
1584 1439680 : sbio->page_count++;
1585 1439680 : if (sbio->page_count == wr_ctx->pages_per_wr_bio)
1586 44494 : scrub_wr_submit(sctx);
1587 1439680 : mutex_unlock(&wr_ctx->wr_lock);
1588 :
1589 1439680 : return 0;
1590 : }
1591 :
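/*
 * A sketch of the write batching rule implemented above, with
 * hypothetical types: a page may join the current batch only while it
 * stays both physically and logically contiguous with it and the batch
 * is not full; otherwise the batch is submitted first and a new one is
 * started.
 */
#include <stdbool.h>
#include <stdint.h>

#define SKETCH_PAGE_SIZE	4096
#define PAGES_PER_BATCH		32

struct write_batch {
	uint64_t physical;	/* start on the target device */
	uint64_t logical;	/* start in the logical address space */
	int page_count;
};

static bool batch_accepts(const struct write_batch *b,
			  uint64_t page_physical, uint64_t page_logical)
{
	uint64_t off = (uint64_t)b->page_count * SKETCH_PAGE_SIZE;

	if (b->page_count == 0)
		return true;			/* an empty batch takes anything */
	if (b->page_count == PAGES_PER_BATCH)
		return false;			/* full, submit first */
	return b->physical + off == page_physical &&
	       b->logical + off == page_logical;
}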
1592 47004 : static void scrub_wr_submit(struct scrub_ctx *sctx)
1593 : {
1594 : struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1595 : struct scrub_bio *sbio;
1596 :
1597 47004 : if (!wr_ctx->wr_curr_bio)
1598 47004 : return;
1599 :
1600 : sbio = wr_ctx->wr_curr_bio;
1601 45829 : wr_ctx->wr_curr_bio = NULL;
1602 45829 : WARN_ON(!sbio->bio->bi_bdev);
1603 : scrub_pending_bio_inc(sctx);
1604 : /* process all writes in a single worker thread. Then the block layer
1605 : * orders the requests before sending them to the driver, which
1606 : * doubled the write performance on spinning disks when measured
1607 : * with Linux 3.5 */
1608 45829 : btrfsic_submit_bio(WRITE, sbio->bio);
1609 : }
1610 :
1611 45829 : static void scrub_wr_bio_end_io(struct bio *bio, int err)
1612 : {
1613 45829 : struct scrub_bio *sbio = bio->bi_private;
1614 45829 : struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
1615 :
1616 45829 : sbio->err = err;
1617 45829 : sbio->bio = bio;
1618 :
1619 45829 : btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper,
1620 : scrub_wr_bio_end_io_worker, NULL, NULL);
1621 45829 : btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
1622 45829 : }
1623 :
1624 45829 : static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
1625 : {
1626 45829 : struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1627 45829 : struct scrub_ctx *sctx = sbio->sctx;
1628 : int i;
1629 :
1630 45829 : WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
1631 45818 : if (sbio->err) {
1632 : struct btrfs_dev_replace *dev_replace =
1633 0 : &sbio->sctx->dev_root->fs_info->dev_replace;
1634 :
1635 0 : for (i = 0; i < sbio->page_count; i++) {
1636 0 : struct scrub_page *spage = sbio->pagev[i];
1637 :
1638 0 : spage->io_error = 1;
1639 0 : btrfs_dev_replace_stats_inc(&dev_replace->
1640 : num_write_errors);
1641 : }
1642 : }
1643 :
1644 1439118 : for (i = 0; i < sbio->page_count; i++)
1645 1439107 : scrub_page_put(sbio->pagev[i]);
1646 :
1647 45829 : bio_put(sbio->bio);
1648 45829 : kfree(sbio);
1649 45828 : scrub_pending_bio_dec(sctx);
1650 45829 : }
1651 :
1652 4170852 : static int scrub_checksum(struct scrub_block *sblock)
1653 : {
1654 : u64 flags;
1655 : int ret;
1656 :
1657 4170852 : WARN_ON(sblock->page_count < 1);
1658 4170935 : flags = sblock->pagev[0]->flags;
1659 : ret = 0;
1660 4170935 : if (flags & BTRFS_EXTENT_FLAG_DATA)
1661 4158619 : ret = scrub_checksum_data(sblock);
1662 12316 : else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1663 12294 : ret = scrub_checksum_tree_block(sblock);
1664 22 : else if (flags & BTRFS_EXTENT_FLAG_SUPER)
1665 22 : (void)scrub_checksum_super(sblock);
1666 : else
1667 0 : WARN_ON(1);
1668 4154895 : if (ret)
1669 0 : scrub_handle_errored_block(sblock);
1670 :
1671 4154895 : return ret;
1672 : }
1673 :
1674 4157172 : static int scrub_checksum_data(struct scrub_block *sblock)
1675 : {
1676 4157172 : struct scrub_ctx *sctx = sblock->sctx;
1677 : u8 csum[BTRFS_CSUM_SIZE];
1678 : u8 *on_disk_csum;
1679 : struct page *page;
1680 : void *buffer;
1681 : u32 crc = ~(u32)0;
1682 : int fail = 0;
1683 : u64 len;
1684 : int index;
1685 :
1686 4157172 : BUG_ON(sblock->page_count < 1);
1687 4157172 : if (!sblock->pagev[0]->have_csum)
1688 : return 0;
1689 :
1690 4154856 : on_disk_csum = sblock->pagev[0]->csum;
1691 4154856 : page = sblock->pagev[0]->page;
1692 : buffer = kmap_atomic(page);
1693 :
1694 4154623 : len = sctx->sectorsize;
1695 : index = 0;
1696 : for (;;) {
1697 4154623 : u64 l = min_t(u64, len, PAGE_SIZE);
1698 :
1699 4154623 : crc = btrfs_csum_data(buffer, crc, l);
1700 : kunmap_atomic(buffer);
1701 4144942 : len -= l;
1702 4144942 : if (len == 0)
1703 : break;
1704 0 : index++;
1705 0 : BUG_ON(index >= sblock->page_count);
1706 0 : BUG_ON(!sblock->pagev[index]->page);
1707 : page = sblock->pagev[index]->page;
1708 : buffer = kmap_atomic(page);
1709 0 : }
1710 :
1711 4144942 : btrfs_csum_final(crc, csum);
1712 4141372 : if (memcmp(csum, on_disk_csum, sctx->csum_size))
1713 : fail = 1;
1714 :
1715 4141372 : return fail;
1716 : }
1717 :
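The per-sector data checksum above is seeded with ~0 and finalized by btrfs_csum_final(). The fragment below is a minimal user-space sketch of the same check, not kernel code: it assumes btrfs_csum_data() computes CRC-32C (the crc32 csum type) and that the on-disk csum is the bit-inverted CRC stored little-endian, and it uses a plain bitwise CRC-32C so it stays self-contained.

#include <stdint.h>
#include <string.h>

/* bitwise CRC-32C (Castagnoli), reflected polynomial 0x82F63B78 */
static uint32_t crc32c(uint32_t crc, const void *data, size_t len)
{
	const uint8_t *p = data;

	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ (0x82F63B78 & -(crc & 1));
	}
	return crc;
}

/* returns 0 if the sector matches its on-disk csum, 1 on mismatch */
int check_sector(const uint8_t *sector, size_t sectorsize,
		 const uint8_t on_disk_csum[4])
{
	uint32_t crc = crc32c(~(uint32_t)0, sector, sectorsize);
	uint8_t csum[4];

	/* assumed finalization: invert and store little-endian */
	crc = ~crc;
	csum[0] = crc & 0xff;
	csum[1] = (crc >> 8) & 0xff;
	csum[2] = (crc >> 16) & 0xff;
	csum[3] = (crc >> 24) & 0xff;
	return memcmp(csum, on_disk_csum, 4) ? 1 : 0;
}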
1718 12293 : static int scrub_checksum_tree_block(struct scrub_block *sblock)
1719 : {
1720 12293 : struct scrub_ctx *sctx = sblock->sctx;
1721 : struct btrfs_header *h;
1722 12293 : struct btrfs_root *root = sctx->dev_root;
1723 12293 : struct btrfs_fs_info *fs_info = root->fs_info;
1724 : u8 calculated_csum[BTRFS_CSUM_SIZE];
1725 : u8 on_disk_csum[BTRFS_CSUM_SIZE];
1726 : struct page *page;
1727 : void *mapped_buffer;
1728 : u64 mapped_size;
1729 : void *p;
1730 : u32 crc = ~(u32)0;
1731 : int fail = 0;
1732 : int crc_fail = 0;
1733 : u64 len;
1734 : int index;
1735 :
1736 12293 : BUG_ON(sblock->page_count < 1);
1737 12293 : page = sblock->pagev[0]->page;
1738 : mapped_buffer = kmap_atomic(page);
1739 : h = (struct btrfs_header *)mapped_buffer;
1740 12293 : memcpy(on_disk_csum, h->csum, sctx->csum_size);
1741 :
1742 : /*
1743 : * we don't use the getter functions here, as we
1744 : * a) don't have an extent buffer and
1745 : * b) the page is already kmapped
1746 : */
1747 :
1748 24586 : if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h))
1749 : ++fail;
1750 :
1751 24586 : if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h))
1752 0 : ++fail;
1753 :
1754 12293 : if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
1755 0 : ++fail;
1756 :
1757 12293 : if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1758 : BTRFS_UUID_SIZE))
1759 0 : ++fail;
1760 :
1761 12293 : WARN_ON(sctx->nodesize != sctx->leafsize);
1762 12291 : len = sctx->nodesize - BTRFS_CSUM_SIZE;
1763 : mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1764 12291 : p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
1765 : index = 0;
1766 : for (;;) {
1767 28332 : u64 l = min_t(u64, len, mapped_size);
1768 :
1769 28332 : crc = btrfs_csum_data(p, crc, l);
1770 : kunmap_atomic(mapped_buffer);
1771 28333 : len -= l;
1772 28333 : if (len == 0)
1773 : break;
1774 16040 : index++;
1775 16040 : BUG_ON(index >= sblock->page_count);
1776 16040 : BUG_ON(!sblock->pagev[index]->page);
1777 : page = sblock->pagev[index]->page;
1778 : mapped_buffer = kmap_atomic(page);
1779 : mapped_size = PAGE_SIZE;
1780 : p = mapped_buffer;
1781 16041 : }
1782 :
1783 12293 : btrfs_csum_final(crc, calculated_csum);
1784 12293 : if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1785 : ++crc_fail;
1786 :
1787 12293 : return fail || crc_fail;
1788 : }
1789 :
1790 22 : static int scrub_checksum_super(struct scrub_block *sblock)
1791 : {
1792 : struct btrfs_super_block *s;
1793 22 : struct scrub_ctx *sctx = sblock->sctx;
1794 22 : struct btrfs_root *root = sctx->dev_root;
1795 22 : struct btrfs_fs_info *fs_info = root->fs_info;
1796 : u8 calculated_csum[BTRFS_CSUM_SIZE];
1797 : u8 on_disk_csum[BTRFS_CSUM_SIZE];
1798 : struct page *page;
1799 : void *mapped_buffer;
1800 : u64 mapped_size;
1801 : void *p;
1802 : u32 crc = ~(u32)0;
1803 : int fail_gen = 0;
1804 : int fail_cor = 0;
1805 : u64 len;
1806 : int index;
1807 :
1808 22 : BUG_ON(sblock->page_count < 1);
1809 22 : page = sblock->pagev[0]->page;
1810 : mapped_buffer = kmap_atomic(page);
1811 : s = (struct btrfs_super_block *)mapped_buffer;
1812 22 : memcpy(on_disk_csum, s->csum, sctx->csum_size);
1813 :
1814 44 : if (sblock->pagev[0]->logical != btrfs_super_bytenr(s))
1815 : ++fail_cor;
1816 :
1817 44 : if (sblock->pagev[0]->generation != btrfs_super_generation(s))
1818 : ++fail_gen;
1819 :
1820 22 : if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
1821 0 : ++fail_cor;
1822 :
1823 : len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
1824 : mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1825 22 : p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
1826 : index = 0;
1827 : for (;;) {
1828 : u64 l = min_t(u64, len, mapped_size);
1829 :
1830 22 : crc = btrfs_csum_data(p, crc, l);
1831 : kunmap_atomic(mapped_buffer);
1832 : len -= l;
1833 : if (len == 0)
1834 : break;
1835 : index++;
1836 : BUG_ON(index >= sblock->page_count);
1837 : BUG_ON(!sblock->pagev[index]->page);
1838 : page = sblock->pagev[index]->page;
1839 : mapped_buffer = kmap_atomic(page);
1840 : mapped_size = PAGE_SIZE;
1841 : p = mapped_buffer;
1842 : }
1843 :
1844 22 : btrfs_csum_final(crc, calculated_csum);
1845 22 : if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1846 0 : ++fail_cor;
1847 :
1848 22 : if (fail_cor + fail_gen) {
1849 : /*
1850 : 		 * If we find an error in a super block, we just report it.
1851 : 		 * The super blocks get rewritten with the next transaction
1852 : 		 * commit anyway.
1853 : */
1854 : spin_lock(&sctx->stat_lock);
1855 0 : ++sctx->stat.super_errors;
1856 : spin_unlock(&sctx->stat_lock);
1857 0 : if (fail_cor)
1858 0 : btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
1859 : BTRFS_DEV_STAT_CORRUPTION_ERRS);
1860 : else
1861 0 : btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
1862 : BTRFS_DEV_STAT_GENERATION_ERRS);
1863 : }
1864 :
1865 22 : return fail_cor + fail_gen;
1866 : }
1867 :
1868 : static void scrub_block_get(struct scrub_block *sblock)
1869 : {
1870 4183242 : atomic_inc(&sblock->ref_count);
1871 : }
1872 :
1873 8265757 : static void scrub_block_put(struct scrub_block *sblock)
1874 : {
1875 16631983 : if (atomic_dec_and_test(&sblock->ref_count)) {
1876 : int i;
1877 :
1878 4177900 : for (i = 0; i < sblock->page_count; i++)
1879 4189921 : scrub_page_put(sblock->pagev[i]);
1880 4162133 : kfree(sblock);
1881 : }
1882 8353580 : }
1883 :
1884 : static void scrub_page_get(struct scrub_page *spage)
1885 : {
1886 5623179 : atomic_inc(&spage->ref_count);
1887 : }
1888 :
1889 5621356 : static void scrub_page_put(struct scrub_page *spage)
1890 : {
1891 11250015 : if (atomic_dec_and_test(&spage->ref_count)) {
1892 4188979 : if (spage->page)
1893 4189072 : __free_page(spage->page);
1894 4176895 : kfree(spage);
1895 : }
1896 5618558 : }
1897 :
1898 133321 : static void scrub_submit(struct scrub_ctx *sctx)
1899 : {
1900 : struct scrub_bio *sbio;
1901 :
1902 133321 : if (sctx->curr == -1)
1903 133319 : return;
1904 :
1905 133167 : sbio = sctx->bios[sctx->curr];
1906 133167 : sctx->curr = -1;
1907 : scrub_pending_bio_inc(sctx);
1908 :
1909 133172 : if (!sbio->bio->bi_bdev) {
1910 : /*
1911 : 		 * This case should not happen. If btrfs_map_block() is
1912 : 		 * wrong, it could happen for dev-replace operations on
1913 : 		 * missing devices when no mirrors are available, but in
1914 : 		 * that case the mount should already have failed.
1915 : 		 * The situation is still handled correctly here (but _very_ slowly).
1916 : */
1917 0 : printk_ratelimited(KERN_WARNING
1918 : "BTRFS: scrub_submit(bio bdev == NULL) is unexpected!\n");
1919 0 : bio_endio(sbio->bio, -EIO);
1920 : } else {
1921 133172 : btrfsic_submit_bio(READ, sbio->bio);
1922 : }
1923 : }
1924 :
1925 4186357 : static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
1926 : struct scrub_page *spage)
1927 : {
1928 4186357 : struct scrub_block *sblock = spage->sblock;
1929 : struct scrub_bio *sbio;
1930 : int ret;
1931 :
1932 : again:
1933 : /*
1934 : * grab a fresh bio or wait for one to become available
1935 : */
1936 4366352 : while (sctx->curr == -1) {
1937 : spin_lock(&sctx->list_lock);
1938 176557 : sctx->curr = sctx->first_free;
1939 176557 : if (sctx->curr != -1) {
1940 133170 : sctx->first_free = sctx->bios[sctx->curr]->next_free;
1941 133170 : sctx->bios[sctx->curr]->next_free = -1;
1942 133170 : sctx->bios[sctx->curr]->page_count = 0;
1943 : spin_unlock(&sctx->list_lock);
1944 : } else {
1945 : spin_unlock(&sctx->list_lock);
1946 86774 : wait_event(sctx->list_wait, sctx->first_free != -1);
1947 : }
1948 : }
1949 4189804 : sbio = sctx->bios[sctx->curr];
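	/*
	 * Either start a fresh bio at this page, or append the page only if
	 * it is physically and logically contiguous with the bio under
	 * construction and targets the same device; otherwise submit the
	 * current bio and retry with a fresh one.
	 */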
1950 4189804 : if (sbio->page_count == 0) {
1951 : struct bio *bio;
1952 :
1953 133163 : sbio->physical = spage->physical;
1954 133163 : sbio->logical = spage->logical;
1955 133163 : sbio->dev = spage->dev;
1956 133163 : bio = sbio->bio;
1957 133163 : if (!bio) {
1958 133162 : bio = btrfs_io_bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio);
1959 133167 : if (!bio)
1960 : return -ENOMEM;
1961 133167 : sbio->bio = bio;
1962 : }
1963 :
1964 133168 : bio->bi_private = sbio;
1965 133168 : bio->bi_end_io = scrub_bio_end_io;
1966 133168 : bio->bi_bdev = sbio->dev->bdev;
1967 133168 : bio->bi_iter.bi_sector = sbio->physical >> 9;
1968 133168 : sbio->err = 0;
1969 8113282 : } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1970 8111850 : spage->physical ||
1971 4055209 : sbio->logical + sbio->page_count * PAGE_SIZE !=
1972 8108431 : spage->logical ||
1973 4053222 : sbio->dev != spage->dev) {
1974 3446 : scrub_submit(sctx);
1975 3446 : goto again;
1976 : }
1977 :
1978 4186363 : sbio->pagev[sbio->page_count] = spage;
1979 4186363 : ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1980 4183242 : if (ret != PAGE_SIZE) {
1981 0 : if (sbio->page_count < 1) {
1982 0 : bio_put(sbio->bio);
1983 0 : sbio->bio = NULL;
1984 0 : return -EIO;
1985 : }
1986 0 : scrub_submit(sctx);
1987 0 : goto again;
1988 : }
1989 :
1990 : scrub_block_get(sblock); /* one for the page added to the bio */
1991 4192397 : atomic_inc(&sblock->outstanding_pages);
1992 4191633 : sbio->page_count++;
1993 4191633 : if (sbio->page_count == sctx->pages_per_rd_bio)
1994 129630 : scrub_submit(sctx);
1995 :
1996 : return 0;
1997 : }
1998 :
1999 4171571 : static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
2000 : u64 physical, struct btrfs_device *dev, u64 flags,
2001 : u64 gen, int mirror_num, u8 *csum, int force,
2002 : u64 physical_for_dev_replace)
2003 : {
2004 : struct scrub_block *sblock;
2005 : int index;
2006 :
2007 4171571 : sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
2008 4170451 : if (!sblock) {
2009 : spin_lock(&sctx->stat_lock);
2010 0 : sctx->stat.malloc_errors++;
2011 : spin_unlock(&sctx->stat_lock);
2012 0 : return -ENOMEM;
2013 : }
2014 :
2015 : /* one ref inside this function, plus one for each page added to
2016 : * a bio later on */
2017 : atomic_set(&sblock->ref_count, 1);
2018 4170451 : sblock->sctx = sctx;
2019 4170451 : sblock->no_io_error_seen = 1;
2020 :
2021 8357753 : for (index = 0; len > 0; index++) {
2022 : struct scrub_page *spage;
2023 4186403 : u64 l = min_t(u64, len, PAGE_SIZE);
2024 :
2025 4186403 : spage = kzalloc(sizeof(*spage), GFP_NOFS);
2026 4183396 : if (!spage) {
2027 : leave_nomem:
2028 : spin_lock(&sctx->stat_lock);
2029 0 : sctx->stat.malloc_errors++;
2030 : spin_unlock(&sctx->stat_lock);
2031 0 : scrub_block_put(sblock);
2032 0 : return -ENOMEM;
2033 : }
2034 4183499 : BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2035 : scrub_page_get(spage);
2036 4189291 : sblock->pagev[index] = spage;
2037 4189291 : spage->sblock = sblock;
2038 4189291 : spage->dev = dev;
2039 4189291 : spage->flags = flags;
2040 4189291 : spage->generation = gen;
2041 4189291 : spage->logical = logical;
2042 4189291 : spage->physical = physical;
2043 4189291 : spage->physical_for_dev_replace = physical_for_dev_replace;
2044 4189291 : spage->mirror_num = mirror_num;
2045 4189291 : if (csum) {
2046 4159051 : spage->have_csum = 1;
2047 4159051 : memcpy(spage->csum, csum, sctx->csum_size);
2048 : } else {
2049 30240 : spage->have_csum = 0;
2050 : }
2051 4189291 : sblock->page_count++;
2052 4187255 : spage->page = alloc_page(GFP_NOFS);
2053 4187255 : if (!spage->page)
2054 : goto leave_nomem;
2055 4187302 : len -= l;
2056 4187302 : logical += l;
2057 4187302 : physical += l;
2058 4187302 : physical_for_dev_replace += l;
2059 : }
2060 :
2061 4171350 : WARN_ON(sblock->page_count == 0);
2062 4189163 : for (index = 0; index < sblock->page_count; index++) {
2063 4186344 : struct scrub_page *spage = sblock->pagev[index];
2064 : int ret;
2065 :
2066 4186344 : ret = scrub_add_page_to_rd_bio(sctx, spage);
2067 4189163 : if (ret) {
2068 0 : scrub_block_put(sblock);
2069 0 : return ret;
2070 : }
2071 : }
2072 :
2073 4173276 : if (force)
2074 22 : scrub_submit(sctx);
2075 :
2076 : /* last one frees, either here or in bio completion for last page */
2077 4173276 : scrub_block_put(sblock);
2078 4173312 : return 0;
2079 : }
2080 :
2081 133148 : static void scrub_bio_end_io(struct bio *bio, int err)
2082 : {
2083 133148 : struct scrub_bio *sbio = bio->bi_private;
2084 133148 : struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
2085 :
2086 133148 : sbio->err = err;
2087 133148 : sbio->bio = bio;
2088 :
2089 133148 : btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
2090 133172 : }
2091 :
2092 133020 : static void scrub_bio_end_io_worker(struct btrfs_work *work)
2093 : {
2094 : struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2095 133020 : struct scrub_ctx *sctx = sbio->sctx;
2096 : int i;
2097 :
2098 133020 : BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
2099 133020 : if (sbio->err) {
2100 0 : for (i = 0; i < sbio->page_count; i++) {
2101 0 : struct scrub_page *spage = sbio->pagev[i];
2102 :
2103 0 : spage->io_error = 1;
2104 0 : spage->sblock->no_io_error_seen = 0;
2105 : }
2106 : }
2107 :
2108 : /* now complete the scrub_block items that have all pages completed */
2109 4176440 : for (i = 0; i < sbio->page_count; i++) {
2110 4176303 : struct scrub_page *spage = sbio->pagev[i];
2111 4176303 : struct scrub_block *sblock = spage->sblock;
2112 :
2113 8363084 : if (atomic_dec_and_test(&sblock->outstanding_pages))
2114 4170724 : scrub_block_complete(sblock);
2115 4170030 : scrub_block_put(sblock);
2116 : }
2117 :
2118 133157 : bio_put(sbio->bio);
2119 133157 : sbio->bio = NULL;
2120 : spin_lock(&sctx->list_lock);
2121 133171 : sbio->next_free = sctx->first_free;
2122 133171 : sctx->first_free = sbio->index;
2123 : spin_unlock(&sctx->list_lock);
2124 :
2125 178972 : if (sctx->is_dev_replace &&
2126 : atomic_read(&sctx->wr_ctx.flush_all_writes)) {
2127 1255 : mutex_lock(&sctx->wr_ctx.wr_lock);
2128 1255 : scrub_wr_submit(sctx);
2129 1255 : mutex_unlock(&sctx->wr_ctx.wr_lock);
2130 : }
2131 :
2132 133163 : scrub_pending_bio_dec(sctx);
2133 133162 : }
2134 :
2135 4171380 : static void scrub_block_complete(struct scrub_block *sblock)
2136 : {
2137 4171380 : if (!sblock->no_io_error_seen) {
2138 0 : scrub_handle_errored_block(sblock);
2139 : } else {
2140 : /*
2141 : 		 * In the dev-replace case: if the block has a checksum
2142 : 		 * error, it is written out via the repair mechanism;
2143 : 		 * otherwise it is written out here.
2144 : */
2145 4171380 : if (!scrub_checksum(sblock) && sblock->sctx->is_dev_replace)
2146 1433167 : scrub_write_block_to_dev_replace(sblock);
2147 : }
2148 4154586 : }
2149 :
2150 4160780 : static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
2151 : u8 *csum)
2152 : {
2153 : struct btrfs_ordered_sum *sum = NULL;
2154 : unsigned long index;
2155 : unsigned long num_sectors;
2156 :
2157 8341248 : while (!list_empty(&sctx->csum_list)) {
2158 4167655 : sum = list_first_entry(&sctx->csum_list,
2159 : struct btrfs_ordered_sum, list);
2160 4167655 : if (sum->bytenr > logical)
2161 : return 0;
2162 4167593 : if (sum->bytenr + sum->len > logical)
2163 : break;
2164 :
2165 9874 : ++sctx->stat.csum_discards;
2166 9874 : list_del(&sum->list);
2167 9874 : kfree(sum);
2168 : sum = NULL;
2169 : }
2170 4160688 : if (!sum)
2171 : return 0;
2172 :
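	/*
	 * The matching ordered sum covers [sum->bytenr, sum->bytenr + sum->len).
	 * Hypothetical example: with a 4 KiB sectorsize, a logical address
	 * 12 KiB past sum->bytenr selects sector index 3 of that sum.
	 */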
2173 4157724 : index = ((u32)(logical - sum->bytenr)) / sctx->sectorsize;
2174 4157724 : num_sectors = sum->len / sctx->sectorsize;
2175 4157724 : memcpy(csum, sum->sums + index, sctx->csum_size);
2176 4157724 : if (index == num_sectors - 1) {
2177 270282 : list_del(&sum->list);
2178 270257 : kfree(sum);
2179 : }
2180 : return 1;
2181 : }
2182 :
2183 : /* scrub_extent() tries to collect up to 64 kB for each bio */
2184 290439 : static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
2185 : u64 physical, struct btrfs_device *dev, u64 flags,
2186 : u64 gen, int mirror_num, u64 physical_for_dev_replace)
2187 : {
2188 : int ret;
2189 : u8 csum[BTRFS_CSUM_SIZE];
2190 : u32 blocksize;
2191 :
2192 290439 : if (flags & BTRFS_EXTENT_FLAG_DATA) {
2193 278145 : blocksize = sctx->sectorsize;
2194 : spin_lock(&sctx->stat_lock);
2195 278215 : sctx->stat.data_extents_scrubbed++;
2196 278215 : sctx->stat.data_bytes_scrubbed += len;
2197 : spin_unlock(&sctx->stat_lock);
2198 12294 : } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2199 12294 : WARN_ON(sctx->nodesize != sctx->leafsize);
2200 12294 : blocksize = sctx->nodesize;
2201 : spin_lock(&sctx->stat_lock);
2202 12294 : sctx->stat.tree_extents_scrubbed++;
2203 12294 : sctx->stat.tree_bytes_scrubbed += len;
2204 : spin_unlock(&sctx->stat_lock);
2205 : } else {
2206 0 : blocksize = sctx->sectorsize;
2207 0 : WARN_ON(1);
2208 : }
2209 :
2210 4464504 : while (len) {
2211 4173162 : u64 l = min_t(u64, len, blocksize);
2212 : int have_csum = 0;
2213 :
2214 4173162 : if (flags & BTRFS_EXTENT_FLAG_DATA) {
2215 : /* push csums to sbio */
2216 4160825 : have_csum = scrub_find_csum(sctx, logical, l, csum);
2217 4160190 : if (have_csum == 0)
2218 2976 : ++sctx->stat.no_csum;
2219 4160190 : if (sctx->is_dev_replace && !have_csum) {
2220 1104 : ret = copy_nocow_pages(sctx, logical, l,
2221 : mirror_num,
2222 : physical_for_dev_replace);
2223 1104 : goto behind_scrub_pages;
2224 : }
2225 : }
2226 4171423 : ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
2227 : mirror_num, have_csum ? csum : NULL, 0,
2228 : physical_for_dev_replace);
2229 : behind_scrub_pages:
2230 4174005 : if (ret)
2231 : return ret;
2232 4174005 : len -= l;
2233 4174005 : logical += l;
2234 4174005 : physical += l;
2235 4174005 : physical_for_dev_replace += l;
2236 : }
2237 : return 0;
2238 : }
2239 :
2240 : /*
2241 :  * Given a physical address, this will calculate its
2242 :  * logical offset. If this is a parity stripe, it will return
2243 :  * the left-most data stripe's logical offset.
2244 :  *
2245 :  * Returns 0 if it is a data stripe, 1 if it is a parity stripe.
2246 : */
2247 0 : static int get_raid56_logic_offset(u64 physical, int num,
2248 : struct map_lookup *map, u64 *offset)
2249 : {
2250 : int i;
2251 : int j = 0;
2252 : u64 stripe_nr;
2253 : u64 last_offset;
2254 : int stripe_index;
2255 : int rot;
2256 :
2257 0 : last_offset = (physical - map->stripes[num].physical) *
2258 : nr_data_stripes(map);
2259 0 : *offset = last_offset;
2260 0 : for (i = 0; i < nr_data_stripes(map); i++) {
2261 0 : *offset = last_offset + i * map->stripe_len;
2262 :
2263 : stripe_nr = *offset;
2264 0 : do_div(stripe_nr, map->stripe_len);
2265 0 : do_div(stripe_nr, nr_data_stripes(map));
2266 :
2267 : /* Work out the disk rotation on this stripe-set */
2268 0 : rot = do_div(stripe_nr, map->num_stripes);
2269 : 		/* calculate which stripe this data is located on */
2270 0 : rot += i;
2271 0 : stripe_index = rot % map->num_stripes;
2272 0 : if (stripe_index == num)
2273 : return 0;
2274 0 : if (stripe_index < num)
2275 0 : j++;
2276 : }
2277 0 : *offset = last_offset + j * map->stripe_len;
2278 0 : return 1;
2279 : }
2280 :
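The rotation arithmetic above maps a physical offset within one device's extent to the logical offset of the left-most data stripe, skipping stripe-sets where this device holds parity. Below is a stand-alone sketch of the same arithmetic, not kernel code: it assumes a RAID5 layout with 3 devices (2 data stripes plus 1 parity per stripe-set), a 64 KiB stripe_len, and a hand-picked physical offset, and prints which device each candidate data stripe lands on.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const uint64_t stripe_len = 64 * 1024;	/* assumed stripe length */
	const int num_stripes = 3;		/* devices in the chunk */
	const int data_stripes = 2;		/* num_stripes - 1 for RAID5 */
	const int num = 0;			/* device index being scrubbed */
	/* offset of the strip inside this device's extent, e.g. the 3rd strip */
	const uint64_t physical = 2 * stripe_len;

	uint64_t last_offset = physical * data_stripes;

	for (int i = 0; i < data_stripes; i++) {
		uint64_t offset = last_offset + i * stripe_len;
		uint64_t stripe_nr = offset / stripe_len / data_stripes;
		int rot = (int)(stripe_nr % num_stripes) + i;
		int stripe_index = rot % num_stripes;

		printf("candidate %d -> device %d%s\n", i, stripe_index,
		       stripe_index == num ?
		       " (data stripe on the scrubbed device)" : "");
	}
	return 0;
}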
2281 113 : static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2282 : struct map_lookup *map,
2283 : struct btrfs_device *scrub_dev,
2284 : int num, u64 base, u64 length,
2285 : int is_dev_replace)
2286 : {
2287 : struct btrfs_path *path;
2288 9678 : struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
2289 113 : struct btrfs_root *root = fs_info->extent_root;
2290 113 : struct btrfs_root *csum_root = fs_info->csum_root;
2291 : struct btrfs_extent_item *extent;
2292 : struct blk_plug plug;
2293 : u64 flags;
2294 : int ret;
2295 : int slot;
2296 : u64 nstripes;
2297 48725 : struct extent_buffer *l;
2298 : struct btrfs_key key;
2299 : u64 physical;
2300 : u64 logical;
2301 : u64 logic_end;
2302 : u64 physical_end;
2303 : u64 generation;
2304 : int mirror_num;
2305 : struct reada_control *reada1;
2306 : struct reada_control *reada2;
2307 : struct btrfs_key key_start;
2308 : struct btrfs_key key_end;
2309 : u64 increment = map->stripe_len;
2310 : u64 offset;
2311 : u64 extent_logical;
2312 : u64 extent_physical;
2313 : u64 extent_len;
2314 : struct btrfs_device *extent_dev;
2315 : int extent_mirror_num;
2316 : int stop_loop = 0;
2317 :
2318 : nstripes = length;
2319 113 : physical = map->stripes[num].physical;
2320 113 : offset = 0;
2321 113 : do_div(nstripes, map->stripe_len);
2322 113 : if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
2323 9 : offset = map->stripe_len * num;
2324 9 : increment = map->stripe_len * map->num_stripes;
2325 : mirror_num = 1;
2326 104 : } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2327 0 : int factor = map->num_stripes / map->sub_stripes;
2328 0 : offset = map->stripe_len * (num / map->sub_stripes);
2329 0 : increment = map->stripe_len * factor;
2330 0 : mirror_num = num % map->sub_stripes + 1;
2331 104 : } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
2332 30 : increment = map->stripe_len;
2333 30 : mirror_num = num % map->num_stripes + 1;
2334 74 : } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
2335 24 : increment = map->stripe_len;
2336 24 : mirror_num = num % map->num_stripes + 1;
2337 50 : } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
2338 : BTRFS_BLOCK_GROUP_RAID6)) {
2339 0 : get_raid56_logic_offset(physical, num, map, &offset);
2340 0 : increment = map->stripe_len * nr_data_stripes(map);
2341 : mirror_num = 1;
2342 : } else {
2343 50 : increment = map->stripe_len;
2344 : mirror_num = 1;
2345 : }
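	/*
	 * Example with assumed numbers: for RAID0 with 4 stripes and a 64 KiB
	 * stripe_len, device 'num' holds every 4th 64 KiB strip, so offset
	 * starts at num * 64 KiB and increment is 4 * 64 KiB; mirror_num is 1
	 * because RAID0 keeps no redundant copy.
	 */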
2346 :
2347 113 : path = btrfs_alloc_path();
2348 113 : if (!path)
2349 : return -ENOMEM;
2350 :
2351 : /*
2352 : 	 * work on commit root. The related disk blocks are static as
2353 : 	 * long as COW is applied. This means it is safe to rewrite
2354 : 	 * them to repair disk errors without any race conditions.
2355 : */
2356 113 : path->search_commit_root = 1;
2357 113 : path->skip_locking = 1;
2358 :
2359 : /*
2360 : 	 * trigger the readahead for the extent tree and the csum tree and
2361 : 	 * wait for completion. During readahead, the scrub is officially
2362 : 	 * paused so that it does not hold off transaction commits.
2363 : */
2364 113 : logical = base + offset;
2365 113 : physical_end = physical + nstripes * map->stripe_len;
2366 113 : if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
2367 : BTRFS_BLOCK_GROUP_RAID6)) {
2368 0 : get_raid56_logic_offset(physical_end, num,
2369 : map, &logic_end);
2370 0 : logic_end += base;
2371 : } else {
2372 113 : logic_end = logical + increment * nstripes;
2373 : }
2374 113 : wait_event(sctx->list_wait,
2375 : atomic_read(&sctx->bios_in_flight) == 0);
2376 113 : scrub_blocked_if_needed(fs_info);
2377 :
2378 : /* FIXME it might be better to start readahead at commit root */
2379 113 : key_start.objectid = logical;
2380 113 : key_start.type = BTRFS_EXTENT_ITEM_KEY;
2381 113 : key_start.offset = (u64)0;
2382 113 : key_end.objectid = logic_end;
2383 113 : key_end.type = BTRFS_METADATA_ITEM_KEY;
2384 113 : key_end.offset = (u64)-1;
2385 113 : reada1 = btrfs_reada_add(root, &key_start, &key_end);
2386 :
2387 113 : key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
2388 113 : key_start.type = BTRFS_EXTENT_CSUM_KEY;
2389 113 : key_start.offset = logical;
2390 113 : key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
2391 113 : key_end.type = BTRFS_EXTENT_CSUM_KEY;
2392 113 : key_end.offset = logic_end;
2393 113 : reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
2394 :
2395 113 : if (!IS_ERR(reada1))
2396 113 : btrfs_reada_wait(reada1);
2397 113 : if (!IS_ERR(reada2))
2398 113 : btrfs_reada_wait(reada2);
2399 :
2400 :
2401 : /*
2402 : 	 * collect all data csums for the stripe to avoid seeking during
2403 : 	 * the scrub. This might currently (crc32) end up being about 1MB.
2404 : */
2405 113 : blk_start_plug(&plug);
2406 :
2407 : /*
2408 : * now find all extents for each stripe and scrub them
2409 : */
2410 : ret = 0;
2411 9679 : while (physical < physical_end) {
2412 : /* for raid56, we skip parity stripe */
2413 9566 : if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
2414 : BTRFS_BLOCK_GROUP_RAID6)) {
2415 0 : ret = get_raid56_logic_offset(physical, num,
2416 : map, &logical);
2417 0 : logical += base;
2418 0 : if (ret)
2419 : goto skip;
2420 : }
2421 : /*
2422 : * canceled?
2423 : */
2424 19131 : if (atomic_read(&fs_info->scrub_cancel_req) ||
2425 : atomic_read(&sctx->cancel_req)) {
2426 : ret = -ECANCELED;
2427 : goto out;
2428 : }
2429 : /*
2430 : * check to see if we have to pause
2431 : */
2432 9565 : if (atomic_read(&fs_info->scrub_pause_req)) {
2433 : /* push queued extents */
2434 : atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
2435 0 : scrub_submit(sctx);
2436 0 : mutex_lock(&sctx->wr_ctx.wr_lock);
2437 0 : scrub_wr_submit(sctx);
2438 0 : mutex_unlock(&sctx->wr_ctx.wr_lock);
2439 0 : wait_event(sctx->list_wait,
2440 : atomic_read(&sctx->bios_in_flight) == 0);
2441 : atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
2442 0 : scrub_blocked_if_needed(fs_info);
2443 : }
2444 :
2445 9565 : if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2446 0 : key.type = BTRFS_METADATA_ITEM_KEY;
2447 : else
2448 9565 : key.type = BTRFS_EXTENT_ITEM_KEY;
2449 9565 : key.objectid = logical;
2450 9565 : key.offset = (u64)-1;
2451 :
2452 9565 : ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2453 9564 : if (ret < 0)
2454 : goto out;
2455 :
2456 9564 : if (ret > 0) {
2457 9564 : ret = btrfs_previous_extent_item(root, path, 0);
2458 9565 : if (ret < 0)
2459 : goto out;
2460 9565 : if (ret > 0) {
2461 : /* there's no smaller item, so stick with the
2462 : * larger one */
2463 42 : btrfs_release_path(path);
2464 42 : ret = btrfs_search_slot(NULL, root, &key,
2465 : path, 0, 0);
2466 42 : if (ret < 0)
2467 : goto out;
2468 : }
2469 : }
2470 :
2471 : stop_loop = 0;
2472 : while (1) {
2473 : u64 bytes;
2474 :
2475 48725 : l = path->nodes[0];
2476 48725 : slot = path->slots[0];
2477 97450 : if (slot >= btrfs_header_nritems(l)) {
2478 393 : ret = btrfs_next_leaf(root, path);
2479 393 : if (ret == 0)
2480 373 : continue;
2481 20 : if (ret < 0)
2482 : goto out;
2483 :
2484 : stop_loop = 1;
2485 : break;
2486 : }
2487 48332 : btrfs_item_key_to_cpu(l, &key, slot);
2488 :
2489 48335 : if (key.type == BTRFS_METADATA_ITEM_KEY)
2490 0 : bytes = root->leafsize;
2491 : else
2492 48335 : bytes = key.offset;
2493 :
2494 48335 : if (key.objectid + bytes <= logical)
2495 : goto next;
2496 :
2497 47282 : if (key.type != BTRFS_EXTENT_ITEM_KEY &&
2498 : key.type != BTRFS_METADATA_ITEM_KEY)
2499 : goto next;
2500 :
2501 47058 : if (key.objectid >= logical + map->stripe_len) {
2502 : /* out of this device extent */
2503 9545 : if (key.objectid >= logic_end)
2504 : stop_loop = 1;
2505 : break;
2506 : }
2507 :
2508 37512 : extent = btrfs_item_ptr(l, slot,
2509 : struct btrfs_extent_item);
2510 : flags = btrfs_extent_flags(l, extent);
2511 : generation = btrfs_extent_generation(l, extent);
2512 :
2513 37766 : if (key.objectid < logical &&
2514 258 : (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
2515 0 : btrfs_err(fs_info,
2516 : "scrub: tree block %llu spanning "
2517 : "stripes, ignored. logical=%llu",
2518 : key.objectid, logical);
2519 0 : goto next;
2520 : }
2521 :
2522 : again:
2523 290463 : extent_logical = key.objectid;
2524 : extent_len = bytes;
2525 :
2526 : /*
2527 : * trim extent to this stripe
2528 : */
2529 290463 : if (extent_logical < logical) {
2530 253210 : extent_len -= logical - extent_logical;
2531 : extent_logical = logical;
2532 : }
2533 580926 : if (extent_logical + extent_len >
2534 290463 : logical + map->stripe_len) {
2535 253207 : extent_len = logical + map->stripe_len -
2536 : extent_logical;
2537 : }
2538 :
2539 290463 : extent_physical = extent_logical - logical + physical;
2540 290463 : extent_dev = scrub_dev;
2541 290463 : extent_mirror_num = mirror_num;
2542 290463 : if (is_dev_replace)
2543 102237 : scrub_remap_extent(fs_info, extent_logical,
2544 : extent_len, &extent_physical,
2545 : &extent_dev,
2546 : &extent_mirror_num);
2547 :
2548 580926 : ret = btrfs_lookup_csums_range(csum_root, logical,
2549 290463 : logical + map->stripe_len - 1,
2550 : &sctx->csum_list, 1);
2551 290450 : if (ret)
2552 : goto out;
2553 :
2554 290450 : ret = scrub_extent(sctx, extent_logical, extent_len,
2555 : extent_physical, extent_dev, flags,
2556 : generation, extent_mirror_num,
2557 290450 : extent_logical - logical + physical);
2558 290473 : if (ret)
2559 : goto out;
2560 :
2561 290473 : scrub_free_csums(sctx);
2562 580938 : if (extent_logical + extent_len <
2563 290469 : key.objectid + bytes) {
2564 253217 : if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
2565 : BTRFS_BLOCK_GROUP_RAID6)) {
2566 : /*
2567 : * loop until we find next data stripe
2568 : * or we have finished all stripes.
2569 : */
2570 : do {
2571 2 : physical += map->stripe_len;
2572 2 : ret = get_raid56_logic_offset(
2573 : physical, num,
2574 : map, &logical);
2575 0 : logical += base;
2576 0 : } while (physical < physical_end && ret);
2577 : } else {
2578 253217 : physical += map->stripe_len;
2579 253217 : logical += increment;
2580 : }
2581 253215 : if (logical < key.objectid + bytes) {
2582 252956 : cond_resched();
2583 252955 : goto again;
2584 : }
2585 :
2586 259 : if (physical >= physical_end) {
2587 : stop_loop = 1;
2588 : break;
2589 : }
2590 : }
2591 : next:
2592 38788 : path->slots[0]++;
2593 : }
2594 9565 : btrfs_release_path(path);
2595 : skip:
2596 9564 : logical += increment;
2597 9564 : physical += map->stripe_len;
2598 : spin_lock(&sctx->stat_lock);
2599 9565 : if (stop_loop)
2600 112 : sctx->stat.last_physical = map->stripes[num].physical +
2601 : length;
2602 : else
2603 9453 : sctx->stat.last_physical = physical;
2604 : spin_unlock(&sctx->stat_lock);
2605 9565 : if (stop_loop)
2606 : break;
2607 : }
2608 : out:
2609 : /* push queued extents */
2610 114 : scrub_submit(sctx);
2611 113 : mutex_lock(&sctx->wr_ctx.wr_lock);
2612 113 : scrub_wr_submit(sctx);
2613 113 : mutex_unlock(&sctx->wr_ctx.wr_lock);
2614 :
2615 113 : blk_finish_plug(&plug);
2616 113 : btrfs_free_path(path);
2617 113 : return ret < 0 ? ret : 0;
2618 : }
2619 :
2620 113 : static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
2621 : struct btrfs_device *scrub_dev,
2622 : u64 chunk_tree, u64 chunk_objectid,
2623 : u64 chunk_offset, u64 length,
2624 : u64 dev_offset, int is_dev_replace)
2625 : {
2626 : struct btrfs_mapping_tree *map_tree =
2627 113 : &sctx->dev_root->fs_info->mapping_tree;
2628 : struct map_lookup *map;
2629 : struct extent_map *em;
2630 : int i;
2631 : int ret = 0;
2632 :
2633 113 : read_lock(&map_tree->map_tree.lock);
2634 113 : em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
2635 : read_unlock(&map_tree->map_tree.lock);
2636 :
2637 113 : if (!em)
2638 : return -EINVAL;
2639 :
2640 113 : map = (struct map_lookup *)em->bdev;
2641 113 : if (em->start != chunk_offset)
2642 : goto out;
2643 :
2644 113 : if (em->len < length)
2645 : goto out;
2646 :
2647 175 : for (i = 0; i < map->num_stripes; ++i) {
2648 313 : if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
2649 137 : map->stripes[i].physical == dev_offset) {
2650 113 : ret = scrub_stripe(sctx, map, scrub_dev, i,
2651 : chunk_offset, length,
2652 : is_dev_replace);
2653 113 : if (ret)
2654 : goto out;
2655 : }
2656 : }
2657 : out:
2658 113 : free_extent_map(em);
2659 :
2660 : return ret;
2661 : }
2662 :
2663 : static noinline_for_stack
2664 19 : int scrub_enumerate_chunks(struct scrub_ctx *sctx,
2665 : struct btrfs_device *scrub_dev, u64 start, u64 end,
2666 : int is_dev_replace)
2667 : {
2668 : struct btrfs_dev_extent *dev_extent = NULL;
2669 : struct btrfs_path *path;
2670 19 : struct btrfs_root *root = sctx->dev_root;
2671 19 : struct btrfs_fs_info *fs_info = root->fs_info;
2672 : u64 length;
2673 : u64 chunk_tree;
2674 : u64 chunk_objectid;
2675 : u64 chunk_offset;
2676 : int ret;
2677 : int slot;
2678 : struct extent_buffer *l;
2679 : struct btrfs_key key;
2680 : struct btrfs_key found_key;
2681 : struct btrfs_block_group_cache *cache;
2682 : struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
2683 :
2684 19 : path = btrfs_alloc_path();
2685 19 : if (!path)
2686 : return -ENOMEM;
2687 :
2688 19 : path->reada = 2;
2689 19 : path->search_commit_root = 1;
2690 19 : path->skip_locking = 1;
2691 :
2692 19 : key.objectid = scrub_dev->devid;
2693 19 : key.offset = 0ull;
2694 19 : key.type = BTRFS_DEV_EXTENT_KEY;
2695 :
2696 : while (1) {
2697 131 : ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2698 131 : if (ret < 0)
2699 : break;
2700 131 : if (ret > 0) {
2701 46 : if (path->slots[0] >=
2702 23 : btrfs_header_nritems(path->nodes[0])) {
2703 14 : ret = btrfs_next_leaf(root, path);
2704 14 : if (ret)
2705 : break;
2706 : }
2707 : }
2708 :
2709 117 : l = path->nodes[0];
2710 117 : slot = path->slots[0];
2711 :
2712 117 : btrfs_item_key_to_cpu(l, &found_key, slot);
2713 :
2714 117 : if (found_key.objectid != scrub_dev->devid)
2715 : break;
2716 :
2717 113 : if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY)
2718 : break;
2719 :
2720 113 : if (found_key.offset >= end)
2721 : break;
2722 :
2723 113 : if (found_key.offset < key.offset)
2724 : break;
2725 :
2726 113 : dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
2727 : length = btrfs_dev_extent_length(l, dev_extent);
2728 :
2729 113 : if (found_key.offset + length <= start)
2730 : goto skip;
2731 :
2732 : chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
2733 : chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
2734 : chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
2735 :
2736 : /*
2737 : * get a reference on the corresponding block group to prevent
2738 : * the chunk from going away while we scrub it
2739 : */
2740 113 : cache = btrfs_lookup_block_group(fs_info, chunk_offset);
2741 :
2742 : 		/* some chunks are removed but not yet committed to disk;
2743 : 		 * continue scrubbing */
2744 113 : if (!cache)
2745 : goto skip;
2746 :
2747 113 : dev_replace->cursor_right = found_key.offset + length;
2748 113 : dev_replace->cursor_left = found_key.offset;
2749 113 : dev_replace->item_needs_writeback = 1;
2750 113 : ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid,
2751 : chunk_offset, length, found_key.offset,
2752 : is_dev_replace);
2753 :
2754 : /*
2755 : 		 * flush and submit all pending read and write bios, and
2756 : 		 * afterwards wait for them.
2757 : * Note that in the dev replace case, a read request causes
2758 : * write requests that are submitted in the read completion
2759 : * worker. Therefore in the current situation, it is required
2760 : * that all write requests are flushed, so that all read and
2761 : * write requests are really completed when bios_in_flight
2762 : * changes to 0.
2763 : */
2764 : atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
2765 113 : scrub_submit(sctx);
2766 113 : mutex_lock(&sctx->wr_ctx.wr_lock);
2767 113 : scrub_wr_submit(sctx);
2768 113 : mutex_unlock(&sctx->wr_ctx.wr_lock);
2769 :
2770 3451 : wait_event(sctx->list_wait,
2771 : atomic_read(&sctx->bios_in_flight) == 0);
2772 113 : atomic_inc(&fs_info->scrubs_paused);
2773 113 : wake_up(&fs_info->scrub_pause_wait);
2774 :
2775 : /*
2776 : 		 * This must happen before we decrease @scrub_paused.
2777 : 		 * Make sure we don't block a transaction commit while
2778 : 		 * we are waiting for pending workers to finish.
2779 : */
2780 800 : wait_event(sctx->list_wait,
2781 : atomic_read(&sctx->workers_pending) == 0);
2782 : atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
2783 :
2784 113 : mutex_lock(&fs_info->scrub_lock);
2785 113 : __scrub_blocked_if_needed(fs_info);
2786 : atomic_dec(&fs_info->scrubs_paused);
2787 113 : mutex_unlock(&fs_info->scrub_lock);
2788 113 : wake_up(&fs_info->scrub_pause_wait);
2789 :
2790 113 : btrfs_put_block_group(cache);
2791 113 : if (ret)
2792 : break;
2793 158 : if (is_dev_replace &&
2794 : atomic64_read(&dev_replace->num_write_errors) > 0) {
2795 : ret = -EIO;
2796 : break;
2797 : }
2798 112 : if (sctx->stat.malloc_errors > 0) {
2799 : ret = -ENOMEM;
2800 : break;
2801 : }
2802 :
2803 112 : dev_replace->cursor_left = dev_replace->cursor_right;
2804 112 : dev_replace->item_needs_writeback = 1;
2805 : skip:
2806 112 : key.offset = found_key.offset + length;
2807 112 : btrfs_release_path(path);
2808 112 : }
2809 :
2810 19 : btrfs_free_path(path);
2811 :
2812 : /*
2813 : * ret can still be 1 from search_slot or next_leaf,
2814 : * that's not an error
2815 : */
2816 19 : return ret < 0 ? ret : 0;
2817 : }
2818 :
2819 11 : static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
2820 : struct btrfs_device *scrub_dev)
2821 : {
2822 : int i;
2823 : u64 bytenr;
2824 : u64 gen;
2825 : int ret;
2826 11 : struct btrfs_root *root = sctx->dev_root;
2827 :
2828 22 : if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
2829 : return -EIO;
2830 :
2831 11 : gen = root->fs_info->last_trans_committed;
2832 :
2833 33 : for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
2834 : bytenr = btrfs_sb_offset(i);
2835 33 : if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->total_bytes)
2836 : break;
2837 :
2838 22 : ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
2839 : scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
2840 : NULL, 1, bytenr);
2841 22 : if (ret)
2842 : return ret;
2843 : }
2844 48 : wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
2845 :
2846 : return 0;
2847 : }
2848 :
2849 : /*
2850 :  * get a reference on fs_info->scrub_workers. Start the workers if necessary.
2851 : */
2852 19 : static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
2853 : int is_dev_replace)
2854 : {
2855 : int ret = 0;
2856 : int flags = WQ_FREEZABLE | WQ_UNBOUND;
2857 19 : int max_active = fs_info->thread_pool_size;
2858 :
2859 19 : if (fs_info->scrub_workers_refcnt == 0) {
2860 16 : if (is_dev_replace)
2861 8 : fs_info->scrub_workers =
2862 8 : btrfs_alloc_workqueue("btrfs-scrub", flags,
2863 : 1, 4);
2864 : else
2865 8 : fs_info->scrub_workers =
2866 8 : btrfs_alloc_workqueue("btrfs-scrub", flags,
2867 : max_active, 4);
2868 16 : if (!fs_info->scrub_workers) {
2869 : ret = -ENOMEM;
2870 : goto out;
2871 : }
2872 16 : fs_info->scrub_wr_completion_workers =
2873 16 : btrfs_alloc_workqueue("btrfs-scrubwrc", flags,
2874 : max_active, 2);
2875 16 : if (!fs_info->scrub_wr_completion_workers) {
2876 : ret = -ENOMEM;
2877 : goto out;
2878 : }
2879 16 : fs_info->scrub_nocow_workers =
2880 16 : btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0);
2881 16 : if (!fs_info->scrub_nocow_workers) {
2882 : ret = -ENOMEM;
2883 : goto out;
2884 : }
2885 : }
2886 19 : ++fs_info->scrub_workers_refcnt;
2887 : out:
2888 19 : return ret;
2889 : }
2890 :
2891 19 : static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
2892 : {
2893 19 : if (--fs_info->scrub_workers_refcnt == 0) {
2894 16 : btrfs_destroy_workqueue(fs_info->scrub_workers);
2895 16 : btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
2896 16 : btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
2897 : }
2898 19 : WARN_ON(fs_info->scrub_workers_refcnt < 0);
2899 19 : }
2900 :
2901 19 : int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
2902 : u64 end, struct btrfs_scrub_progress *progress,
2903 : int readonly, int is_dev_replace)
2904 : {
2905 : struct scrub_ctx *sctx;
2906 : int ret;
2907 19 : struct btrfs_device *dev;
2908 : struct rcu_string *name;
2909 :
2910 19 : if (btrfs_fs_closing(fs_info))
2911 : return -EINVAL;
2912 :
2913 : /*
2914 : * check some assumptions
2915 : */
2916 19 : if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) {
2917 0 : btrfs_err(fs_info,
2918 : "scrub: size assumption nodesize == leafsize (%d == %d) fails",
2919 : fs_info->chunk_root->nodesize,
2920 : fs_info->chunk_root->leafsize);
2921 0 : return -EINVAL;
2922 : }
2923 :
2924 19 : if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) {
2925 : /*
2926 : 		 * with the way scrub is implemented, it cannot calculate
2927 : 		 * the checksum in this case. Do not handle this situation
2928 : 		 * at all because it won't ever happen.
2929 : */
2930 0 : btrfs_err(fs_info,
2931 : "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
2932 : fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN);
2933 0 : return -EINVAL;
2934 : }
2935 :
2936 19 : if (fs_info->chunk_root->sectorsize != PAGE_SIZE) {
2937 : /* not supported for data w/o checksums */
2938 0 : btrfs_err(fs_info,
2939 : "scrub: size assumption sectorsize != PAGE_SIZE "
2940 : "(%d != %lu) fails",
2941 : fs_info->chunk_root->sectorsize, PAGE_SIZE);
2942 0 : return -EINVAL;
2943 : }
2944 :
2945 19 : if (fs_info->chunk_root->nodesize >
2946 19 : PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
2947 : fs_info->chunk_root->sectorsize >
2948 : PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
2949 : /*
2950 : * would exhaust the array bounds of pagev member in
2951 : * struct scrub_block
2952 : */
2953 0 : btrfs_err(fs_info, "scrub: size assumption nodesize and sectorsize "
2954 : "<= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails",
2955 : fs_info->chunk_root->nodesize,
2956 : SCRUB_MAX_PAGES_PER_BLOCK,
2957 : fs_info->chunk_root->sectorsize,
2958 : SCRUB_MAX_PAGES_PER_BLOCK);
2959 0 : return -EINVAL;
2960 : }
2961 :
2962 :
2963 19 : mutex_lock(&fs_info->fs_devices->device_list_mutex);
2964 19 : dev = btrfs_find_device(fs_info, devid, NULL, NULL);
2965 19 : if (!dev || (dev->missing && !is_dev_replace)) {
2966 0 : mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2967 0 : return -ENODEV;
2968 : }
2969 :
2970 19 : if (!is_dev_replace && !readonly && !dev->writeable) {
2971 0 : mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2972 : rcu_read_lock();
2973 0 : name = rcu_dereference(dev->name);
2974 0 : btrfs_err(fs_info, "scrub: device %s is not writable",
2975 : name->str);
2976 : rcu_read_unlock();
2977 0 : return -EROFS;
2978 : }
2979 :
2980 19 : mutex_lock(&fs_info->scrub_lock);
2981 19 : if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
2982 0 : mutex_unlock(&fs_info->scrub_lock);
2983 0 : mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2984 0 : return -EIO;
2985 : }
2986 :
2987 19 : btrfs_dev_replace_lock(&fs_info->dev_replace);
2988 19 : if (dev->scrub_device ||
2989 11 : (!is_dev_replace &&
2990 11 : btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
2991 0 : btrfs_dev_replace_unlock(&fs_info->dev_replace);
2992 0 : mutex_unlock(&fs_info->scrub_lock);
2993 0 : mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2994 0 : return -EINPROGRESS;
2995 : }
2996 19 : btrfs_dev_replace_unlock(&fs_info->dev_replace);
2997 :
2998 19 : ret = scrub_workers_get(fs_info, is_dev_replace);
2999 19 : if (ret) {
3000 0 : mutex_unlock(&fs_info->scrub_lock);
3001 0 : mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3002 0 : return ret;
3003 : }
3004 :
3005 19 : sctx = scrub_setup_ctx(dev, is_dev_replace);
3006 19 : if (IS_ERR(sctx)) {
3007 0 : mutex_unlock(&fs_info->scrub_lock);
3008 0 : mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3009 0 : scrub_workers_put(fs_info);
3010 0 : return PTR_ERR(sctx);
3011 : }
3012 19 : sctx->readonly = readonly;
3013 19 : dev->scrub_device = sctx;
3014 19 : mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3015 :
3016 : /*
3017 : 	 * by checking @scrub_pause_req here, we can avoid a
3018 : 	 * race between committing a transaction and scrubbing.
3019 : */
3020 19 : __scrub_blocked_if_needed(fs_info);
3021 19 : atomic_inc(&fs_info->scrubs_running);
3022 19 : mutex_unlock(&fs_info->scrub_lock);
3023 :
3024 19 : if (!is_dev_replace) {
3025 : /*
3026 : 		 * by holding the device list mutex, we serialize against
3027 : 		 * the super block writes that a log tree sync can kick off.
3028 : */
3029 11 : mutex_lock(&fs_info->fs_devices->device_list_mutex);
3030 11 : ret = scrub_supers(sctx, dev);
3031 11 : mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3032 : }
3033 :
3034 19 : if (!ret)
3035 19 : ret = scrub_enumerate_chunks(sctx, dev, start, end,
3036 : is_dev_replace);
3037 :
3038 19 : wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3039 : atomic_dec(&fs_info->scrubs_running);
3040 19 : wake_up(&fs_info->scrub_pause_wait);
3041 :
3042 19 : wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
3043 :
3044 19 : if (progress)
3045 19 : memcpy(progress, &sctx->stat, sizeof(*progress));
3046 :
3047 19 : mutex_lock(&fs_info->scrub_lock);
3048 19 : dev->scrub_device = NULL;
3049 19 : scrub_workers_put(fs_info);
3050 19 : mutex_unlock(&fs_info->scrub_lock);
3051 :
3052 19 : scrub_free_ctx(sctx);
3053 :
3054 19 : return ret;
3055 : }
3056 :
3057 2098 : void btrfs_scrub_pause(struct btrfs_root *root)
3058 : {
3059 2098 : struct btrfs_fs_info *fs_info = root->fs_info;
3060 :
3061 2098 : mutex_lock(&fs_info->scrub_lock);
3062 2098 : atomic_inc(&fs_info->scrub_pause_req);
3063 2100 : while (atomic_read(&fs_info->scrubs_paused) !=
3064 : atomic_read(&fs_info->scrubs_running)) {
3065 2 : mutex_unlock(&fs_info->scrub_lock);
3066 113 : wait_event(fs_info->scrub_pause_wait,
3067 : atomic_read(&fs_info->scrubs_paused) ==
3068 : atomic_read(&fs_info->scrubs_running));
3069 2 : mutex_lock(&fs_info->scrub_lock);
3070 : }
3071 2098 : mutex_unlock(&fs_info->scrub_lock);
3072 2098 : }
3073 :
3074 2098 : void btrfs_scrub_continue(struct btrfs_root *root)
3075 : {
3076 2098 : struct btrfs_fs_info *fs_info = root->fs_info;
3077 :
3078 2098 : atomic_dec(&fs_info->scrub_pause_req);
3079 2098 : wake_up(&fs_info->scrub_pause_wait);
3080 2098 : }
3081 :
3082 223 : int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
3083 : {
3084 223 : mutex_lock(&fs_info->scrub_lock);
3085 223 : if (!atomic_read(&fs_info->scrubs_running)) {
3086 222 : mutex_unlock(&fs_info->scrub_lock);
3087 222 : return -ENOTCONN;
3088 : }
3089 :
3090 1 : atomic_inc(&fs_info->scrub_cancel_req);
3091 2 : while (atomic_read(&fs_info->scrubs_running)) {
3092 1 : mutex_unlock(&fs_info->scrub_lock);
3093 5 : wait_event(fs_info->scrub_pause_wait,
3094 : atomic_read(&fs_info->scrubs_running) == 0);
3095 1 : mutex_lock(&fs_info->scrub_lock);
3096 : }
3097 : atomic_dec(&fs_info->scrub_cancel_req);
3098 1 : mutex_unlock(&fs_info->scrub_lock);
3099 :
3100 1 : return 0;
3101 : }
3102 :
3103 0 : int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
3104 : struct btrfs_device *dev)
3105 : {
3106 : struct scrub_ctx *sctx;
3107 :
3108 0 : mutex_lock(&fs_info->scrub_lock);
3109 0 : sctx = dev->scrub_device;
3110 0 : if (!sctx) {
3111 0 : mutex_unlock(&fs_info->scrub_lock);
3112 0 : return -ENOTCONN;
3113 : }
3114 0 : atomic_inc(&sctx->cancel_req);
3115 0 : while (dev->scrub_device) {
3116 0 : mutex_unlock(&fs_info->scrub_lock);
3117 0 : wait_event(fs_info->scrub_pause_wait,
3118 : dev->scrub_device == NULL);
3119 0 : mutex_lock(&fs_info->scrub_lock);
3120 : }
3121 0 : mutex_unlock(&fs_info->scrub_lock);
3122 :
3123 0 : return 0;
3124 : }
3125 :
3126 12 : int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
3127 : struct btrfs_scrub_progress *progress)
3128 : {
3129 : struct btrfs_device *dev;
3130 : struct scrub_ctx *sctx = NULL;
3131 :
3132 12 : mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
3133 12 : dev = btrfs_find_device(root->fs_info, devid, NULL, NULL);
3134 12 : if (dev)
3135 12 : sctx = dev->scrub_device;
3136 12 : if (sctx)
3137 3 : memcpy(progress, &sctx->stat, sizeof(*progress));
3138 12 : mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
3139 :
3140 12 : return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
3141 : }
3142 :
3143 102237 : static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
3144 : u64 extent_logical, u64 extent_len,
3145 : u64 *extent_physical,
3146 : struct btrfs_device **extent_dev,
3147 : int *extent_mirror_num)
3148 : {
3149 : u64 mapped_length;
3150 102237 : struct btrfs_bio *bbio = NULL;
3151 : int ret;
3152 :
3153 102237 : mapped_length = extent_len;
3154 102237 : ret = btrfs_map_block(fs_info, READ, extent_logical,
3155 : &mapped_length, &bbio, 0);
3156 204474 : if (ret || !bbio || mapped_length < extent_len ||
3157 102237 : !bbio->stripes[0].dev->bdev) {
3158 0 : kfree(bbio);
3159 102237 : return;
3160 : }
3161 :
3162 102237 : *extent_physical = bbio->stripes[0].physical;
3163 102237 : *extent_mirror_num = bbio->mirror_num;
3164 102237 : *extent_dev = bbio->stripes[0].dev;
3165 102237 : kfree(bbio);
3166 : }
3167 :
3168 19 : static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
3169 : struct scrub_wr_ctx *wr_ctx,
3170 : struct btrfs_fs_info *fs_info,
3171 : struct btrfs_device *dev,
3172 : int is_dev_replace)
3173 : {
3174 19 : WARN_ON(wr_ctx->wr_curr_bio != NULL);
3175 :
3176 19 : mutex_init(&wr_ctx->wr_lock);
3177 19 : wr_ctx->wr_curr_bio = NULL;
3178 19 : if (!is_dev_replace)
3179 : return 0;
3180 :
3181 8 : WARN_ON(!dev->bdev);
3182 8 : wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO,
3183 : bio_get_nr_vecs(dev->bdev));
3184 8 : wr_ctx->tgtdev = dev;
3185 : atomic_set(&wr_ctx->flush_all_writes, 0);
3186 : return 0;
3187 : }
3188 :
3189 19 : static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx)
3190 : {
3191 19 : mutex_lock(&wr_ctx->wr_lock);
3192 19 : kfree(wr_ctx->wr_curr_bio);
3193 19 : wr_ctx->wr_curr_bio = NULL;
3194 19 : mutex_unlock(&wr_ctx->wr_lock);
3195 19 : }
3196 :
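/*
 * Dev-replace copy path for data without checksums: copy_nocow_pages()
 * queues copy_nocow_pages_worker(), which resolves the logical address to
 * the inodes referencing it (record_inode_for_nocow), reads the pages
 * through the page cache (copy_nocow_pages_for_inode) and writes them to
 * the target device with write_page_nocow().
 */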
3197 1104 : static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
3198 : int mirror_num, u64 physical_for_dev_replace)
3199 : {
3200 : struct scrub_copy_nocow_ctx *nocow_ctx;
3201 1104 : struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
3202 :
3203 1104 : nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS);
3204 1104 : if (!nocow_ctx) {
3205 : spin_lock(&sctx->stat_lock);
3206 0 : sctx->stat.malloc_errors++;
3207 : spin_unlock(&sctx->stat_lock);
3208 0 : return -ENOMEM;
3209 : }
3210 :
3211 1104 : scrub_pending_trans_workers_inc(sctx);
3212 :
3213 1104 : nocow_ctx->sctx = sctx;
3214 1104 : nocow_ctx->logical = logical;
3215 1104 : nocow_ctx->len = len;
3216 1104 : nocow_ctx->mirror_num = mirror_num;
3217 1104 : nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
3218 1104 : btrfs_init_work(&nocow_ctx->work, btrfs_scrubnc_helper,
3219 : copy_nocow_pages_worker, NULL, NULL);
3220 1104 : INIT_LIST_HEAD(&nocow_ctx->inodes);
3221 1104 : btrfs_queue_work(fs_info->scrub_nocow_workers,
3222 : &nocow_ctx->work);
3223 :
3224 1104 : return 0;
3225 : }
3226 :
3227 1011 : static int record_inode_for_nocow(u64 inum, u64 offset, u64 root, void *ctx)
3228 : {
3229 : struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
3230 : struct scrub_nocow_inode *nocow_inode;
3231 :
3232 1011 : nocow_inode = kzalloc(sizeof(*nocow_inode), GFP_NOFS);
3233 1011 : if (!nocow_inode)
3234 : return -ENOMEM;
3235 1011 : nocow_inode->inum = inum;
3236 1011 : nocow_inode->offset = offset;
3237 1011 : nocow_inode->root = root;
3238 1011 : list_add_tail(&nocow_inode->list, &nocow_ctx->inodes);
3239 1011 : return 0;
3240 : }
3241 :
3242 : #define COPY_COMPLETE 1
3243 :
3244 1104 : static void copy_nocow_pages_worker(struct btrfs_work *work)
3245 : {
3246 1104 : struct scrub_copy_nocow_ctx *nocow_ctx =
3247 : container_of(work, struct scrub_copy_nocow_ctx, work);
3248 1104 : struct scrub_ctx *sctx = nocow_ctx->sctx;
3249 1104 : u64 logical = nocow_ctx->logical;
3250 1104 : u64 len = nocow_ctx->len;
3251 1104 : int mirror_num = nocow_ctx->mirror_num;
3252 1104 : u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
3253 : int ret;
3254 : struct btrfs_trans_handle *trans = NULL;
3255 : struct btrfs_fs_info *fs_info;
3256 : struct btrfs_path *path;
3257 : struct btrfs_root *root;
3258 : int not_written = 0;
3259 :
3260 1104 : fs_info = sctx->dev_root->fs_info;
3261 1104 : root = fs_info->extent_root;
3262 :
3263 1104 : path = btrfs_alloc_path();
3264 1104 : if (!path) {
3265 : spin_lock(&sctx->stat_lock);
3266 0 : sctx->stat.malloc_errors++;
3267 : spin_unlock(&sctx->stat_lock);
3268 : not_written = 1;
3269 0 : goto out;
3270 : }
3271 :
3272 1104 : trans = btrfs_join_transaction(root);
3273 1104 : if (IS_ERR(trans)) {
3274 : not_written = 1;
3275 : goto out;
3276 : }
3277 :
3278 1104 : ret = iterate_inodes_from_logical(logical, fs_info, path,
3279 : record_inode_for_nocow, nocow_ctx);
3280 1104 : if (ret != 0 && ret != -ENOENT) {
3281 0 : btrfs_warn(fs_info, "iterate_inodes_from_logical() failed: log %llu, "
3282 : "phys %llu, len %llu, mir %u, ret %d",
3283 : logical, physical_for_dev_replace, len, mirror_num,
3284 : ret);
3285 : not_written = 1;
3286 0 : goto out;
3287 : }
3288 :
3289 1104 : btrfs_end_transaction(trans, root);
3290 : trans = NULL;
3291 3312 : while (!list_empty(&nocow_ctx->inodes)) {
3292 : struct scrub_nocow_inode *entry;
3293 1011 : entry = list_first_entry(&nocow_ctx->inodes,
3294 : struct scrub_nocow_inode,
3295 : list);
3296 1011 : list_del_init(&entry->list);
3297 1011 : ret = copy_nocow_pages_for_inode(entry->inum, entry->offset,
3298 : entry->root, nocow_ctx);
3299 1011 : kfree(entry);
3300 1011 : if (ret == COPY_COMPLETE) {
3301 : ret = 0;
3302 : break;
3303 0 : } else if (ret) {
3304 : break;
3305 : }
3306 : }
3307 : out:
3308 2208 : while (!list_empty(&nocow_ctx->inodes)) {
3309 : struct scrub_nocow_inode *entry;
3310 0 : entry = list_first_entry(&nocow_ctx->inodes,
3311 : struct scrub_nocow_inode,
3312 : list);
3313 0 : list_del_init(&entry->list);
3314 0 : kfree(entry);
3315 : }
3316 1104 : if (trans && !IS_ERR(trans))
3317 0 : btrfs_end_transaction(trans, root);
3318 1104 : if (not_written)
3319 0 : btrfs_dev_replace_stats_inc(&fs_info->dev_replace.
3320 : num_uncorrectable_read_errors);
3321 :
3322 1104 : btrfs_free_path(path);
3323 1104 : kfree(nocow_ctx);
3324 :
3325 1104 : scrub_pending_trans_workers_dec(sctx);
3326 1104 : }
3327 :
3328 1011 : static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
3329 : struct scrub_copy_nocow_ctx *nocow_ctx)
3330 : {
3331 1011 : struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
3332 : struct btrfs_key key;
3333 : struct inode *inode;
3334 : struct page *page;
3335 : struct btrfs_root *local_root;
3336 : struct btrfs_ordered_extent *ordered;
3337 : struct extent_map *em;
3338 1011 : struct extent_state *cached_state = NULL;
3339 : struct extent_io_tree *io_tree;
3340 : u64 physical_for_dev_replace;
3341 1011 : u64 len = nocow_ctx->len;
3342 1011 : u64 lockstart = offset, lockend = offset + len - 1;
3343 : unsigned long index;
3344 : int srcu_index;
3345 : int ret = 0;
3346 : int err = 0;
3347 :
3348 1011 : key.objectid = root;
3349 1011 : key.type = BTRFS_ROOT_ITEM_KEY;
3350 1011 : key.offset = (u64)-1;
3351 :
3352 1011 : srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
3353 :
3354 : local_root = btrfs_read_fs_root_no_name(fs_info, &key);
3355 1011 : if (IS_ERR(local_root)) {
3356 : srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
3357 0 : return PTR_ERR(local_root);
3358 : }
3359 :
3360 1011 : key.type = BTRFS_INODE_ITEM_KEY;
3361 1011 : key.objectid = inum;
3362 1011 : key.offset = 0;
3363 1011 : inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
3364 : srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
3365 1011 : if (IS_ERR(inode))
3366 0 : return PTR_ERR(inode);
3367 :
3368 : /* Avoid truncate/dio/punch hole.. */
3369 1011 : mutex_lock(&inode->i_mutex);
3370 1011 : inode_dio_wait(inode);
3371 :
3372 1011 : physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
3373 1011 : io_tree = &BTRFS_I(inode)->io_tree;
3374 :
3375 1011 : lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state);
3376 1011 : ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
3377 1011 : if (ordered) {
3378 0 : btrfs_put_ordered_extent(ordered);
3379 0 : goto out_unlock;
3380 : }
3381 :
3382 1011 : em = btrfs_get_extent(inode, NULL, 0, lockstart, len, 0);
3383 1011 : if (IS_ERR(em)) {
3384 0 : ret = PTR_ERR(em);
3385 0 : goto out_unlock;
3386 : }
3387 :
3388 : /*
3389 : * This extent does not actually cover the logical extent anymore;
3390 : * move on to the next inode.
3391 : */
3392 2022 : if (em->block_start > nocow_ctx->logical ||
3393 1011 : em->block_start + em->block_len < nocow_ctx->logical + len) {
3394 0 : free_extent_map(em);
3395 0 : goto out_unlock;
3396 : }
3397 1011 : free_extent_map(em);
3398 :
3399 3033 : while (len >= PAGE_CACHE_SIZE) {
3400 1011 : index = offset >> PAGE_CACHE_SHIFT;
3401 : again:
3402 1011 : page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
3403 1011 : if (!page) {
3404 0 : btrfs_err(fs_info, "find_or_create_page() failed");
3405 : ret = -ENOMEM;
3406 0 : goto out;
3407 : }
3408 :
3409 1011 : if (PageUptodate(page)) {
3410 947 : if (PageDirty(page))
3411 : goto next_page;
3412 : } else {
3413 : ClearPageError(page);
3414 64 : err = extent_read_full_page_nolock(io_tree, page,
3415 : btrfs_get_extent,
3416 : nocow_ctx->mirror_num);
3417 64 : if (err) {
3418 : ret = err;
3419 : goto next_page;
3420 : }
3421 :
3422 64 : lock_page(page);
3423 : /*
3424 : * If the page has been removed from the page cache, the
3425 : * data on it is meaningless: it may be a stale copy, and
3426 : * the new data may have been written to a different page
3427 : * in the page cache.
3428 : */
3429 64 : if (page->mapping != inode->i_mapping) {
3430 0 : unlock_page(page);
3431 0 : page_cache_release(page);
3432 0 : goto again;
3433 : }
3434 64 : if (!PageUptodate(page)) {
3435 : ret = -EIO;
3436 : goto next_page;
3437 : }
3438 : }
3439 1011 : err = write_page_nocow(nocow_ctx->sctx,
3440 : physical_for_dev_replace, page);
3441 1011 : if (err)
3442 : ret = err;
3443 : next_page:
3444 1011 : unlock_page(page);
3445 1011 : page_cache_release(page);
3446 :
3447 1011 : if (ret)
3448 : break;
3449 :
3450 1011 : offset += PAGE_CACHE_SIZE;
3451 1011 : physical_for_dev_replace += PAGE_CACHE_SIZE;
3452 1011 : len -= PAGE_CACHE_SIZE;
3453 : }
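 : /*
 :  * COPY_COMPLETE tells the caller that no further inodes need to be
 :  * visited for this extent.
 :  */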
3454 : ret = COPY_COMPLETE;
3455 : out_unlock:
3456 1011 : unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
3457 : GFP_NOFS);
3458 : out:
3459 1011 : mutex_unlock(&inode->i_mutex);
3460 1011 : iput(inode);
3461 1011 : return ret;
3462 : }
3463 :
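 : /*
 :  * Write a single page to the dev-replace target device at the given
 :  * physical offset, using a synchronous bio.
 :  */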
3464 1011 : static int write_page_nocow(struct scrub_ctx *sctx,
3465 : u64 physical_for_dev_replace, struct page *page)
3466 : {
3467 : struct bio *bio;
3468 : struct btrfs_device *dev;
3469 : int ret;
3470 :
3471 1011 : dev = sctx->wr_ctx.tgtdev;
3472 1011 : if (!dev)
3473 : return -EIO;
3474 1011 : if (!dev->bdev) {
3475 0 : printk_ratelimited(KERN_WARNING
3476 : "BTRFS: scrub write_page_nocow(bdev == NULL) is unexpected!\n");
3477 : return -EIO;
3478 : }
3479 1011 : bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
3480 1011 : if (!bio) {
3481 : spin_lock(&sctx->stat_lock);
3482 0 : sctx->stat.malloc_errors++;
3483 : spin_unlock(&sctx->stat_lock);
3484 0 : return -ENOMEM;
3485 : }
3486 1011 : bio->bi_iter.bi_size = 0;
3487 1011 : bio->bi_iter.bi_sector = physical_for_dev_replace >> 9;
3488 1011 : bio->bi_bdev = dev->bdev;
3489 1011 : ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
3490 1011 : if (ret != PAGE_CACHE_SIZE) {
3491 : leave_with_eio:
3492 0 : bio_put(bio);
3493 0 : btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
3494 0 : return -EIO;
3495 : }
3496 :
3497 1011 : if (btrfsic_submit_bio_wait(WRITE_SYNC, bio))
3498 : goto leave_with_eio;
3499 :
3500 1011 : bio_put(bio);
3501 1011 : return 0;
3502 : }