LCOV - code coverage report
Current view: top level - fs/btrfs - scrub.c
Test: btrfstest.info
Date: 2014-11-28

                    Hit      Total   Coverage
Lines:              846       1400     60.4 %
Functions:           47         59     79.7 %

          Line data    Source code
       1             : /*
       2             :  * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
       3             :  *
       4             :  * This program is free software; you can redistribute it and/or
       5             :  * modify it under the terms of the GNU General Public
       6             :  * License v2 as published by the Free Software Foundation.
       7             :  *
       8             :  * This program is distributed in the hope that it will be useful,
       9             :  * but WITHOUT ANY WARRANTY; without even the implied warranty of
      10             :  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      11             :  * General Public License for more details.
      12             :  *
      13             :  * You should have received a copy of the GNU General Public
      14             :  * License along with this program; if not, write to the
      15             :  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
      16             :  * Boston, MA 02111-1307, USA.
      17             :  */
      18             : 
      19             : #include <linux/blkdev.h>
      20             : #include <linux/ratelimit.h>
      21             : #include "ctree.h"
      22             : #include "volumes.h"
      23             : #include "disk-io.h"
      24             : #include "ordered-data.h"
      25             : #include "transaction.h"
      26             : #include "backref.h"
      27             : #include "extent_io.h"
      28             : #include "dev-replace.h"
      29             : #include "check-integrity.h"
      30             : #include "rcu-string.h"
      31             : #include "raid56.h"
      32             : 
      33             : /*
      34             :  * This is only the first step towards a full-featured scrub. It reads all
      35             :  * extents and super blocks and verifies their checksums. In case a bad
      36             :  * checksum is found or the extent cannot be read, good data will be written
      37             :  * back if any can be found.
      38             :  *
      39             :  * Future enhancements:
      40             :  *  - In case an unrepairable extent is encountered, track which files are
      41             :  *    affected and report them
      42             :  *  - track and record media errors, throw out bad devices
      43             :  *  - add a mode to also read unallocated space
      44             :  */
      45             : 
      46             : struct scrub_block;
      47             : struct scrub_ctx;
      48             : 
      49             : /*
      50             :  * the following three values only influence performance.
      51             :  * The first two configure an upper limit for the number of
      52             :  * (dynamically allocated) pages that are added to a bio; the last one
      53             :  * configures the number of parallel and outstanding I/O operations.
      54             :  */
      55             : #define SCRUB_PAGES_PER_RD_BIO  32      /* 128k per bio */
      56             : #define SCRUB_PAGES_PER_WR_BIO  32      /* 128k per bio */
      57             : #define SCRUB_BIOS_PER_SCTX     64      /* 8MB per device in flight */
      58             : 
      59             : /*
      60             :  * the following value times PAGE_SIZE needs to be large enough to match the
      61             :  * largest node/leaf/sector size that shall be supported.
      62             :  * Values larger than BTRFS_STRIPE_LEN are not supported.
      63             :  */
      64             : #define SCRUB_MAX_PAGES_PER_BLOCK       16      /* 64k per node/leaf/sector */
      65             : 
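A quick sanity check of the sizes quoted in the comments above, assuming the
common PAGE_SIZE of 4 KiB (an assumption; the byte totals scale with the actual
page size):

        SCRUB_PAGES_PER_RD_BIO * PAGE_SIZE    = 32 * 4 KiB   = 128 KiB per read bio
        SCRUB_PAGES_PER_WR_BIO * PAGE_SIZE    = 32 * 4 KiB   = 128 KiB per write bio
        SCRUB_BIOS_PER_SCTX * 128 KiB         = 64 * 128 KiB = 8 MiB in flight per device
        SCRUB_MAX_PAGES_PER_BLOCK * PAGE_SIZE = 16 * 4 KiB   = 64 KiB max node/leaf/sector
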
      66             : struct scrub_page {
      67             :         struct scrub_block      *sblock;
      68             :         struct page             *page;
      69             :         struct btrfs_device     *dev;
      70             :         u64                     flags;  /* extent flags */
      71             :         u64                     generation;
      72             :         u64                     logical;
      73             :         u64                     physical;
      74             :         u64                     physical_for_dev_replace;
      75             :         atomic_t                ref_count;
      76             :         struct {
      77             :                 unsigned int    mirror_num:8;
      78             :                 unsigned int    have_csum:1;
      79             :                 unsigned int    io_error:1;
      80             :         };
      81             :         u8                      csum[BTRFS_CSUM_SIZE];
      82             : };
      83             : 
      84             : struct scrub_bio {
      85             :         int                     index;
      86             :         struct scrub_ctx        *sctx;
      87             :         struct btrfs_device     *dev;
      88             :         struct bio              *bio;
      89             :         int                     err;
      90             :         u64                     logical;
      91             :         u64                     physical;
      92             : #if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
      93             :         struct scrub_page       *pagev[SCRUB_PAGES_PER_WR_BIO];
      94             : #else
      95             :         struct scrub_page       *pagev[SCRUB_PAGES_PER_RD_BIO];
      96             : #endif
      97             :         int                     page_count;
      98             :         int                     next_free;
      99             :         struct btrfs_work       work;
     100             : };
     101             : 
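The #if/#else above simply sizes pagev[] for the larger of the two per-bio page
limits at compile time. An equivalent formulation, purely for illustration (the
macro name SCRUB_MAX_PAGES_PER_BIO is hypothetical, not from this file):

        #define SCRUB_MAX_PAGES_PER_BIO                             \
                (SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO ? \
                 SCRUB_PAGES_PER_WR_BIO : SCRUB_PAGES_PER_RD_BIO)

        struct scrub_page       *pagev[SCRUB_MAX_PAGES_PER_BIO];
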
     102             : struct scrub_block {
     103             :         struct scrub_page       *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
     104             :         int                     page_count;
     105             :         atomic_t                outstanding_pages;
     106             :         atomic_t                ref_count; /* free mem on transition to zero */
     107             :         struct scrub_ctx        *sctx;
     108             :         struct {
     109             :                 unsigned int    header_error:1;
     110             :                 unsigned int    checksum_error:1;
     111             :                 unsigned int    no_io_error_seen:1;
     112             :                 unsigned int    generation_error:1; /* also sets header_error */
     113             :         };
     114             : };
     115             : 
     116             : struct scrub_wr_ctx {
     117             :         struct scrub_bio *wr_curr_bio;
     118             :         struct btrfs_device *tgtdev;
     119             :         int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
     120             :         atomic_t flush_all_writes;
     121             :         struct mutex wr_lock;
     122             : };
     123             : 
     124             : struct scrub_ctx {
     125             :         struct scrub_bio        *bios[SCRUB_BIOS_PER_SCTX];
     126             :         struct btrfs_root       *dev_root;
     127             :         int                     first_free;
     128             :         int                     curr;
     129             :         atomic_t                bios_in_flight;
     130             :         atomic_t                workers_pending;
     131             :         spinlock_t              list_lock;
     132             :         wait_queue_head_t       list_wait;
     133             :         u16                     csum_size;
     134             :         struct list_head        csum_list;
     135             :         atomic_t                cancel_req;
     136             :         int                     readonly;
     137             :         int                     pages_per_rd_bio;
     138             :         u32                     sectorsize;
     139             :         u32                     nodesize;
     140             :         u32                     leafsize;
     141             : 
     142             :         int                     is_dev_replace;
     143             :         struct scrub_wr_ctx     wr_ctx;
     144             : 
     145             :         /*
     146             :          * statistics
     147             :          */
     148             :         struct btrfs_scrub_progress stat;
     149             :         spinlock_t              stat_lock;
     150             : };
     151             : 
     152             : struct scrub_fixup_nodatasum {
     153             :         struct scrub_ctx        *sctx;
     154             :         struct btrfs_device     *dev;
     155             :         u64                     logical;
     156             :         struct btrfs_root       *root;
     157             :         struct btrfs_work       work;
     158             :         int                     mirror_num;
     159             : };
     160             : 
     161             : struct scrub_nocow_inode {
     162             :         u64                     inum;
     163             :         u64                     offset;
     164             :         u64                     root;
     165             :         struct list_head        list;
     166             : };
     167             : 
     168             : struct scrub_copy_nocow_ctx {
     169             :         struct scrub_ctx        *sctx;
     170             :         u64                     logical;
     171             :         u64                     len;
     172             :         int                     mirror_num;
     173             :         u64                     physical_for_dev_replace;
     174             :         struct list_head        inodes;
     175             :         struct btrfs_work       work;
     176             : };
     177             : 
     178             : struct scrub_warning {
     179             :         struct btrfs_path       *path;
     180             :         u64                     extent_item_size;
     181             :         char                    *scratch_buf;
     182             :         char                    *msg_buf;
     183             :         const char              *errstr;
     184             :         sector_t                sector;
     185             :         u64                     logical;
     186             :         struct btrfs_device     *dev;
     187             :         int                     msg_bufsize;
     188             :         int                     scratch_bufsize;
     189             : };
     190             : 
     191             : 
     192             : static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
     193             : static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
     194             : static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
     195             : static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
     196             : static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
     197             : static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
     198             :                                      struct btrfs_fs_info *fs_info,
     199             :                                      struct scrub_block *original_sblock,
     200             :                                      u64 length, u64 logical,
     201             :                                      struct scrub_block *sblocks_for_recheck);
     202             : static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
     203             :                                 struct scrub_block *sblock, int is_metadata,
     204             :                                 int have_csum, u8 *csum, u64 generation,
     205             :                                 u16 csum_size);
     206             : static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
     207             :                                          struct scrub_block *sblock,
     208             :                                          int is_metadata, int have_csum,
     209             :                                          const u8 *csum, u64 generation,
     210             :                                          u16 csum_size);
     211             : static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
     212             :                                              struct scrub_block *sblock_good,
     213             :                                              int force_write);
     214             : static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
     215             :                                             struct scrub_block *sblock_good,
     216             :                                             int page_num, int force_write);
     217             : static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
     218             : static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
     219             :                                            int page_num);
     220             : static int scrub_checksum_data(struct scrub_block *sblock);
     221             : static int scrub_checksum_tree_block(struct scrub_block *sblock);
     222             : static int scrub_checksum_super(struct scrub_block *sblock);
     223             : static void scrub_block_get(struct scrub_block *sblock);
     224             : static void scrub_block_put(struct scrub_block *sblock);
     225             : static void scrub_page_get(struct scrub_page *spage);
     226             : static void scrub_page_put(struct scrub_page *spage);
     227             : static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
     228             :                                     struct scrub_page *spage);
     229             : static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
     230             :                        u64 physical, struct btrfs_device *dev, u64 flags,
     231             :                        u64 gen, int mirror_num, u8 *csum, int force,
     232             :                        u64 physical_for_dev_replace);
     233             : static void scrub_bio_end_io(struct bio *bio, int err);
     234             : static void scrub_bio_end_io_worker(struct btrfs_work *work);
     235             : static void scrub_block_complete(struct scrub_block *sblock);
     236             : static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
     237             :                                u64 extent_logical, u64 extent_len,
     238             :                                u64 *extent_physical,
     239             :                                struct btrfs_device **extent_dev,
     240             :                                int *extent_mirror_num);
     241             : static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
     242             :                               struct scrub_wr_ctx *wr_ctx,
     243             :                               struct btrfs_fs_info *fs_info,
     244             :                               struct btrfs_device *dev,
     245             :                               int is_dev_replace);
     246             : static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx);
     247             : static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
     248             :                                     struct scrub_page *spage);
     249             : static void scrub_wr_submit(struct scrub_ctx *sctx);
     250             : static void scrub_wr_bio_end_io(struct bio *bio, int err);
     251             : static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
     252             : static int write_page_nocow(struct scrub_ctx *sctx,
     253             :                             u64 physical_for_dev_replace, struct page *page);
     254             : static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
     255             :                                       struct scrub_copy_nocow_ctx *ctx);
     256             : static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
     257             :                             int mirror_num, u64 physical_for_dev_replace);
     258             : static void copy_nocow_pages_worker(struct btrfs_work *work);
     259             : static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
     260             : static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
     261             : 
     262             : 
     263             : static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
     264             : {
     265      178996 :         atomic_inc(&sctx->bios_in_flight);
     266             : }
     267             : 
     268      178988 : static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
     269             : {
     270      178988 :         atomic_dec(&sctx->bios_in_flight);
     271      179001 :         wake_up(&sctx->list_wait);
     272      178985 : }
     273             : 
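The inc/dec pair above is one half of the usual atomic-counter/waitqueue
pattern; elsewhere in this file (outside this excerpt) callers drain the
in-flight bios with the matching wait. A minimal sketch of that counterpart,
assuming the standard wait_event() semantics:

        /* sleeps until scrub_pending_bio_dec() drops the count to 0;
         * its wake_up() call makes the condition be re-evaluated */
        wait_event(sctx->list_wait,
                   atomic_read(&sctx->bios_in_flight) == 0);
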
     274         245 : static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
     275             : {
     276         492 :         while (atomic_read(&fs_info->scrub_pause_req)) {
     277           2 :                 mutex_unlock(&fs_info->scrub_lock);
     278           8 :                 wait_event(fs_info->scrub_pause_wait,
     279             :                    atomic_read(&fs_info->scrub_pause_req) == 0);
     280           2 :                 mutex_lock(&fs_info->scrub_lock);
     281             :         }
     282         245 : }
     283             : 
     284         113 : static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
     285             : {
     286         113 :         atomic_inc(&fs_info->scrubs_paused);
     287         113 :         wake_up(&fs_info->scrub_pause_wait);
     288             : 
     289         113 :         mutex_lock(&fs_info->scrub_lock);
     290         113 :         __scrub_blocked_if_needed(fs_info);
     291             :         atomic_dec(&fs_info->scrubs_paused);
     292         113 :         mutex_unlock(&fs_info->scrub_lock);
     293             : 
     294         113 :         wake_up(&fs_info->scrub_pause_wait);
     295         113 : }
     296             : 
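For context, the pausing side of this handshake lives outside this file. A
minimal sketch of what it plausibly looks like (simplified; treat the exact
locking and surrounding function shape as assumptions, not verbatim kernel
code):

        /* request a pause, then wait until every running scrub has
         * parked itself in scrub_blocked_if_needed() */
        atomic_inc(&fs_info->scrub_pause_req);
        wait_event(fs_info->scrub_pause_wait,
                   atomic_read(&fs_info->scrubs_paused) ==
                   atomic_read(&fs_info->scrubs_running));
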
     297             : /*
     298             :  * used for workers that require transaction commits (i.e., for the
     299             :  * NOCOW case)
     300             :  */
     301        1104 : static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
     302             : {
     303        1104 :         struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
     304             : 
     305             :         /*
     306             :          * increment scrubs_running to prevent cancel requests from
      307             :          * completing as long as a worker is running. We must also
      308             :          * increment scrubs_paused to prevent deadlocking on pause
      309             :          * requests used for transaction commits (as the worker uses a
      310             :          * transaction context). It is safe to regard the worker
      311             :          * as paused for all practical matters. Effectively, we only
      312             :          * prevent cancellation requests from completing.
     313             :          */
     314        1104 :         mutex_lock(&fs_info->scrub_lock);
     315        1104 :         atomic_inc(&fs_info->scrubs_running);
     316        1104 :         atomic_inc(&fs_info->scrubs_paused);
     317        1104 :         mutex_unlock(&fs_info->scrub_lock);
     318             : 
     319             :         /*
      320             :          * The check of the @scrubs_running == @scrubs_paused condition
      321             :          * inside wait_event() is not an atomic operation, which means
      322             :          * we may inc/dec @scrubs_running/@scrubs_paused at any time.
      323             :          * Wake up @scrub_pause_wait as often as we can so that a
      324             :          * blocked transaction commit waits as little as possible.
     325             :          */
     326        1104 :         wake_up(&fs_info->scrub_pause_wait);
     327             : 
     328        1104 :         atomic_inc(&sctx->workers_pending);
     329        1104 : }
     330             : 
     331             : /* used for workers that require transaction commits */
     332        1104 : static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
     333             : {
     334        1104 :         struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
     335             : 
     336             :         /*
     337             :          * see scrub_pending_trans_workers_inc() why we're pretending
     338             :          * to be paused in the scrub counters
     339             :          */
     340        1104 :         mutex_lock(&fs_info->scrub_lock);
     341        1104 :         atomic_dec(&fs_info->scrubs_running);
     342        1104 :         atomic_dec(&fs_info->scrubs_paused);
     343        1104 :         mutex_unlock(&fs_info->scrub_lock);
     344        1104 :         atomic_dec(&sctx->workers_pending);
     345        1104 :         wake_up(&fs_info->scrub_pause_wait);
     346        1104 :         wake_up(&sctx->list_wait);
     347        1104 : }
     348             : 
     349      290492 : static void scrub_free_csums(struct scrub_ctx *sctx)
     350             : {
     351      939356 :         while (!list_empty(&sctx->csum_list)) {
     352             :                 struct btrfs_ordered_sum *sum;
     353       33940 :                 sum = list_first_entry(&sctx->csum_list,
     354             :                                        struct btrfs_ordered_sum, list);
     355       33940 :                 list_del(&sum->list);
     356       33940 :                 kfree(sum);
     357             :         }
     358      290492 : }
     359             : 
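scrub_free_csums() drains the list with the standard list_first_entry() +
list_del() + kfree() idiom. The same drain shape in self-contained user-space
C, just for illustration (hypothetical types, not kernel code):

        #include <stdlib.h>

        struct node { struct node *next; };

        /* pop-and-free until the list is empty, as the kernel loop does */
        void drain(struct node **head)
        {
                while (*head) {
                        struct node *n = *head;
                        *head = n->next;
                        free(n);
                }
        }
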
     360          19 : static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
     361             : {
     362             :         int i;
     363             : 
     364          19 :         if (!sctx)
     365          19 :                 return;
     366             : 
     367          19 :         scrub_free_wr_ctx(&sctx->wr_ctx);
     368             : 
     369             :         /* this can happen when scrub is cancelled */
     370          19 :         if (sctx->curr != -1) {
     371           0 :                 struct scrub_bio *sbio = sctx->bios[sctx->curr];
     372             : 
     373           0 :                 for (i = 0; i < sbio->page_count; i++) {
     374           0 :                         WARN_ON(!sbio->pagev[i]->page);
     375           0 :                         scrub_block_put(sbio->pagev[i]->sblock);
     376             :                 }
     377           0 :                 bio_put(sbio->bio);
     378             :         }
     379             : 
     380        1216 :         for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
     381        1216 :                 struct scrub_bio *sbio = sctx->bios[i];
     382             : 
     383        1216 :                 if (!sbio)
     384             :                         break;
     385        1216 :                 kfree(sbio);
     386             :         }
     387             : 
     388          19 :         scrub_free_csums(sctx);
     389          19 :         kfree(sctx);
     390             : }
     391             : 
     392             : static noinline_for_stack
     393          19 : struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
     394             : {
     395             :         struct scrub_ctx *sctx;
     396             :         int             i;
     397          19 :         struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
     398             :         int pages_per_rd_bio;
     399             :         int ret;
     400             : 
     401             :         /*
     402             :          * the setting of pages_per_rd_bio is correct for scrub but might
     403             :          * be wrong for the dev_replace code where we might read from
     404             :          * different devices in the initial huge bios. However, that
     405             :          * code is able to correctly handle the case when adding a page
     406             :          * to a bio fails.
     407             :          */
     408          19 :         if (dev->bdev)
     409          19 :                 pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO,
     410             :                                          bio_get_nr_vecs(dev->bdev));
     411             :         else
     412             :                 pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
     413          19 :         sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
     414          19 :         if (!sctx)
     415             :                 goto nomem;
     416          19 :         sctx->is_dev_replace = is_dev_replace;
     417          19 :         sctx->pages_per_rd_bio = pages_per_rd_bio;
     418          19 :         sctx->curr = -1;
     419          19 :         sctx->dev_root = dev->dev_root;
     420        1216 :         for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
     421             :                 struct scrub_bio *sbio;
     422             : 
     423        1216 :                 sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
     424        1216 :                 if (!sbio)
     425             :                         goto nomem;
     426        1216 :                 sctx->bios[i] = sbio;
     427             : 
     428        1216 :                 sbio->index = i;
     429        1216 :                 sbio->sctx = sctx;
     430        1216 :                 sbio->page_count = 0;
     431        1216 :                 btrfs_init_work(&sbio->work, btrfs_scrub_helper,
     432             :                                 scrub_bio_end_io_worker, NULL, NULL);
     433             : 
     434        1216 :                 if (i != SCRUB_BIOS_PER_SCTX - 1)
     435        1197 :                         sctx->bios[i]->next_free = i + 1;
     436             :                 else
     437          19 :                         sctx->bios[i]->next_free = -1;
     438             :         }
     439          19 :         sctx->first_free = 0;
     440          19 :         sctx->nodesize = dev->dev_root->nodesize;
     441          19 :         sctx->leafsize = dev->dev_root->leafsize;
     442          19 :         sctx->sectorsize = dev->dev_root->sectorsize;
     443             :         atomic_set(&sctx->bios_in_flight, 0);
     444             :         atomic_set(&sctx->workers_pending, 0);
     445             :         atomic_set(&sctx->cancel_req, 0);
     446          38 :         sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
     447          19 :         INIT_LIST_HEAD(&sctx->csum_list);
     448             : 
     449          19 :         spin_lock_init(&sctx->list_lock);
     450          19 :         spin_lock_init(&sctx->stat_lock);
     451          19 :         init_waitqueue_head(&sctx->list_wait);
     452             : 
     453          19 :         ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info,
     454             :                                  fs_info->dev_replace.tgtdev, is_dev_replace);
     455          19 :         if (ret) {
     456           0 :                 scrub_free_ctx(sctx);
     457           0 :                 return ERR_PTR(ret);
     458             :         }
     459             :         return sctx;
     460             : 
     461             : nomem:
     462           0 :         scrub_free_ctx(sctx);
     463             :         return ERR_PTR(-ENOMEM);
     464             : }
     465             : 
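The setup loop above threads all SCRUB_BIOS_PER_SCTX bios into an index-based
free list: sctx->first_free points at the head, each bio's next_free at its
successor, and -1 terminates the list. A self-contained user-space sketch of
the same pattern (illustrative only):

        #include <stdio.h>

        #define N_ITEMS 4

        struct item { int next_free; };

        int main(void)
        {
                struct item items[N_ITEMS];
                int first_free = 0;

                /* link each slot to its successor; -1 terminates */
                for (int i = 0; i < N_ITEMS; i++)
                        items[i].next_free = (i == N_ITEMS - 1) ? -1 : i + 1;

                /* allocate = pop the head, as the scrub code does per bio */
                int got = first_free;
                first_free = items[got].next_free;
                printf("allocated slot %d, new head %d\n", got, first_free);
                return 0;
        }
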
     466           0 : static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
     467             :                                      void *warn_ctx)
     468             : {
     469             :         u64 isize;
     470             :         u32 nlink;
     471             :         int ret;
     472             :         int i;
     473             :         struct extent_buffer *eb;
     474             :         struct btrfs_inode_item *inode_item;
     475             :         struct scrub_warning *swarn = warn_ctx;
     476           0 :         struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
     477             :         struct inode_fs_paths *ipath = NULL;
     478             :         struct btrfs_root *local_root;
     479             :         struct btrfs_key root_key;
     480             : 
     481           0 :         root_key.objectid = root;
     482           0 :         root_key.type = BTRFS_ROOT_ITEM_KEY;
     483           0 :         root_key.offset = (u64)-1;
     484             :         local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
     485           0 :         if (IS_ERR(local_root)) {
     486           0 :                 ret = PTR_ERR(local_root);
     487           0 :                 goto err;
     488             :         }
     489             : 
     490           0 :         ret = inode_item_info(inum, 0, local_root, swarn->path);
     491           0 :         if (ret) {
     492           0 :                 btrfs_release_path(swarn->path);
     493           0 :                 goto err;
     494             :         }
     495             : 
     496           0 :         eb = swarn->path->nodes[0];
     497           0 :         inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
     498             :                                         struct btrfs_inode_item);
     499             :         isize = btrfs_inode_size(eb, inode_item);
     500             :         nlink = btrfs_inode_nlink(eb, inode_item);
     501           0 :         btrfs_release_path(swarn->path);
     502             : 
     503           0 :         ipath = init_ipath(4096, local_root, swarn->path);
     504           0 :         if (IS_ERR(ipath)) {
     505           0 :                 ret = PTR_ERR(ipath);
     506             :                 ipath = NULL;
     507           0 :                 goto err;
     508             :         }
     509           0 :         ret = paths_from_inode(inum, ipath);
     510             : 
     511           0 :         if (ret < 0)
     512             :                 goto err;
     513             : 
     514             :         /*
      515             :          * we deliberately ignore the possibility that ipath might have
      516             :          * been too small to hold all of the paths here
     517             :          */
     518           0 :         for (i = 0; i < ipath->fspath->elem_cnt; ++i)
     519           0 :                 printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
     520             :                         "%s, sector %llu, root %llu, inode %llu, offset %llu, "
     521             :                         "length %llu, links %u (path: %s)\n", swarn->errstr,
     522             :                         swarn->logical, rcu_str_deref(swarn->dev->name),
     523             :                         (unsigned long long)swarn->sector, root, inum, offset,
     524             :                         min(isize - offset, (u64)PAGE_SIZE), nlink,
     525             :                         (char *)(unsigned long)ipath->fspath->val[i]);
     526             : 
     527           0 :         free_ipath(ipath);
     528           0 :         return 0;
     529             : 
     530             : err:
     531           0 :         printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
     532             :                 "%s, sector %llu, root %llu, inode %llu, offset %llu: path "
     533             :                 "resolving failed with ret=%d\n", swarn->errstr,
     534             :                 swarn->logical, rcu_str_deref(swarn->dev->name),
     535             :                 (unsigned long long)swarn->sector, root, inum, offset, ret);
     536             : 
     537           0 :         free_ipath(ipath);
     538           0 :         return 0;
     539             : }
     540             : 
     541           0 : static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
     542             : {
     543             :         struct btrfs_device *dev;
     544             :         struct btrfs_fs_info *fs_info;
     545             :         struct btrfs_path *path;
     546             :         struct btrfs_key found_key;
     547             :         struct extent_buffer *eb;
     548             :         struct btrfs_extent_item *ei;
     549             :         struct scrub_warning swarn;
     550           0 :         unsigned long ptr = 0;
     551             :         u64 extent_item_pos;
     552           0 :         u64 flags = 0;
     553             :         u64 ref_root;
     554             :         u32 item_size;
     555             :         u8 ref_level;
     556             :         const int bufsize = 4096;
     557             :         int ret;
     558             : 
     559           0 :         WARN_ON(sblock->page_count < 1);
     560           0 :         dev = sblock->pagev[0]->dev;
     561           0 :         fs_info = sblock->sctx->dev_root->fs_info;
     562             : 
     563           0 :         path = btrfs_alloc_path();
     564             : 
     565           0 :         swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
     566           0 :         swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
     567           0 :         swarn.sector = (sblock->pagev[0]->physical) >> 9;
     568           0 :         swarn.logical = sblock->pagev[0]->logical;
     569           0 :         swarn.errstr = errstr;
     570           0 :         swarn.dev = NULL;
     571           0 :         swarn.msg_bufsize = bufsize;
     572           0 :         swarn.scratch_bufsize = bufsize;
     573             : 
     574           0 :         if (!path || !swarn.scratch_buf || !swarn.msg_buf)
     575             :                 goto out;
     576             : 
     577           0 :         ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
     578             :                                   &flags);
     579           0 :         if (ret < 0)
     580             :                 goto out;
     581             : 
     582           0 :         extent_item_pos = swarn.logical - found_key.objectid;
     583           0 :         swarn.extent_item_size = found_key.offset;
     584             : 
     585           0 :         eb = path->nodes[0];
     586           0 :         ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
     587           0 :         item_size = btrfs_item_size_nr(eb, path->slots[0]);
     588             : 
     589           0 :         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
     590             :                 do {
     591           0 :                         ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
     592             :                                                       item_size, &ref_root,
     593             :                                                       &ref_level);
     594           0 :                         printk_in_rcu(KERN_WARNING
     595             :                                 "BTRFS: %s at logical %llu on dev %s, "
     596             :                                 "sector %llu: metadata %s (level %d) in tree "
     597             :                                 "%llu\n", errstr, swarn.logical,
     598             :                                 rcu_str_deref(dev->name),
     599             :                                 (unsigned long long)swarn.sector,
     600             :                                 ref_level ? "node" : "leaf",
     601             :                                 ret < 0 ? -1 : ref_level,
     602             :                                 ret < 0 ? -1 : ref_root);
     603           0 :                 } while (ret != 1);
     604           0 :                 btrfs_release_path(path);
     605             :         } else {
     606           0 :                 btrfs_release_path(path);
     607           0 :                 swarn.path = path;
     608           0 :                 swarn.dev = dev;
     609           0 :                 iterate_extent_inodes(fs_info, found_key.objectid,
     610             :                                         extent_item_pos, 1,
     611             :                                         scrub_print_warning_inode, &swarn);
     612             :         }
     613             : 
     614             : out:
     615           0 :         btrfs_free_path(path);
     616           0 :         kfree(swarn.scratch_buf);
     617           0 :         kfree(swarn.msg_buf);
     618           0 : }
     619             : 
     620           0 : static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
     621             : {
     622             :         struct page *page = NULL;
     623             :         unsigned long index;
     624             :         struct scrub_fixup_nodatasum *fixup = fixup_ctx;
     625             :         int ret;
     626             :         int corrected = 0;
     627             :         struct btrfs_key key;
     628             :         struct inode *inode = NULL;
     629             :         struct btrfs_fs_info *fs_info;
     630           0 :         u64 end = offset + PAGE_SIZE - 1;
     631             :         struct btrfs_root *local_root;
     632             :         int srcu_index;
     633             : 
     634           0 :         key.objectid = root;
     635           0 :         key.type = BTRFS_ROOT_ITEM_KEY;
     636           0 :         key.offset = (u64)-1;
     637             : 
     638           0 :         fs_info = fixup->root->fs_info;
     639           0 :         srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
     640             : 
     641             :         local_root = btrfs_read_fs_root_no_name(fs_info, &key);
     642           0 :         if (IS_ERR(local_root)) {
     643             :                 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
     644           0 :                 return PTR_ERR(local_root);
     645             :         }
     646             : 
     647           0 :         key.type = BTRFS_INODE_ITEM_KEY;
     648           0 :         key.objectid = inum;
     649           0 :         key.offset = 0;
     650           0 :         inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
     651             :         srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
     652           0 :         if (IS_ERR(inode))
     653           0 :                 return PTR_ERR(inode);
     654             : 
     655           0 :         index = offset >> PAGE_CACHE_SHIFT;
     656             : 
     657           0 :         page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
     658           0 :         if (!page) {
     659             :                 ret = -ENOMEM;
     660             :                 goto out;
     661             :         }
     662             : 
     663           0 :         if (PageUptodate(page)) {
     664           0 :                 if (PageDirty(page)) {
     665             :                         /*
     666             :                          * we need to write the data to the defect sector. the
     667             :                          * data that was in that sector is not in memory,
     668             :                          * because the page was modified. we must not write the
     669             :                          * modified page to that sector.
     670             :                          *
     671             :                          * TODO: what could be done here: wait for the delalloc
     672             :                          *       runner to write out that page (might involve
     673             :                          *       COW) and see whether the sector is still
     674             :                          *       referenced afterwards.
     675             :                          *
      676             :                          * For the time being, we'll treat this error as
      677             :                          * uncorrectable, although there is a chance that a
     678             :                          * later scrub will find the bad sector again and that
     679             :                          * there's no dirty page in memory, then.
     680             :                          */
     681             :                         ret = -EIO;
     682             :                         goto out;
     683             :                 }
     684           0 :                 fs_info = BTRFS_I(inode)->root->fs_info;
     685           0 :                 ret = repair_io_failure(fs_info, offset, PAGE_SIZE,
     686             :                                         fixup->logical, page,
     687             :                                         fixup->mirror_num);
     688           0 :                 unlock_page(page);
     689           0 :                 corrected = !ret;
     690             :         } else {
     691             :                 /*
     692             :                  * we need to get good data first. the general readpage path
     693             :                  * will call repair_io_failure for us, we just have to make
     694             :                  * sure we read the bad mirror.
     695             :                  */
     696           0 :                 ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
     697             :                                         EXTENT_DAMAGED, GFP_NOFS);
     698           0 :                 if (ret) {
     699             :                         /* set_extent_bits should give proper error */
     700           0 :                         WARN_ON(ret > 0);
     701           0 :                         if (ret > 0)
     702             :                                 ret = -EFAULT;
     703             :                         goto out;
     704             :                 }
     705             : 
     706           0 :                 ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
     707             :                                                 btrfs_get_extent,
     708             :                                                 fixup->mirror_num);
     709             :                 wait_on_page_locked(page);
     710             : 
     711           0 :                 corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
     712             :                                                 end, EXTENT_DAMAGED, 0, NULL);
     713           0 :                 if (!corrected)
     714           0 :                         clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
     715             :                                                 EXTENT_DAMAGED, GFP_NOFS);
     716             :         }
     717             : 
     718             : out:
     719           0 :         if (page)
     720           0 :                 put_page(page);
     721             : 
     722           0 :         iput(inode);
     723             : 
     724           0 :         if (ret < 0)
     725             :                 return ret;
     726             : 
     727           0 :         if (ret == 0 && corrected) {
     728             :                 /*
     729             :                  * we only need to call readpage for one of the inodes belonging
      730             :                  * to this extent, so make iterate_extent_inodes stop
     731             :                  */
     732             :                 return 1;
     733             :         }
     734             : 
     735           0 :         return -EIO;
     736             : }
     737             : 
     738           0 : static void scrub_fixup_nodatasum(struct btrfs_work *work)
     739             : {
     740             :         int ret;
     741             :         struct scrub_fixup_nodatasum *fixup;
     742             :         struct scrub_ctx *sctx;
     743             :         struct btrfs_trans_handle *trans = NULL;
     744             :         struct btrfs_path *path;
     745             :         int uncorrectable = 0;
     746             : 
     747           0 :         fixup = container_of(work, struct scrub_fixup_nodatasum, work);
     748           0 :         sctx = fixup->sctx;
     749             : 
     750           0 :         path = btrfs_alloc_path();
     751           0 :         if (!path) {
     752             :                 spin_lock(&sctx->stat_lock);
     753           0 :                 ++sctx->stat.malloc_errors;
     754             :                 spin_unlock(&sctx->stat_lock);
     755             :                 uncorrectable = 1;
     756           0 :                 goto out;
     757             :         }
     758             : 
     759           0 :         trans = btrfs_join_transaction(fixup->root);
     760           0 :         if (IS_ERR(trans)) {
     761             :                 uncorrectable = 1;
     762             :                 goto out;
     763             :         }
     764             : 
     765             :         /*
      766             :          * the idea is to trigger a regular read through the standard path. We
      767             :          * read a page from the (failed) logical address by specifying the
      768             :          * corresponding copy number (mirror) of the failed sector. Thus, that
      769             :          * readpage is expected to fail.
      770             :          * That is the point where on-the-fly error correction will kick in
      771             :          * (once the read finishes) and rewrite the failed sector if a good
      772             :          * copy can be found.
     773             :          */
     774           0 :         ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
     775             :                                                 path, scrub_fixup_readpage,
     776             :                                                 fixup);
     777           0 :         if (ret < 0) {
     778             :                 uncorrectable = 1;
     779             :                 goto out;
     780             :         }
     781           0 :         WARN_ON(ret != 1);
     782             : 
     783             :         spin_lock(&sctx->stat_lock);
     784           0 :         ++sctx->stat.corrected_errors;
     785             :         spin_unlock(&sctx->stat_lock);
     786             : 
     787             : out:
     788           0 :         if (trans && !IS_ERR(trans))
     789           0 :                 btrfs_end_transaction(trans, fixup->root);
     790           0 :         if (uncorrectable) {
     791             :                 spin_lock(&sctx->stat_lock);
     792           0 :                 ++sctx->stat.uncorrectable_errors;
     793             :                 spin_unlock(&sctx->stat_lock);
     794           0 :                 btrfs_dev_replace_stats_inc(
     795           0 :                         &sctx->dev_root->fs_info->dev_replace.
     796             :                         num_uncorrectable_read_errors);
     797           0 :                 printk_ratelimited_in_rcu(KERN_ERR "BTRFS: "
     798             :                     "unable to fixup (nodatasum) error at logical %llu on dev %s\n",
     799             :                         fixup->logical, rcu_str_deref(fixup->dev->name));
     800             :         }
     801             : 
     802           0 :         btrfs_free_path(path);
     803           0 :         kfree(fixup);
     804             : 
     805           0 :         scrub_pending_trans_workers_dec(sctx);
     806           0 : }
     807             : 
     808             : /*
     809             :  * scrub_handle_errored_block gets called when either verification of the
     810             :  * pages failed or the bio failed to read, e.g. with EIO. In the latter
     811             :  * case, this function handles all pages in the bio, even though only one
     812             :  * may be bad.
     813             :  * The goal of this function is to repair the errored block by using the
     814             :  * contents of one of the mirrors.
     815             :  */
     816           0 : static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
     817             : {
     818           0 :         struct scrub_ctx *sctx = sblock_to_check->sctx;
     819             :         struct btrfs_device *dev;
     820             :         struct btrfs_fs_info *fs_info;
     821             :         u64 length;
     822             :         u64 logical;
     823             :         u64 generation;
     824             :         unsigned int failed_mirror_index;
     825             :         unsigned int is_metadata;
     826             :         unsigned int have_csum;
     827             :         u8 *csum;
     828             :         struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
     829             :         struct scrub_block *sblock_bad;
     830             :         int ret;
     831             :         int mirror_index;
     832             :         int page_num;
     833             :         int success;
     834             :         static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
     835             :                                       DEFAULT_RATELIMIT_BURST);
     836             : 
     837           0 :         BUG_ON(sblock_to_check->page_count < 1);
     838           0 :         fs_info = sctx->dev_root->fs_info;
     839           0 :         if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
      840             :                  * if we find an error in a super block, we just report it;
      841             :                  * super blocks get rewritten with the next transaction
      842             :                  * commit anyway
     843             :                  * anyway
     844             :                  */
     845             :                 spin_lock(&sctx->stat_lock);
     846           0 :                 ++sctx->stat.super_errors;
     847             :                 spin_unlock(&sctx->stat_lock);
     848           0 :                 return 0;
     849             :         }
     850           0 :         length = sblock_to_check->page_count * PAGE_SIZE;
     851           0 :         logical = sblock_to_check->pagev[0]->logical;
     852           0 :         generation = sblock_to_check->pagev[0]->generation;
     853           0 :         BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
     854           0 :         failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
     855           0 :         is_metadata = !(sblock_to_check->pagev[0]->flags &
     856             :                         BTRFS_EXTENT_FLAG_DATA);
     857           0 :         have_csum = sblock_to_check->pagev[0]->have_csum;
     858           0 :         csum = sblock_to_check->pagev[0]->csum;
     859           0 :         dev = sblock_to_check->pagev[0]->dev;
     860             : 
     861           0 :         if (sctx->is_dev_replace && !is_metadata && !have_csum) {
     862             :                 sblocks_for_recheck = NULL;
     863             :                 goto nodatasum_case;
     864             :         }
     865             : 
     866             :         /*
      867             :          * read all mirrors one after the other. This includes re-reading
      868             :          * the extent or metadata block that failed (the failure being
      869             :          * the reason this fixup code was called), this time page by
      870             :          * page in order to know which pages caused I/O errors and
      871             :          * which ones are good (for all mirrors).
      872             :          * The goal is to handle the situation where more than one
     873             :          * mirror contains I/O errors, but the errors do not
     874             :          * overlap, i.e. the data can be repaired by selecting the
     875             :          * pages from those mirrors without I/O error on the
     876             :          * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
     877             :          * would be that mirror #1 has an I/O error on the first page,
     878             :          * the second page is good, and mirror #2 has an I/O error on
     879             :          * the second page, but the first page is good.
     880             :          * Then the first page of the first mirror can be repaired by
     881             :          * taking the first page of the second mirror, and the
     882             :          * second page of the second mirror can be repaired by
     883             :          * copying the contents of the 2nd page of the 1st mirror.
     884             :          * One more note: if the pages of one mirror contain I/O
     885             :          * errors, the checksum cannot be verified. In order to get
     886             :          * the best data for repairing, the first attempt is to find
     887             :          * a mirror without I/O errors and with a validated checksum.
      888             :          * Only if this is not possible are the pages picked from
     889             :          * mirrors with I/O errors without considering the checksum.
     890             :          * If the latter is the case, at the end, the checksum of the
     891             :          * repaired area is verified in order to correctly maintain
     892             :          * the statistics.
     893             :          */
     894             : 
     895           0 :         sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS *
     896             :                                      sizeof(*sblocks_for_recheck),
     897             :                                      GFP_NOFS);
     898           0 :         if (!sblocks_for_recheck) {
     899             :                 spin_lock(&sctx->stat_lock);
     900           0 :                 sctx->stat.malloc_errors++;
     901           0 :                 sctx->stat.read_errors++;
     902           0 :                 sctx->stat.uncorrectable_errors++;
     903             :                 spin_unlock(&sctx->stat_lock);
     904           0 :                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
     905           0 :                 goto out;
     906             :         }
     907             : 
     908             :         /* setup the context, map the logical blocks and alloc the pages */
     909           0 :         ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length,
     910             :                                         logical, sblocks_for_recheck);
     911           0 :         if (ret) {
     912             :                 spin_lock(&sctx->stat_lock);
     913           0 :                 sctx->stat.read_errors++;
     914           0 :                 sctx->stat.uncorrectable_errors++;
     915             :                 spin_unlock(&sctx->stat_lock);
     916           0 :                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
     917           0 :                 goto out;
     918             :         }
     919           0 :         BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
     920           0 :         sblock_bad = sblocks_for_recheck + failed_mirror_index;
     921             : 
     922             :         /* build and submit the bios for the failed mirror, check checksums */
     923           0 :         scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
     924           0 :                             csum, generation, sctx->csum_size);
     925             : 
     926           0 :         if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
     927             :             sblock_bad->no_io_error_seen) {
     928             :                 /*
     929             :                  * the error disappeared after reading page by page, or
     930             :                  * the area was part of a huge bio and other parts of the
     931             :                  * bio caused I/O errors, or the block layer merged several
     932             :                  * read requests into one and the error is caused by a
      933             :                  * different bio (usually one of the latter two cases is
      934             :                  * the cause)
     935             :                  */
     936             :                 spin_lock(&sctx->stat_lock);
     937           0 :                 sctx->stat.unverified_errors++;
     938             :                 spin_unlock(&sctx->stat_lock);
     939             : 
     940           0 :                 if (sctx->is_dev_replace)
     941           0 :                         scrub_write_block_to_dev_replace(sblock_bad);
     942             :                 goto out;
     943             :         }
     944             : 
     945           0 :         if (!sblock_bad->no_io_error_seen) {
     946             :                 spin_lock(&sctx->stat_lock);
     947           0 :                 sctx->stat.read_errors++;
     948             :                 spin_unlock(&sctx->stat_lock);
     949           0 :                 if (__ratelimit(&_rs))
     950           0 :                         scrub_print_warning("i/o error", sblock_to_check);
     951           0 :                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
     952           0 :         } else if (sblock_bad->checksum_error) {
     953             :                 spin_lock(&sctx->stat_lock);
     954           0 :                 sctx->stat.csum_errors++;
     955             :                 spin_unlock(&sctx->stat_lock);
     956           0 :                 if (__ratelimit(&_rs))
     957           0 :                         scrub_print_warning("checksum error", sblock_to_check);
     958           0 :                 btrfs_dev_stat_inc_and_print(dev,
     959             :                                              BTRFS_DEV_STAT_CORRUPTION_ERRS);
     960           0 :         } else if (sblock_bad->header_error) {
     961             :                 spin_lock(&sctx->stat_lock);
     962           0 :                 sctx->stat.verify_errors++;
     963             :                 spin_unlock(&sctx->stat_lock);
     964           0 :                 if (__ratelimit(&_rs))
     965           0 :                         scrub_print_warning("checksum/header error",
     966             :                                             sblock_to_check);
     967           0 :                 if (sblock_bad->generation_error)
     968           0 :                         btrfs_dev_stat_inc_and_print(dev,
     969             :                                 BTRFS_DEV_STAT_GENERATION_ERRS);
     970             :                 else
     971           0 :                         btrfs_dev_stat_inc_and_print(dev,
     972             :                                 BTRFS_DEV_STAT_CORRUPTION_ERRS);
     973             :         }
     974             : 
     975           0 :         if (sctx->readonly) {
     976             :                 ASSERT(!sctx->is_dev_replace);
     977             :                 goto out;
     978             :         }
     979             : 
     980           0 :         if (!is_metadata && !have_csum) {
     981             :                 struct scrub_fixup_nodatasum *fixup_nodatasum;
     982             : 
     983             : nodatasum_case:
     984           0 :                 WARN_ON(sctx->is_dev_replace);
     985             : 
     986             :                 /*
      987             :                  * !is_metadata and !have_csum: this means that the data
      988             :                  * might not be COW'ed and might be modified
      989             :                  * concurrently. The general strategy of working on the
      990             :                  * commit root does not help when COW is not
      991             :                  * used.
     992             :                  */
     993           0 :                 fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
     994           0 :                 if (!fixup_nodatasum)
     995             :                         goto did_not_correct_error;
     996           0 :                 fixup_nodatasum->sctx = sctx;
     997           0 :                 fixup_nodatasum->dev = dev;
     998           0 :                 fixup_nodatasum->logical = logical;
     999           0 :                 fixup_nodatasum->root = fs_info->extent_root;
    1000           0 :                 fixup_nodatasum->mirror_num = failed_mirror_index + 1;
    1001           0 :                 scrub_pending_trans_workers_inc(sctx);
    1002           0 :                 btrfs_init_work(&fixup_nodatasum->work, btrfs_scrub_helper,
    1003             :                                 scrub_fixup_nodatasum, NULL, NULL);
    1004           0 :                 btrfs_queue_work(fs_info->scrub_workers,
    1005             :                                  &fixup_nodatasum->work);
    1006           0 :                 goto out;
    1007             :         }
    1008             : 
    1009             :         /*
    1010             :          * now build and submit the bios for the other mirrors, check
    1011             :          * checksums.
    1012             :          * First try to pick the mirror which is completely without I/O
    1013             :          * errors and also does not have a checksum error.
    1014             :          * If one is found, and if a checksum is present, the full block
    1015             :          * that is known to contain an error is rewritten. Afterwards
    1016             :          * the block is known to be corrected.
    1017             :          * If a mirror is found which is completely correct, and no
    1018             :          * checksum is present, only those pages are rewritten that had
    1019             :          * an I/O error in the block to be repaired, since it cannot be
     1020             :          * determined which copy of the other pages is better
     1021             :          * (otherwise a correct page could be overwritten by a
     1022             :          * bad one).
    1023             :          */
    1024           0 :         for (mirror_index = 0;
    1025           0 :              mirror_index < BTRFS_MAX_MIRRORS &&
    1026           0 :              sblocks_for_recheck[mirror_index].page_count > 0;
    1027           0 :              mirror_index++) {
    1028             :                 struct scrub_block *sblock_other;
    1029             : 
    1030           0 :                 if (mirror_index == failed_mirror_index)
    1031           0 :                         continue;
    1032             :                 sblock_other = sblocks_for_recheck + mirror_index;
    1033             : 
    1034             :                 /* build and submit the bios, check checksums */
    1035           0 :                 scrub_recheck_block(fs_info, sblock_other, is_metadata,
    1036             :                                     have_csum, csum, generation,
    1037           0 :                                     sctx->csum_size);
    1038             : 
    1039           0 :                 if (!sblock_other->header_error &&
    1040           0 :                     !sblock_other->checksum_error &&
    1041             :                     sblock_other->no_io_error_seen) {
    1042           0 :                         if (sctx->is_dev_replace) {
    1043           0 :                                 scrub_write_block_to_dev_replace(sblock_other);
    1044             :                         } else {
    1045           0 :                                 int force_write = is_metadata || have_csum;
    1046             : 
    1047           0 :                                 ret = scrub_repair_block_from_good_copy(
    1048             :                                                 sblock_bad, sblock_other,
    1049             :                                                 force_write);
    1050             :                         }
    1051           0 :                         if (0 == ret)
    1052             :                                 goto corrected_error;
    1053             :                 }
    1054             :         }
    1055             : 
    1056             :         /*
    1057             :          * for dev_replace, pick good pages and write to the target device.
    1058             :          */
    1059           0 :         if (sctx->is_dev_replace) {
    1060             :                 success = 1;
    1061           0 :                 for (page_num = 0; page_num < sblock_bad->page_count;
    1062           0 :                      page_num++) {
    1063             :                         int sub_success;
    1064             : 
    1065             :                         sub_success = 0;
    1066           0 :                         for (mirror_index = 0;
    1067           0 :                              mirror_index < BTRFS_MAX_MIRRORS &&
    1068           0 :                              sblocks_for_recheck[mirror_index].page_count > 0;
    1069           0 :                              mirror_index++) {
    1070             :                                 struct scrub_block *sblock_other =
    1071             :                                         sblocks_for_recheck + mirror_index;
    1072           0 :                                 struct scrub_page *page_other =
    1073             :                                         sblock_other->pagev[page_num];
    1074             : 
    1075           0 :                                 if (!page_other->io_error) {
    1076           0 :                                         ret = scrub_write_page_to_dev_replace(
    1077             :                                                         sblock_other, page_num);
    1078           0 :                                         if (ret == 0) {
    1079             :                                                 /* succeeded for this page */
    1080             :                                                 sub_success = 1;
    1081             :                                                 break;
    1082             :                                         } else {
    1083           0 :                                                 btrfs_dev_replace_stats_inc(
    1084           0 :                                                         &sctx->dev_root->
    1085             :                                                         fs_info->dev_replace.
    1086             :                                                         num_write_errors);
    1087             :                                         }
    1088             :                                 }
    1089             :                         }
    1090             : 
    1091           0 :                         if (!sub_success) {
    1092             :                                 /*
    1093             :                                  * did not find a mirror to fetch the page
    1094             :                                  * from. scrub_write_page_to_dev_replace()
     1095             :                  * handles this case (page->io_error) by
    1096             :                                  * filling the block with zeros before
    1097             :                                  * submitting the write request
    1098             :                                  */
    1099             :                                 success = 0;
    1100           0 :                                 ret = scrub_write_page_to_dev_replace(
    1101             :                                                 sblock_bad, page_num);
    1102           0 :                                 if (ret)
    1103           0 :                                         btrfs_dev_replace_stats_inc(
    1104           0 :                                                 &sctx->dev_root->fs_info->
    1105             :                                                 dev_replace.num_write_errors);
    1106             :                         }
    1107             :                 }
    1108             : 
    1109             :                 goto out;
    1110             :         }
    1111             : 
    1112             :         /*
    1113             :          * for regular scrub, repair those pages that are errored.
    1114             :          * In case of I/O errors in the area that is supposed to be
    1115             :          * repaired, continue by picking good copies of those pages.
    1116             :          * Select the good pages from mirrors to rewrite bad pages from
    1117             :          * the area to fix. Afterwards verify the checksum of the block
    1118             :          * that is supposed to be repaired. This verification step is
     1119             :          * only done for the sake of the statistics and for the
     1120             :          * final scrub report on whether errors remain.
    1121             :          * A perfect algorithm could make use of the checksum and try
    1122             :          * all possible combinations of pages from the different mirrors
    1123             :          * until the checksum verification succeeds. For example, when
    1124             :          * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
    1125             :          * of mirror #2 is readable but the final checksum test fails,
     1126             :          * then the 2nd page of mirror #3 could be tried to see whether
     1127             :          * the final checksum then succeeds. But this would be a rare
     1128             :          * exception and is therefore not implemented. At least this
     1129             :          * avoids overwriting the good copy.
    1130             :          * A more useful improvement would be to pick the sectors
    1131             :          * without I/O error based on sector sizes (512 bytes on legacy
     1132             :          * disks) instead of on PAGE_SIZE. Then maybe 512 bytes of one
     1133             :          * mirror could be repaired by taking 512 bytes of a different
     1134             :          * mirror, even if other 512-byte sectors in the same PAGE_SIZE
     1135             :          * area are unreadable.
    1136             :          */
    1137             : 
    1138             :         /* can only fix I/O errors from here on */
    1139           0 :         if (sblock_bad->no_io_error_seen)
    1140             :                 goto did_not_correct_error;
    1141             : 
    1142             :         success = 1;
    1143           0 :         for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
    1144           0 :                 struct scrub_page *page_bad = sblock_bad->pagev[page_num];
    1145             : 
    1146           0 :                 if (!page_bad->io_error)
    1147           0 :                         continue;
    1148             : 
    1149           0 :                 for (mirror_index = 0;
    1150           0 :                      mirror_index < BTRFS_MAX_MIRRORS &&
    1151           0 :                      sblocks_for_recheck[mirror_index].page_count > 0;
    1152           0 :                      mirror_index++) {
    1153             :                         struct scrub_block *sblock_other = sblocks_for_recheck +
    1154             :                                                            mirror_index;
    1155           0 :                         struct scrub_page *page_other = sblock_other->pagev[
    1156             :                                                         page_num];
    1157             : 
    1158           0 :                         if (!page_other->io_error) {
    1159           0 :                                 ret = scrub_repair_page_from_good_copy(
    1160             :                                         sblock_bad, sblock_other, page_num, 0);
    1161           0 :                                 if (0 == ret) {
    1162           0 :                                         page_bad->io_error = 0;
    1163           0 :                                         break; /* succeeded for this page */
    1164             :                                 }
    1165             :                         }
    1166             :                 }
    1167             : 
    1168           0 :                 if (page_bad->io_error) {
    1169             :                         /* did not find a mirror to copy the page from */
    1170             :                         success = 0;
    1171             :                 }
    1172             :         }
    1173             : 
    1174           0 :         if (success) {
    1175           0 :                 if (is_metadata || have_csum) {
    1176             :                         /*
    1177             :                          * need to verify the checksum now that all
    1178             :                          * sectors on disk are repaired (the write
    1179             :                          * request for data to be repaired is on its way).
    1180             :                          * Just be lazy and use scrub_recheck_block()
    1181             :                          * which re-reads the data before the checksum
    1182             :                          * is verified, but most likely the data comes out
    1183             :                          * of the page cache.
    1184             :                          */
    1185           0 :                         scrub_recheck_block(fs_info, sblock_bad,
    1186             :                                             is_metadata, have_csum, csum,
    1187           0 :                                             generation, sctx->csum_size);
    1188           0 :                         if (!sblock_bad->header_error &&
    1189           0 :                             !sblock_bad->checksum_error &&
    1190             :                             sblock_bad->no_io_error_seen)
    1191             :                                 goto corrected_error;
    1192             :                         else
    1193             :                                 goto did_not_correct_error;
    1194             :                 } else {
    1195             : corrected_error:
    1196             :                         spin_lock(&sctx->stat_lock);
    1197           0 :                         sctx->stat.corrected_errors++;
    1198             :                         spin_unlock(&sctx->stat_lock);
    1199           0 :                         printk_ratelimited_in_rcu(KERN_ERR
    1200             :                                 "BTRFS: fixed up error at logical %llu on dev %s\n",
    1201             :                                 logical, rcu_str_deref(dev->name));
    1202             :                 }
    1203             :         } else {
    1204             : did_not_correct_error:
    1205             :                 spin_lock(&sctx->stat_lock);
    1206           0 :                 sctx->stat.uncorrectable_errors++;
    1207             :                 spin_unlock(&sctx->stat_lock);
    1208           0 :                 printk_ratelimited_in_rcu(KERN_ERR
    1209             :                         "BTRFS: unable to fixup (regular) error at logical %llu on dev %s\n",
    1210             :                         logical, rcu_str_deref(dev->name));
    1211             :         }
    1212             : 
    1213             : out:
    1214           0 :         if (sblocks_for_recheck) {
    1215           0 :                 for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
    1216           0 :                      mirror_index++) {
    1217           0 :                         struct scrub_block *sblock = sblocks_for_recheck +
    1218             :                                                      mirror_index;
    1219             :                         int page_index;
    1220             : 
    1221           0 :                         for (page_index = 0; page_index < sblock->page_count;
    1222           0 :                              page_index++) {
    1223           0 :                                 sblock->pagev[page_index]->sblock = NULL;
    1224           0 :                                 scrub_page_put(sblock->pagev[page_index]);
    1225             :                         }
    1226             :                 }
    1227           0 :                 kfree(sblocks_for_recheck);
    1228             :         }
    1229             : 
    1230             :         return 0;
    1231             : }
    1232             : 
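
Editor's aside: the repair policy implemented by scrub_handle_errored_block() boils down to two passes, which the following stand-alone C sketch illustrates. Everything here (struct mirror, NPAGES, repair_block) is a hypothetical simplification for illustration, not kernel code: first try to copy the whole block from one mirror that has neither I/O nor checksum errors; only if no such mirror exists, assemble the block page by page from whichever mirror managed to read each page.

#include <stdbool.h>
#include <string.h>

#define NPAGES  8              /* pages per block (hypothetical) */
#define PAGE_SZ 4096

struct mirror {
        bool io_error[NPAGES];  /* per-page read failure */
        bool csum_ok;           /* block checksum verified */
        unsigned char data[NPAGES][PAGE_SZ];
};

/* repair 'bad' in place; returns true if every page could be recovered */
static bool repair_block(struct mirror *bad, struct mirror mirrors[],
                         int nmirrors)
{
        /* first choice: a mirror that is clean and checksums correctly */
        for (int m = 0; m < nmirrors; m++) {
                struct mirror *other = &mirrors[m];
                bool clean = other->csum_ok;

                for (int p = 0; clean && p < NPAGES; p++)
                        clean = !other->io_error[p];
                if (clean) {
                        memcpy(bad->data, other->data, sizeof(bad->data));
                        memset(bad->io_error, 0, sizeof(bad->io_error));
                        return true;
                }
        }

        /* fallback: take each unreadable page from any mirror that has it */
        bool success = true;

        for (int p = 0; p < NPAGES; p++) {
                if (!bad->io_error[p])
                        continue;
                for (int m = 0; m < nmirrors; m++) {
                        if (!mirrors[m].io_error[p]) {
                                memcpy(bad->data[p], mirrors[m].data[p],
                                       PAGE_SZ);
                                bad->io_error[p] = false;
                                break;
                        }
                }
                if (bad->io_error[p])
                        success = false;        /* no mirror had this page */
        }
        return success;
}

The first pass corresponds to the mirror_index loop above that calls scrub_repair_block_from_good_copy(); the fallback corresponds to the per-page loop further down.
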
    1233           0 : static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
    1234             :                                      struct btrfs_fs_info *fs_info,
    1235             :                                      struct scrub_block *original_sblock,
    1236             :                                      u64 length, u64 logical,
    1237             :                                      struct scrub_block *sblocks_for_recheck)
    1238             : {
    1239             :         int page_index;
    1240             :         int mirror_index;
    1241             :         int ret;
    1242             : 
    1243             :         /*
    1244             :          * note: the two members ref_count and outstanding_pages
    1245             :          * are not used (and not set) in the blocks that are used for
    1246             :          * the recheck procedure
    1247             :          */
    1248             : 
    1249             :         page_index = 0;
    1250           0 :         while (length > 0) {
    1251           0 :                 u64 sublen = min_t(u64, length, PAGE_SIZE);
    1252           0 :                 u64 mapped_length = sublen;
    1253           0 :                 struct btrfs_bio *bbio = NULL;
    1254             : 
    1255             :                 /*
    1256             :                  * with a length of PAGE_SIZE, each returned stripe
    1257             :                  * represents one mirror
    1258             :                  */
    1259           0 :                 ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical,
    1260             :                                       &mapped_length, &bbio, 0);
    1261           0 :                 if (ret || !bbio || mapped_length < sublen) {
    1262           0 :                         kfree(bbio);
    1263           0 :                         return -EIO;
    1264             :                 }
    1265             : 
    1266           0 :                 BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
    1267           0 :                 for (mirror_index = 0; mirror_index < (int)bbio->num_stripes;
    1268           0 :                      mirror_index++) {
    1269             :                         struct scrub_block *sblock;
    1270             :                         struct scrub_page *page;
    1271             : 
    1272           0 :                         if (mirror_index >= BTRFS_MAX_MIRRORS)
    1273           0 :                                 continue;
    1274             : 
    1275           0 :                         sblock = sblocks_for_recheck + mirror_index;
    1276           0 :                         sblock->sctx = sctx;
    1277           0 :                         page = kzalloc(sizeof(*page), GFP_NOFS);
    1278           0 :                         if (!page) {
    1279             : leave_nomem:
    1280             :                                 spin_lock(&sctx->stat_lock);
    1281           0 :                                 sctx->stat.malloc_errors++;
    1282             :                                 spin_unlock(&sctx->stat_lock);
    1283           0 :                                 kfree(bbio);
    1284           0 :                                 return -ENOMEM;
    1285             :                         }
    1286             :                         scrub_page_get(page);
    1287           0 :                         sblock->pagev[page_index] = page;
    1288           0 :                         page->logical = logical;
    1289           0 :                         page->physical = bbio->stripes[mirror_index].physical;
    1290           0 :                         BUG_ON(page_index >= original_sblock->page_count);
    1291           0 :                         page->physical_for_dev_replace =
    1292           0 :                                 original_sblock->pagev[page_index]->
    1293             :                                 physical_for_dev_replace;
    1294             :                         /* for missing devices, dev->bdev is NULL */
    1295           0 :                         page->dev = bbio->stripes[mirror_index].dev;
    1296           0 :                         page->mirror_num = mirror_index + 1;
    1297           0 :                         sblock->page_count++;
    1298           0 :                         page->page = alloc_page(GFP_NOFS);
    1299           0 :                         if (!page->page)
    1300             :                                 goto leave_nomem;
    1301             :                 }
    1302           0 :                 kfree(bbio);
    1303           0 :                 length -= sublen;
    1304           0 :                 logical += sublen;
    1305           0 :                 page_index++;
    1306             :         }
    1307             : 
    1308             :         return 0;
    1309             : }
    1310             : 
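
The mapping loop above effectively transposes the result of btrfs_map_block(): for each PAGE_SIZE slice of the logical range, one physical stripe per mirror comes back, and that slice becomes the next page of every per-mirror recheck block. Below is a self-contained sketch of that transposition; map_block() is a stub standing in for btrfs_map_block(), and all names are hypothetical.

#define PAGE_SZ     4096
#define MAX_MIRRORS 3
#define MAX_PAGES   32          /* cf. SCRUB_PAGES_PER_RD_BIO */

struct stripe { unsigned long long physical; int dev; };

struct recheck_page { unsigned long long logical, physical; int dev; };
struct recheck_block {
        int page_count;
        struct recheck_page pagev[MAX_PAGES];
};

/* stub: pretend a two-mirror RAID1 layout with fixed device offsets */
static int map_block(unsigned long long logical, struct stripe out[], int max)
{
        if (max < 2)
                return -1;
        out[0] = (struct stripe){ .physical = logical,              .dev = 0 };
        out[1] = (struct stripe){ .physical = logical + (1u << 30), .dev = 1 };
        return 2;
}

/* blocks[] must be zero-initialized by the caller */
static int setup_recheck(unsigned long long logical, unsigned long long len,
                         struct recheck_block blocks[MAX_MIRRORS])
{
        int page_index = 0;

        while (len > 0) {
                unsigned long long sublen = len < PAGE_SZ ? len : PAGE_SZ;
                struct stripe stripes[MAX_MIRRORS];
                int n = map_block(logical, stripes, MAX_MIRRORS);

                if (n <= 0 || page_index >= MAX_PAGES)
                        return -1;
                /* stripe m of this slice becomes page page_index of mirror m */
                for (int m = 0; m < n; m++) {
                        struct recheck_page *p = &blocks[m].pagev[page_index];

                        p->logical  = logical;
                        p->physical = stripes[m].physical;
                        p->dev      = stripes[m].dev;
                        blocks[m].page_count++;
                }
                len     -= sublen;
                logical += sublen;
                page_index++;
        }
        return 0;
}
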
    1311             : /*
     1312             :  * this function checks the on-disk data for checksum errors, header
     1313             :  * errors and read I/O errors. If any I/O errors happen, the exact pages
     1314             :  * that are errored are marked as bad. The goal is to enable scrub
     1315             :  * to take the non-errored pages from all the mirrors so that
     1316             :  * the errored pages in the just-handled mirror can be repaired.
    1317             :  */
    1318           0 : static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
    1319             :                                 struct scrub_block *sblock, int is_metadata,
    1320             :                                 int have_csum, u8 *csum, u64 generation,
    1321             :                                 u16 csum_size)
    1322             : {
    1323             :         int page_num;
    1324             : 
    1325           0 :         sblock->no_io_error_seen = 1;
    1326           0 :         sblock->header_error = 0;
    1327           0 :         sblock->checksum_error = 0;
    1328             : 
    1329           0 :         for (page_num = 0; page_num < sblock->page_count; page_num++) {
    1330             :                 struct bio *bio;
    1331           0 :                 struct scrub_page *page = sblock->pagev[page_num];
    1332             : 
    1333           0 :                 if (page->dev->bdev == NULL) {
    1334           0 :                         page->io_error = 1;
    1335           0 :                         sblock->no_io_error_seen = 0;
    1336           0 :                         continue;
    1337             :                 }
    1338             : 
    1339           0 :                 WARN_ON(!page->page);
    1340           0 :                 bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
    1341           0 :                 if (!bio) {
    1342           0 :                         page->io_error = 1;
    1343           0 :                         sblock->no_io_error_seen = 0;
    1344           0 :                         continue;
    1345             :                 }
    1346           0 :                 bio->bi_bdev = page->dev->bdev;
    1347           0 :                 bio->bi_iter.bi_sector = page->physical >> 9;
    1348             : 
    1349           0 :                 bio_add_page(bio, page->page, PAGE_SIZE, 0);
    1350           0 :                 if (btrfsic_submit_bio_wait(READ, bio))
    1351           0 :                         sblock->no_io_error_seen = 0;
    1352             : 
    1353           0 :                 bio_put(bio);
    1354             :         }
    1355             : 
    1356           0 :         if (sblock->no_io_error_seen)
    1357           0 :                 scrub_recheck_block_checksum(fs_info, sblock, is_metadata,
    1358             :                                              have_csum, csum, generation,
    1359             :                                              csum_size);
    1360             : 
    1361           0 :         return;
    1362             : }
    1363             : 
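
The reason scrub_recheck_block() issues one single-page bio per page, rather than one large bio, is error attribution: a failed read then marks exactly one page. In user-space terms the same idea looks like this (hypothetical types; pread() against a raw device stands in for the bio):

#include <stdbool.h>
#include <sys/types.h>
#include <unistd.h>

#define PAGE_SZ 4096

struct rpage {
        int fd;                         /* device holding this copy */
        off_t physical;                 /* byte offset on that device */
        unsigned char buf[PAGE_SZ];
        bool io_error;
};

/* read every page of a block individually; return true if all succeeded */
static bool recheck_block(struct rpage pages[], int page_count)
{
        bool no_io_error = true;

        for (int i = 0; i < page_count; i++) {
                struct rpage *p = &pages[i];
                ssize_t n = pread(p->fd, p->buf, PAGE_SZ, p->physical);

                p->io_error = (n != PAGE_SZ);
                if (p->io_error)
                        no_io_error = false;
                /* keep going: we want the error status of *every* page */
        }
        return no_io_error;
}
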
    1364           0 : static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
    1365             :                                          struct scrub_block *sblock,
    1366             :                                          int is_metadata, int have_csum,
    1367             :                                          const u8 *csum, u64 generation,
    1368             :                                          u16 csum_size)
    1369             : {
    1370             :         int page_num;
    1371             :         u8 calculated_csum[BTRFS_CSUM_SIZE];
    1372             :         u32 crc = ~(u32)0;
    1373             :         void *mapped_buffer;
    1374             : 
    1375           0 :         WARN_ON(!sblock->pagev[0]->page);
    1376           0 :         if (is_metadata) {
    1377             :                 struct btrfs_header *h;
    1378             : 
    1379           0 :                 mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
    1380             :                 h = (struct btrfs_header *)mapped_buffer;
    1381             : 
    1382           0 :                 if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h) ||
    1383           0 :                     memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||
    1384           0 :                     memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
    1385             :                            BTRFS_UUID_SIZE)) {
    1386           0 :                         sblock->header_error = 1;
    1387           0 :                 } else if (generation != btrfs_stack_header_generation(h)) {
    1388           0 :                         sblock->header_error = 1;
    1389           0 :                         sblock->generation_error = 1;
    1390             :                 }
    1391           0 :                 csum = h->csum;
    1392             :         } else {
    1393           0 :                 if (!have_csum)
    1394           0 :                         return;
    1395             : 
    1396           0 :                 mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
    1397             :         }
    1398             : 
    1399             :         for (page_num = 0;;) {
    1400           0 :                 if (page_num == 0 && is_metadata)
    1401           0 :                         crc = btrfs_csum_data(
    1402             :                                 ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE,
    1403             :                                 crc, PAGE_SIZE - BTRFS_CSUM_SIZE);
    1404             :                 else
    1405           0 :                         crc = btrfs_csum_data(mapped_buffer, crc, PAGE_SIZE);
    1406             : 
    1407             :                 kunmap_atomic(mapped_buffer);
    1408           0 :                 page_num++;
    1409           0 :                 if (page_num >= sblock->page_count)
    1410             :                         break;
    1411           0 :                 WARN_ON(!sblock->pagev[page_num]->page);
    1412             : 
    1413           0 :                 mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page);
    1414           0 :         }
    1415             : 
    1416           0 :         btrfs_csum_final(crc, calculated_csum);
    1417           0 :         if (memcmp(calculated_csum, csum, csum_size))
    1418           0 :                 sblock->checksum_error = 1;
    1419             : }
    1420             : 
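
For concreteness, the checksum scheme used above is CRC32C seeded with ~0 and bit-inverted, stored little-endian (cf. the `crc = ~(u32)0` initializations and btrfs_csum_final()); for metadata, the hashed range skips the checksum field embedded at the start of the header, since the checksum cannot cover itself. The following user-space sketch of the verification uses an illustrative bitwise CRC32C rather than the kernel's accelerated implementation, and the struct-free layout is a simplification.

#include <stdint.h>
#include <stddef.h>
#include <string.h>

#define CSUM_SIZE 32    /* cf. BTRFS_CSUM_SIZE; crc32c uses only 4 bytes */

/* bitwise CRC32C (Castagnoli), reflected polynomial 0x82F63B78 */
static uint32_t crc32c(uint32_t crc, const void *buf, size_t len)
{
        const uint8_t *p = buf;

        while (len--) {
                crc ^= *p++;
                for (int i = 0; i < 8; i++)
                        crc = (crc >> 1) ^ (0x82F63B78U & -(crc & 1));
        }
        return crc;
}

/* ~crc, stored little-endian, as btrfs_csum_final() does */
static void csum_final(uint32_t crc, uint8_t out[4])
{
        crc = ~crc;
        out[0] = crc;
        out[1] = crc >> 8;
        out[2] = crc >> 16;
        out[3] = crc >> 24;
}

/*
 * Verify one block. For metadata the expected checksum is embedded at
 * offset 0 and excluded from the hashed range; for data it is supplied
 * externally (from the checksum tree).
 */
static int block_csum_ok(const uint8_t *block, size_t len,
                         int is_metadata, const uint8_t *expected)
{
        uint8_t result[4];
        uint32_t crc = ~(uint32_t)0;

        if (is_metadata) {
                expected = block;       /* csum lives in the header */
                crc = crc32c(crc, block + CSUM_SIZE, len - CSUM_SIZE);
        } else {
                crc = crc32c(crc, block, len);
        }
        csum_final(crc, result);
        return memcmp(result, expected, sizeof(result)) == 0;
}

The same hashing loop reappears in scrub_checksum_data(), scrub_checksum_tree_block() and scrub_checksum_super() further below; only the hashed range and the source of the expected checksum differ.
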
    1421           0 : static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
    1422             :                                              struct scrub_block *sblock_good,
    1423             :                                              int force_write)
    1424             : {
    1425             :         int page_num;
    1426             :         int ret = 0;
    1427             : 
    1428           0 :         for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
    1429             :                 int ret_sub;
    1430             : 
    1431           0 :                 ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
    1432             :                                                            sblock_good,
    1433             :                                                            page_num,
    1434             :                                                            force_write);
    1435           0 :                 if (ret_sub)
    1436             :                         ret = ret_sub;
    1437             :         }
    1438             : 
    1439           0 :         return ret;
    1440             : }
    1441             : 
    1442           0 : static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
    1443             :                                             struct scrub_block *sblock_good,
    1444             :                                             int page_num, int force_write)
    1445             : {
    1446           0 :         struct scrub_page *page_bad = sblock_bad->pagev[page_num];
    1447           0 :         struct scrub_page *page_good = sblock_good->pagev[page_num];
    1448             : 
    1449           0 :         BUG_ON(page_bad->page == NULL);
    1450           0 :         BUG_ON(page_good->page == NULL);
    1451           0 :         if (force_write || sblock_bad->header_error ||
    1452           0 :             sblock_bad->checksum_error || page_bad->io_error) {
    1453             :                 struct bio *bio;
    1454             :                 int ret;
    1455             : 
    1456           0 :                 if (!page_bad->dev->bdev) {
    1457           0 :                         printk_ratelimited(KERN_WARNING "BTRFS: "
    1458             :                                 "scrub_repair_page_from_good_copy(bdev == NULL) "
    1459             :                                 "is unexpected!\n");
    1460             :                         return -EIO;
    1461             :                 }
    1462             : 
    1463           0 :                 bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
    1464           0 :                 if (!bio)
    1465             :                         return -EIO;
    1466           0 :                 bio->bi_bdev = page_bad->dev->bdev;
    1467           0 :                 bio->bi_iter.bi_sector = page_bad->physical >> 9;
    1468             : 
    1469           0 :                 ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
    1470           0 :                 if (PAGE_SIZE != ret) {
    1471           0 :                         bio_put(bio);
    1472           0 :                         return -EIO;
    1473             :                 }
    1474             : 
    1475           0 :                 if (btrfsic_submit_bio_wait(WRITE, bio)) {
    1476           0 :                         btrfs_dev_stat_inc_and_print(page_bad->dev,
    1477             :                                 BTRFS_DEV_STAT_WRITE_ERRS);
    1478           0 :                         btrfs_dev_replace_stats_inc(
    1479           0 :                                 &sblock_bad->sctx->dev_root->fs_info->
    1480             :                                 dev_replace.num_write_errors);
    1481           0 :                         bio_put(bio);
    1482           0 :                         return -EIO;
    1483             :                 }
    1484           0 :                 bio_put(bio);
    1485             :         }
    1486             : 
    1487             :         return 0;
    1488             : }
    1489             : 
    1490     1433167 : static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
    1491             : {
    1492             :         int page_num;
    1493             : 
    1494     2872847 :         for (page_num = 0; page_num < sblock->page_count; page_num++) {
    1495             :                 int ret;
    1496             : 
    1497     1439680 :                 ret = scrub_write_page_to_dev_replace(sblock, page_num);
    1498     1439680 :                 if (ret)
    1499           0 :                         btrfs_dev_replace_stats_inc(
    1500           0 :                                 &sblock->sctx->dev_root->fs_info->dev_replace.
    1501             :                                 num_write_errors);
    1502             :         }
    1503     1433167 : }
    1504             : 
    1505     1439680 : static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
    1506             :                                            int page_num)
    1507             : {
    1508     1439680 :         struct scrub_page *spage = sblock->pagev[page_num];
    1509             : 
    1510     1439680 :         BUG_ON(spage->page == NULL);
    1511     1439680 :         if (spage->io_error) {
    1512             :                 void *mapped_buffer = kmap_atomic(spage->page);
    1513             : 
    1514           0 :                 memset(mapped_buffer, 0, PAGE_CACHE_SIZE);
    1515             :                 flush_dcache_page(spage->page);
    1516             :                 kunmap_atomic(mapped_buffer);
    1517             :         }
    1518     1439680 :         return scrub_add_page_to_wr_bio(sblock->sctx, spage);
    1519             : }
    1520             : 
    1521     1439680 : static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
    1522             :                                     struct scrub_page *spage)
    1523             : {
    1524             :         struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
    1525             :         struct scrub_bio *sbio;
    1526             :         int ret;
    1527             : 
    1528     1439680 :         mutex_lock(&wr_ctx->wr_lock);
    1529             : again:
    1530     1440709 :         if (!wr_ctx->wr_curr_bio) {
    1531       45829 :                 wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio),
    1532             :                                               GFP_NOFS);
    1533       45829 :                 if (!wr_ctx->wr_curr_bio) {
    1534           0 :                         mutex_unlock(&wr_ctx->wr_lock);
    1535           0 :                         return -ENOMEM;
    1536             :                 }
    1537       45829 :                 wr_ctx->wr_curr_bio->sctx = sctx;
    1538       45829 :                 wr_ctx->wr_curr_bio->page_count = 0;
    1539             :         }
    1540     1440709 :         sbio = wr_ctx->wr_curr_bio;
    1541     1440709 :         if (sbio->page_count == 0) {
    1542             :                 struct bio *bio;
    1543             : 
    1544       45829 :                 sbio->physical = spage->physical_for_dev_replace;
    1545       45829 :                 sbio->logical = spage->logical;
    1546       45829 :                 sbio->dev = wr_ctx->tgtdev;
    1547       45829 :                 bio = sbio->bio;
    1548       45829 :                 if (!bio) {
    1549       45829 :                         bio = btrfs_io_bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio);
    1550       45829 :                         if (!bio) {
    1551           0 :                                 mutex_unlock(&wr_ctx->wr_lock);
    1552           0 :                                 return -ENOMEM;
    1553             :                         }
    1554       45829 :                         sbio->bio = bio;
    1555             :                 }
    1556             : 
    1557       45829 :                 bio->bi_private = sbio;
    1558       45829 :                 bio->bi_end_io = scrub_wr_bio_end_io;
    1559       45829 :                 bio->bi_bdev = sbio->dev->bdev;
    1560       45829 :                 bio->bi_iter.bi_sector = sbio->physical >> 9;
    1561       45829 :                 sbio->err = 0;
    1562     2789760 :         } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
    1563     2789066 :                    spage->physical_for_dev_replace ||
    1564     1394186 :                    sbio->logical + sbio->page_count * PAGE_SIZE !=
    1565     1394186 :                    spage->logical) {
    1566        1029 :                 scrub_wr_submit(sctx);
    1567        1029 :                 goto again;
    1568             :         }
    1569             : 
    1570     1439680 :         ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
    1571     1439680 :         if (ret != PAGE_SIZE) {
    1572           0 :                 if (sbio->page_count < 1) {
    1573           0 :                         bio_put(sbio->bio);
    1574           0 :                         sbio->bio = NULL;
    1575           0 :                         mutex_unlock(&wr_ctx->wr_lock);
    1576           0 :                         return -EIO;
    1577             :                 }
    1578           0 :                 scrub_wr_submit(sctx);
    1579           0 :                 goto again;
    1580             :         }
    1581             : 
    1582     1439680 :         sbio->pagev[sbio->page_count] = spage;
    1583             :         scrub_page_get(spage);
    1584     1439680 :         sbio->page_count++;
    1585     1439680 :         if (sbio->page_count == wr_ctx->pages_per_wr_bio)
    1586       44494 :                 scrub_wr_submit(sctx);
    1587     1439680 :         mutex_unlock(&wr_ctx->wr_lock);
    1588             : 
    1589     1439680 :         return 0;
    1590             : }
    1591             : 
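
scrub_add_page_to_wr_bio() is a textbook coalescing pattern: keep one batch open, append while the next page extends the current physical and logical run, and flush when it does not or when the batch is full. A compact user-space rendering, where flushing simply pwrite()s the accumulated run (all types and names are illustrative, not kernel code):

#include <string.h>
#include <sys/types.h>
#include <unistd.h>

#define PAGE_SZ 4096
#define PER_BIO 32              /* pages per batch, cf. SCRUB_PAGES_PER_WR_BIO */

struct wr_batch {
        int fd;                 /* target device */
        off_t physical;         /* device offset of the run's first page */
        unsigned long long logical;
        int page_count;
        unsigned char buf[PER_BIO * PAGE_SZ];
};

/* write out the accumulated run, if any, and reset the batch */
static int flush_batch(struct wr_batch *b)
{
        size_t bytes = (size_t)b->page_count * PAGE_SZ;
        int ret = 0;

        if (!b->page_count)
                return 0;
        if (pwrite(b->fd, b->buf, bytes, b->physical) != (ssize_t)bytes)
                ret = -1;
        b->page_count = 0;
        return ret;
}

static int add_page(struct wr_batch *b, const unsigned char *page,
                    off_t physical, unsigned long long logical)
{
        /* flush first if this page does not extend the current run */
        if (b->page_count &&
            (physical != b->physical + (off_t)b->page_count * PAGE_SZ ||
             logical != b->logical +
                        (unsigned long long)b->page_count * PAGE_SZ))
                flush_batch(b);

        if (!b->page_count) {           /* starting a fresh run */
                b->physical = physical;
                b->logical  = logical;
        }
        memcpy(b->buf + (size_t)b->page_count * PAGE_SZ, page, PAGE_SZ);
        if (++b->page_count == PER_BIO) /* batch full: submit it */
                return flush_batch(b);
        return 0;
}
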
    1592       47004 : static void scrub_wr_submit(struct scrub_ctx *sctx)
    1593             : {
    1594             :         struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
    1595             :         struct scrub_bio *sbio;
    1596             : 
    1597       47004 :         if (!wr_ctx->wr_curr_bio)
    1598       47004 :                 return;
    1599             : 
    1600             :         sbio = wr_ctx->wr_curr_bio;
    1601       45829 :         wr_ctx->wr_curr_bio = NULL;
    1602       45829 :         WARN_ON(!sbio->bio->bi_bdev);
    1603             :         scrub_pending_bio_inc(sctx);
     1604             :         /* process all writes in a single worker thread. Then the block layer
     1605             :          * orders the requests before sending them to the driver, which
     1606             :          * doubled the write performance on spinning disks when measured
     1607             :          * with Linux 3.5 */
    1608       45829 :         btrfsic_submit_bio(WRITE, sbio->bio);
    1609             : }
    1610             : 
    1611       45829 : static void scrub_wr_bio_end_io(struct bio *bio, int err)
    1612             : {
    1613       45829 :         struct scrub_bio *sbio = bio->bi_private;
    1614       45829 :         struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
    1615             : 
    1616       45829 :         sbio->err = err;
    1617       45829 :         sbio->bio = bio;
    1618             : 
    1619       45829 :         btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper,
    1620             :                          scrub_wr_bio_end_io_worker, NULL, NULL);
    1621       45829 :         btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
    1622       45829 : }
    1623             : 
    1624       45829 : static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
    1625             : {
    1626       45829 :         struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
    1627       45829 :         struct scrub_ctx *sctx = sbio->sctx;
    1628             :         int i;
    1629             : 
    1630       45829 :         WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
    1631       45818 :         if (sbio->err) {
    1632             :                 struct btrfs_dev_replace *dev_replace =
    1633           0 :                         &sbio->sctx->dev_root->fs_info->dev_replace;
    1634             : 
    1635           0 :                 for (i = 0; i < sbio->page_count; i++) {
    1636           0 :                         struct scrub_page *spage = sbio->pagev[i];
    1637             : 
    1638           0 :                         spage->io_error = 1;
    1639           0 :                         btrfs_dev_replace_stats_inc(&dev_replace->
    1640             :                                                     num_write_errors);
    1641             :                 }
    1642             :         }
    1643             : 
    1644     1439118 :         for (i = 0; i < sbio->page_count; i++)
    1645     1439107 :                 scrub_page_put(sbio->pagev[i]);
    1646             : 
    1647       45829 :         bio_put(sbio->bio);
    1648       45829 :         kfree(sbio);
    1649       45828 :         scrub_pending_bio_dec(sctx);
    1650       45829 : }
    1651             : 
    1652     4170852 : static int scrub_checksum(struct scrub_block *sblock)
    1653             : {
    1654             :         u64 flags;
    1655             :         int ret;
    1656             : 
    1657     4170852 :         WARN_ON(sblock->page_count < 1);
    1658     4170935 :         flags = sblock->pagev[0]->flags;
    1659             :         ret = 0;
    1660     4170935 :         if (flags & BTRFS_EXTENT_FLAG_DATA)
    1661     4158619 :                 ret = scrub_checksum_data(sblock);
    1662       12316 :         else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
    1663       12294 :                 ret = scrub_checksum_tree_block(sblock);
    1664          22 :         else if (flags & BTRFS_EXTENT_FLAG_SUPER)
    1665          22 :                 (void)scrub_checksum_super(sblock);
    1666             :         else
    1667           0 :                 WARN_ON(1);
    1668     4154895 :         if (ret)
    1669           0 :                 scrub_handle_errored_block(sblock);
    1670             : 
    1671     4154895 :         return ret;
    1672             : }
    1673             : 
    1674     4157172 : static int scrub_checksum_data(struct scrub_block *sblock)
    1675             : {
    1676     4157172 :         struct scrub_ctx *sctx = sblock->sctx;
    1677             :         u8 csum[BTRFS_CSUM_SIZE];
    1678             :         u8 *on_disk_csum;
    1679             :         struct page *page;
    1680             :         void *buffer;
    1681             :         u32 crc = ~(u32)0;
    1682             :         int fail = 0;
    1683             :         u64 len;
    1684             :         int index;
    1685             : 
    1686     4157172 :         BUG_ON(sblock->page_count < 1);
    1687     4157172 :         if (!sblock->pagev[0]->have_csum)
    1688             :                 return 0;
    1689             : 
    1690     4154856 :         on_disk_csum = sblock->pagev[0]->csum;
    1691     4154856 :         page = sblock->pagev[0]->page;
    1692             :         buffer = kmap_atomic(page);
    1693             : 
    1694     4154623 :         len = sctx->sectorsize;
    1695             :         index = 0;
    1696             :         for (;;) {
    1697     4154623 :                 u64 l = min_t(u64, len, PAGE_SIZE);
    1698             : 
    1699     4154623 :                 crc = btrfs_csum_data(buffer, crc, l);
    1700             :                 kunmap_atomic(buffer);
    1701     4144942 :                 len -= l;
    1702     4144942 :                 if (len == 0)
    1703             :                         break;
    1704           0 :                 index++;
    1705           0 :                 BUG_ON(index >= sblock->page_count);
    1706           0 :                 BUG_ON(!sblock->pagev[index]->page);
    1707             :                 page = sblock->pagev[index]->page;
    1708             :                 buffer = kmap_atomic(page);
    1709           0 :         }
    1710             : 
    1711     4144942 :         btrfs_csum_final(crc, csum);
    1712     4141372 :         if (memcmp(csum, on_disk_csum, sctx->csum_size))
    1713             :                 fail = 1;
    1714             : 
    1715     4141372 :         return fail;
    1716             : }
    1717             : 
    1718       12293 : static int scrub_checksum_tree_block(struct scrub_block *sblock)
    1719             : {
    1720       12293 :         struct scrub_ctx *sctx = sblock->sctx;
    1721             :         struct btrfs_header *h;
    1722       12293 :         struct btrfs_root *root = sctx->dev_root;
    1723       12293 :         struct btrfs_fs_info *fs_info = root->fs_info;
    1724             :         u8 calculated_csum[BTRFS_CSUM_SIZE];
    1725             :         u8 on_disk_csum[BTRFS_CSUM_SIZE];
    1726             :         struct page *page;
    1727             :         void *mapped_buffer;
    1728             :         u64 mapped_size;
    1729             :         void *p;
    1730             :         u32 crc = ~(u32)0;
    1731             :         int fail = 0;
    1732             :         int crc_fail = 0;
    1733             :         u64 len;
    1734             :         int index;
    1735             : 
    1736       12293 :         BUG_ON(sblock->page_count < 1);
    1737       12293 :         page = sblock->pagev[0]->page;
    1738             :         mapped_buffer = kmap_atomic(page);
    1739             :         h = (struct btrfs_header *)mapped_buffer;
    1740       12293 :         memcpy(on_disk_csum, h->csum, sctx->csum_size);
    1741             : 
    1742             :         /*
    1743             :          * we don't use the getter functions here, as we
    1744             :          * a) don't have an extent buffer and
    1745             :          * b) the page is already kmapped
    1746             :          */
    1747             : 
    1748       24586 :         if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h))
    1749             :                 ++fail;
    1750             : 
    1751       24586 :         if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h))
    1752           0 :                 ++fail;
    1753             : 
    1754       12293 :         if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
    1755           0 :                 ++fail;
    1756             : 
    1757       12293 :         if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
    1758             :                    BTRFS_UUID_SIZE))
    1759           0 :                 ++fail;
    1760             : 
    1761       12293 :         WARN_ON(sctx->nodesize != sctx->leafsize);
    1762       12291 :         len = sctx->nodesize - BTRFS_CSUM_SIZE;
    1763             :         mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
    1764       12291 :         p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
    1765             :         index = 0;
    1766             :         for (;;) {
    1767       28332 :                 u64 l = min_t(u64, len, mapped_size);
    1768             : 
    1769       28332 :                 crc = btrfs_csum_data(p, crc, l);
    1770             :                 kunmap_atomic(mapped_buffer);
    1771       28333 :                 len -= l;
    1772       28333 :                 if (len == 0)
    1773             :                         break;
    1774       16040 :                 index++;
    1775       16040 :                 BUG_ON(index >= sblock->page_count);
    1776       16040 :                 BUG_ON(!sblock->pagev[index]->page);
    1777             :                 page = sblock->pagev[index]->page;
    1778             :                 mapped_buffer = kmap_atomic(page);
    1779             :                 mapped_size = PAGE_SIZE;
    1780             :                 p = mapped_buffer;
    1781       16041 :         }
    1782             : 
    1783       12293 :         btrfs_csum_final(crc, calculated_csum);
    1784       12293 :         if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
    1785             :                 ++crc_fail;
    1786             : 
    1787       12293 :         return fail || crc_fail;
    1788             : }
    1789             : 
    1790          22 : static int scrub_checksum_super(struct scrub_block *sblock)
    1791             : {
    1792             :         struct btrfs_super_block *s;
    1793          22 :         struct scrub_ctx *sctx = sblock->sctx;
    1794          22 :         struct btrfs_root *root = sctx->dev_root;
    1795          22 :         struct btrfs_fs_info *fs_info = root->fs_info;
    1796             :         u8 calculated_csum[BTRFS_CSUM_SIZE];
    1797             :         u8 on_disk_csum[BTRFS_CSUM_SIZE];
    1798             :         struct page *page;
    1799             :         void *mapped_buffer;
    1800             :         u64 mapped_size;
    1801             :         void *p;
    1802             :         u32 crc = ~(u32)0;
    1803             :         int fail_gen = 0;
    1804             :         int fail_cor = 0;
    1805             :         u64 len;
    1806             :         int index;
    1807             : 
    1808          22 :         BUG_ON(sblock->page_count < 1);
    1809          22 :         page = sblock->pagev[0]->page;
    1810             :         mapped_buffer = kmap_atomic(page);
    1811             :         s = (struct btrfs_super_block *)mapped_buffer;
    1812          22 :         memcpy(on_disk_csum, s->csum, sctx->csum_size);
    1813             : 
    1814          44 :         if (sblock->pagev[0]->logical != btrfs_super_bytenr(s))
    1815             :                 ++fail_cor;
    1816             : 
    1817          44 :         if (sblock->pagev[0]->generation != btrfs_super_generation(s))
    1818             :                 ++fail_gen;
    1819             : 
    1820          22 :         if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
    1821           0 :                 ++fail_cor;
    1822             : 
    1823             :         len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
    1824             :         mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
    1825          22 :         p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
    1826             :         index = 0;
    1827             :         for (;;) {
    1828             :                 u64 l = min_t(u64, len, mapped_size);
    1829             : 
    1830          22 :                 crc = btrfs_csum_data(p, crc, l);
    1831             :                 kunmap_atomic(mapped_buffer);
    1832             :                 len -= l;
    1833             :                 if (len == 0)
    1834             :                         break;
    1835             :                 index++;
    1836             :                 BUG_ON(index >= sblock->page_count);
    1837             :                 BUG_ON(!sblock->pagev[index]->page);
    1838             :                 page = sblock->pagev[index]->page;
    1839             :                 mapped_buffer = kmap_atomic(page);
    1840             :                 mapped_size = PAGE_SIZE;
    1841             :                 p = mapped_buffer;
    1842             :         }
    1843             : 
    1844          22 :         btrfs_csum_final(crc, calculated_csum);
    1845          22 :         if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
    1846           0 :                 ++fail_cor;
    1847             : 
    1848          22 :         if (fail_cor + fail_gen) {
    1849             :                 /*
    1850             :                  * if we find an error in a super block, we just report it;
    1851             :                  * the super blocks will be rewritten with the next
    1852             :                  * transaction commit anyway.
    1853             :                  */
    1854             :                 spin_lock(&sctx->stat_lock);
    1855           0 :                 ++sctx->stat.super_errors;
    1856             :                 spin_unlock(&sctx->stat_lock);
    1857           0 :                 if (fail_cor)
    1858           0 :                         btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
    1859             :                                 BTRFS_DEV_STAT_CORRUPTION_ERRS);
    1860             :                 else
    1861           0 :                         btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
    1862             :                                 BTRFS_DEV_STAT_GENERATION_ERRS);
    1863             :         }
    1864             : 
    1865          22 :         return fail_cor + fail_gen;
    1866             : }
    1867             : 
    1868             : static void scrub_block_get(struct scrub_block *sblock)
    1869             : {
    1870     4183242 :         atomic_inc(&sblock->ref_count);
    1871             : }
    1872             : 
    1873     8265757 : static void scrub_block_put(struct scrub_block *sblock)
    1874             : {
    1875    16631983 :         if (atomic_dec_and_test(&sblock->ref_count)) {
    1876             :                 int i;
    1877             : 
    1878     4177900 :                 for (i = 0; i < sblock->page_count; i++)
    1879     4189921 :                         scrub_page_put(sblock->pagev[i]);
    1880     4162133 :                 kfree(sblock);
    1881             :         }
    1882     8353580 : }
    1883             : 
    1884             : static void scrub_page_get(struct scrub_page *spage)
    1885             : {
    1886     5623179 :         atomic_inc(&spage->ref_count);
    1887             : }
    1888             : 
    1889     5621356 : static void scrub_page_put(struct scrub_page *spage)
    1890             : {
    1891    11250015 :         if (atomic_dec_and_test(&spage->ref_count)) {
    1892     4188979 :                 if (spage->page)
    1893     4189072 :                         __free_page(spage->page);
    1894     4176895 :                 kfree(spage);
    1895             :         }
    1896     5618558 : }
    1897             : 
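                     : /*
                     :  * A minimal user-space sketch (not from the btrfs sources) of the
                     :  * ref counting used above, assuming C11 atomics. A scrub_block starts
                     :  * with one reference held by its creator; every page queued into a bio
                     :  * takes another one via scrub_block_get(), and the put that drops the
                     :  * count to zero frees the object. All names here are made up.
                     :  */
                     : #include <stdatomic.h>
                     : #include <stdlib.h>
                     : 
                     : struct model_block {
                     :         atomic_int ref_count;   /* starts at 1 for the creator */
                     : };
                     : 
                     : static void model_block_get(struct model_block *b)
                     : {
                     :         atomic_fetch_add(&b->ref_count, 1);
                     : }
                     : 
                     : static void model_block_put(struct model_block *b)
                     : {
                     :         /* the put that drops the count to zero frees the block */
                     :         if (atomic_fetch_sub(&b->ref_count, 1) == 1)
                     :                 free(b);
                     : }
                     : 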
    1898      133321 : static void scrub_submit(struct scrub_ctx *sctx)
    1899             : {
    1900             :         struct scrub_bio *sbio;
    1901             : 
    1902      133321 :         if (sctx->curr == -1)
    1903      133319 :                 return;
    1904             : 
    1905      133167 :         sbio = sctx->bios[sctx->curr];
    1906      133167 :         sctx->curr = -1;
    1907             :         scrub_pending_bio_inc(sctx);
    1908             : 
    1909      133172 :         if (!sbio->bio->bi_bdev) {
    1910             :                 /*
    1911             :                  * this case should not happen. If btrfs_map_block() got
    1912             :                  * it wrong, it could happen for dev-replace operations on
    1913             :                  * missing devices when no mirrors are available, but that
    1914             :                  * situation should already fail the mount.
    1915             :                  * This case is handled correctly here (but _very_ slowly).
    1916             :                  */
    1917           0 :                 printk_ratelimited(KERN_WARNING
    1918             :                         "BTRFS: scrub_submit(bio bdev == NULL) is unexpected!\n");
    1919           0 :                 bio_endio(sbio->bio, -EIO);
    1920             :         } else {
    1921      133172 :                 btrfsic_submit_bio(READ, sbio->bio);
    1922             :         }
    1923             : }
    1924             : 
    1925     4186357 : static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
    1926             :                                     struct scrub_page *spage)
    1927             : {
    1928     4186357 :         struct scrub_block *sblock = spage->sblock;
    1929             :         struct scrub_bio *sbio;
    1930             :         int ret;
    1931             : 
    1932             : again:
    1933             :         /*
    1934             :          * grab a fresh bio or wait for one to become available
    1935             :          */
    1936     4366352 :         while (sctx->curr == -1) {
    1937             :                 spin_lock(&sctx->list_lock);
    1938      176557 :                 sctx->curr = sctx->first_free;
    1939      176557 :                 if (sctx->curr != -1) {
    1940      133170 :                         sctx->first_free = sctx->bios[sctx->curr]->next_free;
    1941      133170 :                         sctx->bios[sctx->curr]->next_free = -1;
    1942      133170 :                         sctx->bios[sctx->curr]->page_count = 0;
    1943             :                         spin_unlock(&sctx->list_lock);
    1944             :                 } else {
    1945             :                         spin_unlock(&sctx->list_lock);
    1946       86774 :                         wait_event(sctx->list_wait, sctx->first_free != -1);
    1947             :                 }
    1948             :         }
    1949     4189804 :         sbio = sctx->bios[sctx->curr];
    1950     4189804 :         if (sbio->page_count == 0) {
    1951             :                 struct bio *bio;
    1952             : 
    1953      133163 :                 sbio->physical = spage->physical;
    1954      133163 :                 sbio->logical = spage->logical;
    1955      133163 :                 sbio->dev = spage->dev;
    1956      133163 :                 bio = sbio->bio;
    1957      133163 :                 if (!bio) {
    1958      133162 :                         bio = btrfs_io_bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio);
    1959      133167 :                         if (!bio)
    1960             :                                 return -ENOMEM;
    1961      133167 :                         sbio->bio = bio;
    1962             :                 }
    1963             : 
    1964      133168 :                 bio->bi_private = sbio;
    1965      133168 :                 bio->bi_end_io = scrub_bio_end_io;
    1966      133168 :                 bio->bi_bdev = sbio->dev->bdev;
    1967      133168 :                 bio->bi_iter.bi_sector = sbio->physical >> 9;
    1968      133168 :                 sbio->err = 0;
    1969     8113282 :         } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
    1970     8111850 :                    spage->physical ||
    1971     4055209 :                    sbio->logical + sbio->page_count * PAGE_SIZE !=
    1972     8108431 :                    spage->logical ||
    1973     4053222 :                    sbio->dev != spage->dev) {
    1974        3446 :                 scrub_submit(sctx);
    1975        3446 :                 goto again;
    1976             :         }
    1977             : 
    1978     4186363 :         sbio->pagev[sbio->page_count] = spage;
    1979     4186363 :         ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
    1980     4183242 :         if (ret != PAGE_SIZE) {
    1981           0 :                 if (sbio->page_count < 1) {
    1982           0 :                         bio_put(sbio->bio);
    1983           0 :                         sbio->bio = NULL;
    1984           0 :                         return -EIO;
    1985             :                 }
    1986           0 :                 scrub_submit(sctx);
    1987           0 :                 goto again;
    1988             :         }
    1989             : 
    1990             :         scrub_block_get(sblock); /* one for the page added to the bio */
    1991     4192397 :         atomic_inc(&sblock->outstanding_pages);
    1992     4191633 :         sbio->page_count++;
    1993     4191633 :         if (sbio->page_count == sctx->pages_per_rd_bio)
    1994      129630 :                 scrub_submit(sctx);
    1995             : 
    1996             :         return 0;
    1997             : }
    1998             : 
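                     : /*
                     :  * A minimal user-space sketch (not from the btrfs sources) of the merge
                     :  * test used by scrub_add_page_to_rd_bio() above. A page can only be
                     :  * appended to the bio under construction if it continues the bio both
                     :  * physically and logically on the same device; otherwise the bio is
                     :  * submitted and a fresh one is started. The names and the 4K page size
                     :  * are assumptions for illustration.
                     :  */
                     : #include <stdbool.h>
                     : #include <stdint.h>
                     : 
                     : #define MODEL_PAGE_SIZE 4096u
                     : 
                     : struct model_bio {
                     :         uint64_t physical;      /* location of the first page in the bio */
                     :         uint64_t logical;
                     :         int dev;                /* stands in for struct btrfs_device * */
                     :         int page_count;
                     : };
                     : 
                     : static bool model_page_merges(const struct model_bio *bio,
                     :                               uint64_t physical, uint64_t logical, int dev)
                     : {
                     :         uint64_t off = (uint64_t)bio->page_count * MODEL_PAGE_SIZE;
                     : 
                     :         return bio->physical + off == physical &&
                     :                bio->logical + off == logical &&
                     :                bio->dev == dev;
                     : }
                     : 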
    1999     4171571 : static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
    2000             :                        u64 physical, struct btrfs_device *dev, u64 flags,
    2001             :                        u64 gen, int mirror_num, u8 *csum, int force,
    2002             :                        u64 physical_for_dev_replace)
    2003             : {
    2004             :         struct scrub_block *sblock;
    2005             :         int index;
    2006             : 
    2007     4171571 :         sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
    2008     4170451 :         if (!sblock) {
    2009             :                 spin_lock(&sctx->stat_lock);
    2010           0 :                 sctx->stat.malloc_errors++;
    2011             :                 spin_unlock(&sctx->stat_lock);
    2012           0 :                 return -ENOMEM;
    2013             :         }
    2014             : 
    2015             :         /* one ref inside this function, plus one for each page added to
    2016             :          * a bio later on */
    2017             :         atomic_set(&sblock->ref_count, 1);
    2018     4170451 :         sblock->sctx = sctx;
    2019     4170451 :         sblock->no_io_error_seen = 1;
    2020             : 
    2021     8357753 :         for (index = 0; len > 0; index++) {
    2022             :                 struct scrub_page *spage;
    2023     4186403 :                 u64 l = min_t(u64, len, PAGE_SIZE);
    2024             : 
    2025     4186403 :                 spage = kzalloc(sizeof(*spage), GFP_NOFS);
    2026     4183396 :                 if (!spage) {
    2027             : leave_nomem:
    2028             :                         spin_lock(&sctx->stat_lock);
    2029           0 :                         sctx->stat.malloc_errors++;
    2030             :                         spin_unlock(&sctx->stat_lock);
    2031           0 :                         scrub_block_put(sblock);
    2032           0 :                         return -ENOMEM;
    2033             :                 }
    2034     4183499 :                 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
    2035             :                 scrub_page_get(spage);
    2036     4189291 :                 sblock->pagev[index] = spage;
    2037     4189291 :                 spage->sblock = sblock;
    2038     4189291 :                 spage->dev = dev;
    2039     4189291 :                 spage->flags = flags;
    2040     4189291 :                 spage->generation = gen;
    2041     4189291 :                 spage->logical = logical;
    2042     4189291 :                 spage->physical = physical;
    2043     4189291 :                 spage->physical_for_dev_replace = physical_for_dev_replace;
    2044     4189291 :                 spage->mirror_num = mirror_num;
    2045     4189291 :                 if (csum) {
    2046     4159051 :                         spage->have_csum = 1;
    2047     4159051 :                         memcpy(spage->csum, csum, sctx->csum_size);
    2048             :                 } else {
    2049       30240 :                         spage->have_csum = 0;
    2050             :                 }
    2051     4189291 :                 sblock->page_count++;
    2052     4187255 :                 spage->page = alloc_page(GFP_NOFS);
    2053     4187255 :                 if (!spage->page)
    2054             :                         goto leave_nomem;
    2055     4187302 :                 len -= l;
    2056     4187302 :                 logical += l;
    2057     4187302 :                 physical += l;
    2058     4187302 :                 physical_for_dev_replace += l;
    2059             :         }
    2060             : 
    2061     4171350 :         WARN_ON(sblock->page_count == 0);
    2062     4189163 :         for (index = 0; index < sblock->page_count; index++) {
    2063     4186344 :                 struct scrub_page *spage = sblock->pagev[index];
    2064             :                 int ret;
    2065             : 
    2066     4186344 :                 ret = scrub_add_page_to_rd_bio(sctx, spage);
    2067     4189163 :                 if (ret) {
    2068           0 :                         scrub_block_put(sblock);
    2069           0 :                         return ret;
    2070             :                 }
    2071             :         }
    2072             : 
    2073     4173276 :         if (force)
    2074          22 :                 scrub_submit(sctx);
    2075             : 
    2076             :         /* the last ref frees, either here or in the bio completion for the last page */
    2077     4173276 :         scrub_block_put(sblock);
    2078     4173312 :         return 0;
    2079             : }
    2080             : 
    2081      133148 : static void scrub_bio_end_io(struct bio *bio, int err)
    2082             : {
    2083      133148 :         struct scrub_bio *sbio = bio->bi_private;
    2084      133148 :         struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
    2085             : 
    2086      133148 :         sbio->err = err;
    2087      133148 :         sbio->bio = bio;
    2088             : 
    2089      133148 :         btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
    2090      133172 : }
    2091             : 
    2092      133020 : static void scrub_bio_end_io_worker(struct btrfs_work *work)
    2093             : {
    2094             :         struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
    2095      133020 :         struct scrub_ctx *sctx = sbio->sctx;
    2096             :         int i;
    2097             : 
    2098      133020 :         BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
    2099      133020 :         if (sbio->err) {
    2100           0 :                 for (i = 0; i < sbio->page_count; i++) {
    2101           0 :                         struct scrub_page *spage = sbio->pagev[i];
    2102             : 
    2103           0 :                         spage->io_error = 1;
    2104           0 :                         spage->sblock->no_io_error_seen = 0;
    2105             :                 }
    2106             :         }
    2107             : 
    2108             :         /* now complete the scrub_block items that have all pages completed */
    2109     4176440 :         for (i = 0; i < sbio->page_count; i++) {
    2110     4176303 :                 struct scrub_page *spage = sbio->pagev[i];
    2111     4176303 :                 struct scrub_block *sblock = spage->sblock;
    2112             : 
    2113     8363084 :                 if (atomic_dec_and_test(&sblock->outstanding_pages))
    2114     4170724 :                         scrub_block_complete(sblock);
    2115     4170030 :                 scrub_block_put(sblock);
    2116             :         }
    2117             : 
    2118      133157 :         bio_put(sbio->bio);
    2119      133157 :         sbio->bio = NULL;
    2120             :         spin_lock(&sctx->list_lock);
    2121      133171 :         sbio->next_free = sctx->first_free;
    2122      133171 :         sctx->first_free = sbio->index;
    2123             :         spin_unlock(&sctx->list_lock);
    2124             : 
    2125      178972 :         if (sctx->is_dev_replace &&
    2126             :             atomic_read(&sctx->wr_ctx.flush_all_writes)) {
    2127        1255 :                 mutex_lock(&sctx->wr_ctx.wr_lock);
    2128        1255 :                 scrub_wr_submit(sctx);
    2129        1255 :                 mutex_unlock(&sctx->wr_ctx.wr_lock);
    2130             :         }
    2131             : 
    2132      133163 :         scrub_pending_bio_dec(sctx);
    2133      133162 : }
    2134             : 
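                     : /*
                     :  * A minimal user-space sketch (not from the btrfs sources) of the
                     :  * index-based free list that scrub_bio_end_io_worker() pushes finished
                     :  * bios back onto. Slots live in a fixed array; first_free names the
                     :  * head slot, each slot's next_free names its successor, and -1
                     :  * terminates the list. Locking is elided; names are made up.
                     :  */
                     : #define MODEL_BIOS_PER_SCTX 64
                     : 
                     : struct model_sbio {
                     :         int index;              /* fixed position in the bios[] array */
                     :         int next_free;          /* next free slot, or -1 */
                     : };
                     : 
                     : struct model_sctx {
                     :         struct model_sbio bios[MODEL_BIOS_PER_SCTX];
                     :         int first_free;         /* head of the free list, or -1 */
                     : };
                     : 
                     : static void model_release_sbio(struct model_sctx *sctx,
                     :                                struct model_sbio *sbio)
                     : {
                     :         sbio->next_free = sctx->first_free;
                     :         sctx->first_free = sbio->index;
                     : }
                     : 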
    2135     4171380 : static void scrub_block_complete(struct scrub_block *sblock)
    2136             : {
    2137     4171380 :         if (!sblock->no_io_error_seen) {
    2138           0 :                 scrub_handle_errored_block(sblock);
    2139             :         } else {
    2140             :                 /*
    2141             :                  * in the dev-replace case: if the block has a checksum
    2142             :                  * error, it is written via the repair mechanism;
    2143             :                  * otherwise it is written out here.
    2144             :                  */
    2145     4171380 :                 if (!scrub_checksum(sblock) && sblock->sctx->is_dev_replace)
    2146     1433167 :                         scrub_write_block_to_dev_replace(sblock);
    2147             :         }
    2148     4154586 : }
    2149             : 
    2150     4160780 : static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
    2151             :                            u8 *csum)
    2152             : {
    2153             :         struct btrfs_ordered_sum *sum = NULL;
    2154             :         unsigned long index;
    2155             :         unsigned long num_sectors;
    2156             : 
    2157     8341248 :         while (!list_empty(&sctx->csum_list)) {
    2158     4167655 :                 sum = list_first_entry(&sctx->csum_list,
    2159             :                                        struct btrfs_ordered_sum, list);
    2160     4167655 :                 if (sum->bytenr > logical)
    2161             :                         return 0;
    2162     4167593 :                 if (sum->bytenr + sum->len > logical)
    2163             :                         break;
    2164             : 
    2165        9874 :                 ++sctx->stat.csum_discards;
    2166        9874 :                 list_del(&sum->list);
    2167        9874 :                 kfree(sum);
    2168             :                 sum = NULL;
    2169             :         }
    2170     4160688 :         if (!sum)
    2171             :                 return 0;
    2172             : 
    2173     4157724 :         index = ((u32)(logical - sum->bytenr)) / sctx->sectorsize;
    2174     4157724 :         num_sectors = sum->len / sctx->sectorsize;
    2175     4157724 :         memcpy(csum, sum->sums + index, sctx->csum_size);
    2176     4157724 :         if (index == num_sectors - 1) {
    2177      270282 :                 list_del(&sum->list);
    2178      270257 :                 kfree(sum);
    2179             :         }
    2180             :         return 1;
    2181             : }
    2182             : 
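                     : /*
                     :  * A minimal sketch (not from the btrfs sources) of the index arithmetic
                     :  * used by scrub_find_csum() above, with made-up numbers. Assuming a 4K
                     :  * sectorsize and an ordered-sum item covering 64K (16 sectors) starting
                     :  * at bytenr 1M, the csum for logical 1M + 12K sits at index 12K / 4K = 3.
                     :  */
                     : #include <assert.h>
                     : #include <stdint.h>
                     : 
                     : int main(void)
                     : {
                     :         uint64_t sectorsize = 4096;
                     :         uint64_t bytenr = 1048576;      /* start of the checksummed range */
                     :         uint64_t len = 65536;           /* 16 sectors worth of csums */
                     :         uint64_t logical = bytenr + 12288;
                     : 
                     :         uint64_t index = (logical - bytenr) / sectorsize;
                     :         uint64_t num_sectors = len / sectorsize;
                     : 
                     :         assert(index == 3 && num_sectors == 16);
                     :         return 0;
                     : }
                     : 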
    2183             : /* scrub_extent() tries to collect up to 64 kB for each bio */
    2184      290439 : static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
    2185             :                         u64 physical, struct btrfs_device *dev, u64 flags,
    2186             :                         u64 gen, int mirror_num, u64 physical_for_dev_replace)
    2187             : {
    2188             :         int ret;
    2189             :         u8 csum[BTRFS_CSUM_SIZE];
    2190             :         u32 blocksize;
    2191             : 
    2192      290439 :         if (flags & BTRFS_EXTENT_FLAG_DATA) {
    2193      278145 :                 blocksize = sctx->sectorsize;
    2194             :                 spin_lock(&sctx->stat_lock);
    2195      278215 :                 sctx->stat.data_extents_scrubbed++;
    2196      278215 :                 sctx->stat.data_bytes_scrubbed += len;
    2197             :                 spin_unlock(&sctx->stat_lock);
    2198       12294 :         } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
    2199       12294 :                 WARN_ON(sctx->nodesize != sctx->leafsize);
    2200       12294 :                 blocksize = sctx->nodesize;
    2201             :                 spin_lock(&sctx->stat_lock);
    2202       12294 :                 sctx->stat.tree_extents_scrubbed++;
    2203       12294 :                 sctx->stat.tree_bytes_scrubbed += len;
    2204             :                 spin_unlock(&sctx->stat_lock);
    2205             :         } else {
    2206           0 :                 blocksize = sctx->sectorsize;
    2207           0 :                 WARN_ON(1);
    2208             :         }
    2209             : 
    2210     4464504 :         while (len) {
    2211     4173162 :                 u64 l = min_t(u64, len, blocksize);
    2212             :                 int have_csum = 0;
    2213             : 
    2214     4173162 :                 if (flags & BTRFS_EXTENT_FLAG_DATA) {
    2215             :                         /* push csums to sbio */
    2216     4160825 :                         have_csum = scrub_find_csum(sctx, logical, l, csum);
    2217     4160190 :                         if (have_csum == 0)
    2218        2976 :                                 ++sctx->stat.no_csum;
    2219     4160190 :                         if (sctx->is_dev_replace && !have_csum) {
    2220        1104 :                                 ret = copy_nocow_pages(sctx, logical, l,
    2221             :                                                        mirror_num,
    2222             :                                                       physical_for_dev_replace);
    2223        1104 :                                 goto behind_scrub_pages;
    2224             :                         }
    2225             :                 }
    2226     4171423 :                 ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
    2227             :                                   mirror_num, have_csum ? csum : NULL, 0,
    2228             :                                   physical_for_dev_replace);
    2229             : behind_scrub_pages:
    2230     4174005 :                 if (ret)
    2231             :                         return ret;
    2232     4174005 :                 len -= l;
    2233     4174005 :                 logical += l;
    2234     4174005 :                 physical += l;
    2235     4174005 :                 physical_for_dev_replace += l;
    2236             :         }
    2237             :         return 0;
    2238             : }
    2239             : 
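                     : /*
                     :  * A minimal user-space sketch (not from the btrfs sources) of how
                     :  * scrub_extent() above chops an extent into blocksize-sized pieces.
                     :  * Each pass consumes min(len, blocksize) and advances the logical and
                     :  * physical cursors in lockstep, so only the final piece can be short.
                     :  * Names are made up for illustration.
                     :  */
                     : #include <stdint.h>
                     : #include <stdio.h>
                     : 
                     : static void model_scrub_extent(uint64_t logical, uint64_t len,
                     :                                uint64_t physical, uint32_t blocksize)
                     : {
                     :         while (len) {
                     :                 uint64_t l = len < blocksize ? len : blocksize;
                     : 
                     :                 printf("piece: logical=%llu physical=%llu len=%llu\n",
                     :                        (unsigned long long)logical,
                     :                        (unsigned long long)physical,
                     :                        (unsigned long long)l);
                     :                 len -= l;
                     :                 logical += l;
                     :                 physical += l;
                     :         }
                     : }
                     : 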
    2240             : /*
    2241             :  * Given a physical address, this will calculate its
    2242             :  * logical offset. If this is a parity stripe, it will return
    2243             :  * the leftmost data stripe's logical offset.
    2244             :  *
    2245             :  * Returns 0 if it is a data stripe, 1 if it is a parity stripe.
    2246             :  */
    2247           0 : static int get_raid56_logic_offset(u64 physical, int num,
    2248             :                                    struct map_lookup *map, u64 *offset)
    2249             : {
    2250             :         int i;
    2251             :         int j = 0;
    2252             :         u64 stripe_nr;
    2253             :         u64 last_offset;
    2254             :         int stripe_index;
    2255             :         int rot;
    2256             : 
    2257           0 :         last_offset = (physical - map->stripes[num].physical) *
    2258             :                       nr_data_stripes(map);
    2259           0 :         *offset = last_offset;
    2260           0 :         for (i = 0; i < nr_data_stripes(map); i++) {
    2261           0 :                 *offset = last_offset + i * map->stripe_len;
    2262             : 
    2263             :                 stripe_nr = *offset;
    2264           0 :                 do_div(stripe_nr, map->stripe_len);
    2265           0 :                 do_div(stripe_nr, nr_data_stripes(map));
    2266             : 
    2267             :                 /* Work out the disk rotation on this stripe-set */
    2268           0 :                 rot = do_div(stripe_nr, map->num_stripes);
    2269             :                 /* calculate which stripe this data is located on */
    2270           0 :                 rot += i;
    2271           0 :                 stripe_index = rot % map->num_stripes;
    2272           0 :                 if (stripe_index == num)
    2273             :                         return 0;
    2274           0 :                 if (stripe_index < num)
    2275           0 :                         j++;
    2276             :         }
    2277           0 :         *offset = last_offset + j * map->stripe_len;
    2278           0 :         return 1;
    2279             : }
    2280             : 
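                     : /*
                     :  * A user-space sketch (not from the btrfs sources) modeling
                     :  * get_raid56_logic_offset() above, with do_div() replaced by plain
                     :  * 64-bit division. The geometry values and names are assumptions for
                     :  * illustration; e.g. a 3-device RAID5 has nr_data_stripes = 2 and the
                     :  * parity rotates by one device per stripe-set.
                     :  */
                     : #include <stdint.h>
                     : 
                     : struct model_map {
                     :         uint64_t stripe_len;            /* e.g. 64K */
                     :         int num_stripes;                /* devices in the chunk */
                     :         int nr_data_stripes;            /* num_stripes minus parity */
                     :         uint64_t stripe_physical[16];   /* per-device chunk start */
                     : };
                     : 
                     : /* returns 0 for a data stripe, 1 for a parity stripe */
                     : static int model_raid56_logic_offset(uint64_t physical, int num,
                     :                                      const struct model_map *map,
                     :                                      uint64_t *offset)
                     : {
                     :         uint64_t last_offset = (physical - map->stripe_physical[num]) *
                     :                                map->nr_data_stripes;
                     :         int j = 0;
                     : 
                     :         for (int i = 0; i < map->nr_data_stripes; i++) {
                     :                 uint64_t stripe_nr;
                     :                 int rot, stripe_index;
                     : 
                     :                 *offset = last_offset + i * map->stripe_len;
                     :                 stripe_nr = *offset / map->stripe_len / map->nr_data_stripes;
                     :                 /* work out the rotation of this stripe-set, then the rank */
                     :                 rot = (int)(stripe_nr % map->num_stripes) + i;
                     :                 stripe_index = rot % map->num_stripes;
                     :                 if (stripe_index == num)
                     :                         return 0;
                     :                 if (stripe_index < num)
                     :                         j++;
                     :         }
                     :         *offset = last_offset + j * map->stripe_len;
                     :         return 1;
                     : }
                     : 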
    2281         113 : static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
    2282             :                                            struct map_lookup *map,
    2283             :                                            struct btrfs_device *scrub_dev,
    2284             :                                            int num, u64 base, u64 length,
    2285             :                                            int is_dev_replace)
    2286             : {
    2287             :         struct btrfs_path *path;
    2288        9678 :         struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
    2289         113 :         struct btrfs_root *root = fs_info->extent_root;
    2290         113 :         struct btrfs_root *csum_root = fs_info->csum_root;
    2291             :         struct btrfs_extent_item *extent;
    2292             :         struct blk_plug plug;
    2293             :         u64 flags;
    2294             :         int ret;
    2295             :         int slot;
    2296             :         u64 nstripes;
    2297       48725 :         struct extent_buffer *l;
    2298             :         struct btrfs_key key;
    2299             :         u64 physical;
    2300             :         u64 logical;
    2301             :         u64 logic_end;
    2302             :         u64 physical_end;
    2303             :         u64 generation;
    2304             :         int mirror_num;
    2305             :         struct reada_control *reada1;
    2306             :         struct reada_control *reada2;
    2307             :         struct btrfs_key key_start;
    2308             :         struct btrfs_key key_end;
    2309             :         u64 increment = map->stripe_len;
    2310             :         u64 offset;
    2311             :         u64 extent_logical;
    2312             :         u64 extent_physical;
    2313             :         u64 extent_len;
    2314             :         struct btrfs_device *extent_dev;
    2315             :         int extent_mirror_num;
    2316             :         int stop_loop = 0;
    2317             : 
    2318             :         nstripes = length;
    2319         113 :         physical = map->stripes[num].physical;
    2320         113 :         offset = 0;
    2321         113 :         do_div(nstripes, map->stripe_len);
    2322         113 :         if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
    2323           9 :                 offset = map->stripe_len * num;
    2324           9 :                 increment = map->stripe_len * map->num_stripes;
    2325             :                 mirror_num = 1;
    2326         104 :         } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
    2327           0 :                 int factor = map->num_stripes / map->sub_stripes;
    2328           0 :                 offset = map->stripe_len * (num / map->sub_stripes);
    2329           0 :                 increment = map->stripe_len * factor;
    2330           0 :                 mirror_num = num % map->sub_stripes + 1;
    2331         104 :         } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
    2332          30 :                 increment = map->stripe_len;
    2333          30 :                 mirror_num = num % map->num_stripes + 1;
    2334          74 :         } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
    2335          24 :                 increment = map->stripe_len;
    2336          24 :                 mirror_num = num % map->num_stripes + 1;
    2337          50 :         } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
    2338             :                                 BTRFS_BLOCK_GROUP_RAID6)) {
    2339           0 :                 get_raid56_logic_offset(physical, num, map, &offset);
    2340           0 :                 increment = map->stripe_len * nr_data_stripes(map);
    2341             :                 mirror_num = 1;
    2342             :         } else {
    2343          50 :                 increment = map->stripe_len;
    2344             :                 mirror_num = 1;
    2345             :         }
    2346             : 
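                     :         /*
                     :          * Worked example with assumed numbers: for RAID0 with
                     :          * num_stripes = 4, stripe_len = 64K and num = 2, the geometry
                     :          * just computed gives offset = 64K * 2 = 128K into the chunk
                     :          * and increment = 64K * 4 = 256K, i.e. this device holds every
                     :          * fourth 64K stripe. For RAID10 with sub_stripes = 2, mirrored
                     :          * pairs share one stripe position, so offset and increment are
                     :          * computed over num_stripes / sub_stripes positions and
                     :          * mirror_num = num % sub_stripes + 1 selects the copy.
                     :          */
                     : 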
    2347         113 :         path = btrfs_alloc_path();
    2348         113 :         if (!path)
    2349             :                 return -ENOMEM;
    2350             : 
    2351             :         /*
    2352             :          * work on the commit root. The related disk blocks are static
    2353             :          * as long as COW is applied. This means it is safe to rewrite
    2354             :          * them to repair disk errors without any race conditions.
    2355             :          */
    2356         113 :         path->search_commit_root = 1;
    2357         113 :         path->skip_locking = 1;
    2358             : 
    2359             :         /*
    2360             :          * trigger the readahead for the extent tree and the csum tree
    2361             :          * and wait for completion. During readahead, the scrub is
    2362             :          * officially paused so as not to hold off transaction commits.
    2363             :          */
    2364         113 :         logical = base + offset;
    2365         113 :         physical_end = physical + nstripes * map->stripe_len;
    2366         113 :         if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
    2367             :                          BTRFS_BLOCK_GROUP_RAID6)) {
    2368           0 :                 get_raid56_logic_offset(physical_end, num,
    2369             :                                         map, &logic_end);
    2370           0 :                 logic_end += base;
    2371             :         } else {
    2372         113 :                 logic_end = logical + increment * nstripes;
    2373             :         }
    2374         113 :         wait_event(sctx->list_wait,
    2375             :                    atomic_read(&sctx->bios_in_flight) == 0);
    2376         113 :         scrub_blocked_if_needed(fs_info);
    2377             : 
    2378             :         /* FIXME it might be better to start readahead at commit root */
    2379         113 :         key_start.objectid = logical;
    2380         113 :         key_start.type = BTRFS_EXTENT_ITEM_KEY;
    2381         113 :         key_start.offset = (u64)0;
    2382         113 :         key_end.objectid = logic_end;
    2383         113 :         key_end.type = BTRFS_METADATA_ITEM_KEY;
    2384         113 :         key_end.offset = (u64)-1;
    2385         113 :         reada1 = btrfs_reada_add(root, &key_start, &key_end);
    2386             : 
    2387         113 :         key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
    2388         113 :         key_start.type = BTRFS_EXTENT_CSUM_KEY;
    2389         113 :         key_start.offset = logical;
    2390         113 :         key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
    2391         113 :         key_end.type = BTRFS_EXTENT_CSUM_KEY;
    2392         113 :         key_end.offset = logic_end;
    2393         113 :         reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
    2394             : 
    2395         113 :         if (!IS_ERR(reada1))
    2396         113 :                 btrfs_reada_wait(reada1);
    2397         113 :         if (!IS_ERR(reada2))
    2398         113 :                 btrfs_reada_wait(reada2);
    2399             : 
    2400             : 
    2401             :         /*
    2402             :          * collect all data csums for the stripe to avoid seeking during
    2403             :          * the scrub. This might currently (crc32) end up being about 1MB.
    2404             :          */
    2405         113 :         blk_start_plug(&plug);
    2406             : 
    2407             :         /*
    2408             :          * now find all extents for each stripe and scrub them
    2409             :          */
    2410             :         ret = 0;
    2411        9679 :         while (physical < physical_end) {
    2412             :                 /* for raid56, we skip the parity stripe */
    2413        9566 :                 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
    2414             :                                 BTRFS_BLOCK_GROUP_RAID6)) {
    2415           0 :                         ret = get_raid56_logic_offset(physical, num,
    2416             :                                         map, &logical);
    2417           0 :                         logical += base;
    2418           0 :                         if (ret)
    2419             :                                 goto skip;
    2420             :                 }
    2421             :                 /*
    2422             :                  * canceled?
    2423             :                  */
    2424       19131 :                 if (atomic_read(&fs_info->scrub_cancel_req) ||
    2425             :                     atomic_read(&sctx->cancel_req)) {
    2426             :                         ret = -ECANCELED;
    2427             :                         goto out;
    2428             :                 }
    2429             :                 /*
    2430             :                  * check to see if we have to pause
    2431             :                  */
    2432        9565 :                 if (atomic_read(&fs_info->scrub_pause_req)) {
    2433             :                         /* push queued extents */
    2434             :                         atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
    2435           0 :                         scrub_submit(sctx);
    2436           0 :                         mutex_lock(&sctx->wr_ctx.wr_lock);
    2437           0 :                         scrub_wr_submit(sctx);
    2438           0 :                         mutex_unlock(&sctx->wr_ctx.wr_lock);
    2439           0 :                         wait_event(sctx->list_wait,
    2440             :                                    atomic_read(&sctx->bios_in_flight) == 0);
    2441             :                         atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
    2442           0 :                         scrub_blocked_if_needed(fs_info);
    2443             :                 }
    2444             : 
    2445        9565 :                 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
    2446           0 :                         key.type = BTRFS_METADATA_ITEM_KEY;
    2447             :                 else
    2448        9565 :                         key.type = BTRFS_EXTENT_ITEM_KEY;
    2449        9565 :                 key.objectid = logical;
    2450        9565 :                 key.offset = (u64)-1;
    2451             : 
    2452        9565 :                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
    2453        9564 :                 if (ret < 0)
    2454             :                         goto out;
    2455             : 
    2456        9564 :                 if (ret > 0) {
    2457        9564 :                         ret = btrfs_previous_extent_item(root, path, 0);
    2458        9565 :                         if (ret < 0)
    2459             :                                 goto out;
    2460        9565 :                         if (ret > 0) {
    2461             :                                 /* there's no smaller item, so stick with the
    2462             :                                  * larger one */
    2463          42 :                                 btrfs_release_path(path);
    2464          42 :                                 ret = btrfs_search_slot(NULL, root, &key,
    2465             :                                                         path, 0, 0);
    2466          42 :                                 if (ret < 0)
    2467             :                                         goto out;
    2468             :                         }
    2469             :                 }
    2470             : 
    2471             :                 stop_loop = 0;
    2472             :                 while (1) {
    2473             :                         u64 bytes;
    2474             : 
    2475       48725 :                         l = path->nodes[0];
    2476       48725 :                         slot = path->slots[0];
    2477       97450 :                         if (slot >= btrfs_header_nritems(l)) {
    2478         393 :                                 ret = btrfs_next_leaf(root, path);
    2479         393 :                                 if (ret == 0)
    2480         373 :                                         continue;
    2481          20 :                                 if (ret < 0)
    2482             :                                         goto out;
    2483             : 
    2484             :                                 stop_loop = 1;
    2485             :                                 break;
    2486             :                         }
    2487       48332 :                         btrfs_item_key_to_cpu(l, &key, slot);
    2488             : 
    2489       48335 :                         if (key.type == BTRFS_METADATA_ITEM_KEY)
    2490           0 :                                 bytes = root->leafsize;
    2491             :                         else
    2492       48335 :                                 bytes = key.offset;
    2493             : 
    2494       48335 :                         if (key.objectid + bytes <= logical)
    2495             :                                 goto next;
    2496             : 
    2497       47282 :                         if (key.type != BTRFS_EXTENT_ITEM_KEY &&
    2498             :                             key.type != BTRFS_METADATA_ITEM_KEY)
    2499             :                                 goto next;
    2500             : 
    2501       47058 :                         if (key.objectid >= logical + map->stripe_len) {
    2502             :                                 /* out of this device extent */
    2503        9545 :                                 if (key.objectid >= logic_end)
    2504             :                                         stop_loop = 1;
    2505             :                                 break;
    2506             :                         }
    2507             : 
    2508       37512 :                         extent = btrfs_item_ptr(l, slot,
    2509             :                                                 struct btrfs_extent_item);
    2510             :                         flags = btrfs_extent_flags(l, extent);
    2511             :                         generation = btrfs_extent_generation(l, extent);
    2512             : 
    2513       37766 :                         if (key.objectid < logical &&
    2514         258 :                             (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
    2515           0 :                                 btrfs_err(fs_info,
    2516             :                                            "scrub: tree block %llu spanning "
    2517             :                                            "stripes, ignored. logical=%llu",
    2518             :                                        key.objectid, logical);
    2519           0 :                                 goto next;
    2520             :                         }
    2521             : 
    2522             : again:
    2523      290463 :                         extent_logical = key.objectid;
    2524             :                         extent_len = bytes;
    2525             : 
    2526             :                         /*
    2527             :                          * trim extent to this stripe
    2528             :                          */
    2529      290463 :                         if (extent_logical < logical) {
    2530      253210 :                                 extent_len -= logical - extent_logical;
    2531             :                                 extent_logical = logical;
    2532             :                         }
    2533      580926 :                         if (extent_logical + extent_len >
    2534      290463 :                             logical + map->stripe_len) {
    2535      253207 :                                 extent_len = logical + map->stripe_len -
    2536             :                                              extent_logical;
    2537             :                         }
    2538             : 
    2539      290463 :                         extent_physical = extent_logical - logical + physical;
    2540      290463 :                         extent_dev = scrub_dev;
    2541      290463 :                         extent_mirror_num = mirror_num;
    2542      290463 :                         if (is_dev_replace)
    2543      102237 :                                 scrub_remap_extent(fs_info, extent_logical,
    2544             :                                                    extent_len, &extent_physical,
    2545             :                                                    &extent_dev,
    2546             :                                                    &extent_mirror_num);
    2547             : 
    2548      580926 :                         ret = btrfs_lookup_csums_range(csum_root, logical,
    2549      290463 :                                                 logical + map->stripe_len - 1,
    2550             :                                                 &sctx->csum_list, 1);
    2551      290450 :                         if (ret)
    2552             :                                 goto out;
    2553             : 
    2554      290450 :                         ret = scrub_extent(sctx, extent_logical, extent_len,
    2555             :                                            extent_physical, extent_dev, flags,
    2556             :                                            generation, extent_mirror_num,
    2557      290450 :                                            extent_logical - logical + physical);
    2558      290473 :                         if (ret)
    2559             :                                 goto out;
    2560             : 
    2561      290473 :                         scrub_free_csums(sctx);
    2562      580938 :                         if (extent_logical + extent_len <
    2563      290469 :                             key.objectid + bytes) {
    2564      253217 :                                 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
    2565             :                                         BTRFS_BLOCK_GROUP_RAID6)) {
    2566             :                                         /*
    2567             :                                          * loop until we find the next data
    2568             :                                          * stripe or have finished all stripes.
    2569             :                                          */
    2570             :                                         do {
    2571           2 :                                                 physical += map->stripe_len;
    2572           2 :                                                 ret = get_raid56_logic_offset(
    2573             :                                                                 physical, num,
    2574             :                                                                 map, &logical);
    2575           0 :                                                 logical += base;
    2576           0 :                                         } while (physical < physical_end && ret);
    2577             :                                 } else {
    2578      253217 :                                         physical += map->stripe_len;
    2579      253217 :                                         logical += increment;
    2580             :                                 }
    2581      253215 :                                 if (logical < key.objectid + bytes) {
    2582      252956 :                                         cond_resched();
    2583      252955 :                                         goto again;
    2584             :                                 }
    2585             : 
    2586         259 :                                 if (physical >= physical_end) {
    2587             :                                         stop_loop = 1;
    2588             :                                         break;
    2589             :                                 }
    2590             :                         }
    2591             : next:
    2592       38788 :                         path->slots[0]++;
    2593             :                 }
    2594        9565 :                 btrfs_release_path(path);
    2595             : skip:
    2596        9564 :                 logical += increment;
    2597        9564 :                 physical += map->stripe_len;
    2598             :                 spin_lock(&sctx->stat_lock);
    2599        9565 :                 if (stop_loop)
    2600         112 :                         sctx->stat.last_physical = map->stripes[num].physical +
    2601             :                                                    length;
    2602             :                 else
    2603        9453 :                         sctx->stat.last_physical = physical;
    2604             :                 spin_unlock(&sctx->stat_lock);
    2605        9565 :                 if (stop_loop)
    2606             :                         break;
    2607             :         }
    2608             : out:
    2609             :         /* push queued extents */
    2610         114 :         scrub_submit(sctx);
    2611         113 :         mutex_lock(&sctx->wr_ctx.wr_lock);
    2612         113 :         scrub_wr_submit(sctx);
    2613         113 :         mutex_unlock(&sctx->wr_ctx.wr_lock);
    2614             : 
    2615         113 :         blk_finish_plug(&plug);
    2616         113 :         btrfs_free_path(path);
    2617         113 :         return ret < 0 ? ret : 0;
    2618             : }
    2619             : 
    2620         113 : static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
    2621             :                                           struct btrfs_device *scrub_dev,
    2622             :                                           u64 chunk_tree, u64 chunk_objectid,
    2623             :                                           u64 chunk_offset, u64 length,
    2624             :                                           u64 dev_offset, int is_dev_replace)
    2625             : {
    2626             :         struct btrfs_mapping_tree *map_tree =
    2627         113 :                 &sctx->dev_root->fs_info->mapping_tree;
    2628             :         struct map_lookup *map;
    2629             :         struct extent_map *em;
    2630             :         int i;
    2631             :         int ret = 0;
    2632             : 
    2633         113 :         read_lock(&map_tree->map_tree.lock);
    2634         113 :         em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
    2635             :         read_unlock(&map_tree->map_tree.lock);
    2636             : 
    2637         113 :         if (!em)
    2638             :                 return -EINVAL;
    2639             : 
    2640         113 :         map = (struct map_lookup *)em->bdev;
    2641         113 :         if (em->start != chunk_offset)
    2642             :                 goto out;
    2643             : 
    2644         113 :         if (em->len < length)
    2645             :                 goto out;
    2646             : 
    2647         175 :         for (i = 0; i < map->num_stripes; ++i) {
    2648         313 :                 if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
    2649         137 :                     map->stripes[i].physical == dev_offset) {
    2650         113 :                         ret = scrub_stripe(sctx, map, scrub_dev, i,
    2651             :                                            chunk_offset, length,
    2652             :                                            is_dev_replace);
    2653         113 :                         if (ret)
    2654             :                                 goto out;
    2655             :                 }
    2656             :         }
    2657             : out:
    2658         113 :         free_extent_map(em);
    2659             : 
    2660             :         return ret;
    2661             : }
    2662             : 
    2663             : static noinline_for_stack
    2664          19 : int scrub_enumerate_chunks(struct scrub_ctx *sctx,
    2665             :                            struct btrfs_device *scrub_dev, u64 start, u64 end,
    2666             :                            int is_dev_replace)
    2667             : {
    2668             :         struct btrfs_dev_extent *dev_extent = NULL;
    2669             :         struct btrfs_path *path;
    2670          19 :         struct btrfs_root *root = sctx->dev_root;
    2671          19 :         struct btrfs_fs_info *fs_info = root->fs_info;
    2672             :         u64 length;
    2673             :         u64 chunk_tree;
    2674             :         u64 chunk_objectid;
    2675             :         u64 chunk_offset;
    2676             :         int ret;
    2677             :         int slot;
    2678             :         struct extent_buffer *l;
    2679             :         struct btrfs_key key;
    2680             :         struct btrfs_key found_key;
    2681             :         struct btrfs_block_group_cache *cache;
    2682             :         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
    2683             : 
    2684          19 :         path = btrfs_alloc_path();
    2685          19 :         if (!path)
    2686             :                 return -ENOMEM;
    2687             : 
    2688          19 :         path->reada = 2;
    2689          19 :         path->search_commit_root = 1;
    2690          19 :         path->skip_locking = 1;
    2691             : 
    2692          19 :         key.objectid = scrub_dev->devid;
    2693          19 :         key.offset = 0ull;
    2694          19 :         key.type = BTRFS_DEV_EXTENT_KEY;
    2695             : 
    2696             :         while (1) {
    2697         131 :                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
    2698         131 :                 if (ret < 0)
    2699             :                         break;
    2700         131 :                 if (ret > 0) {
    2701          46 :                         if (path->slots[0] >=
    2702          23 :                             btrfs_header_nritems(path->nodes[0])) {
    2703          14 :                                 ret = btrfs_next_leaf(root, path);
    2704          14 :                                 if (ret)
    2705             :                                         break;
    2706             :                         }
    2707             :                 }
    2708             : 
    2709         117 :                 l = path->nodes[0];
    2710         117 :                 slot = path->slots[0];
    2711             : 
    2712         117 :                 btrfs_item_key_to_cpu(l, &found_key, slot);
    2713             : 
    2714         117 :                 if (found_key.objectid != scrub_dev->devid)
    2715             :                         break;
    2716             : 
    2717         113 :                 if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY)
    2718             :                         break;
    2719             : 
    2720         113 :                 if (found_key.offset >= end)
    2721             :                         break;
    2722             : 
    2723         113 :                 if (found_key.offset < key.offset)
    2724             :                         break;
    2725             : 
    2726         113 :                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
    2727             :                 length = btrfs_dev_extent_length(l, dev_extent);
    2728             : 
    2729         113 :                 if (found_key.offset + length <= start)
    2730             :                         goto skip;
    2731             : 
    2732             :                 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
    2733             :                 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
    2734             :                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
    2735             : 
    2736             :                 /*
    2737             :                  * get a reference on the corresponding block group to prevent
    2738             :                  * the chunk from going away while we scrub it
    2739             :                  */
    2740         113 :                 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
    2741             : 
    2742             :                 /* some chunks are removed but not yet committed to disk;
    2743             :                  * continue scrubbing */
    2744         113 :                 if (!cache)
    2745             :                         goto skip;
    2746             : 
    2747         113 :                 dev_replace->cursor_right = found_key.offset + length;
    2748         113 :                 dev_replace->cursor_left = found_key.offset;
    2749         113 :                 dev_replace->item_needs_writeback = 1;
    2750         113 :                 ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid,
    2751             :                                   chunk_offset, length, found_key.offset,
    2752             :                                   is_dev_replace);
    2753             : 
    2754             :                 /*
    2755             :                  * flush and submit all pending read and write bios, then
    2756             :                  * wait for them.
    2757             :                  * Note that in the dev replace case, a read request causes
    2758             :                  * write requests that are submitted in the read completion
    2759             :                  * worker. Therefore in the current situation, it is required
    2760             :                  * that all write requests are flushed, so that all read and
    2761             :                  * write requests are really completed when bios_in_flight
    2762             :                  * changes to 0.
    2763             :                  */
    2764             :                 atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
    2765         113 :                 scrub_submit(sctx);
    2766         113 :                 mutex_lock(&sctx->wr_ctx.wr_lock);
    2767         113 :                 scrub_wr_submit(sctx);
    2768         113 :                 mutex_unlock(&sctx->wr_ctx.wr_lock);
    2769             : 
    2770        3451 :                 wait_event(sctx->list_wait,
    2771             :                            atomic_read(&sctx->bios_in_flight) == 0);
    2772         113 :                 atomic_inc(&fs_info->scrubs_paused);
    2773         113 :                 wake_up(&fs_info->scrub_pause_wait);
    2774             : 
    2775             :                 /*
    2776             :                  * This must be done before we decrease @scrub_paused.
    2777             :                  * Make sure we don't block transaction commit while
    2778             :                  * we wait for pending workers to finish.
    2779             :                  */
    2780         800 :                 wait_event(sctx->list_wait,
    2781             :                            atomic_read(&sctx->workers_pending) == 0);
    2782             :                 atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
    2783             : 
    2784         113 :                 mutex_lock(&fs_info->scrub_lock);
    2785         113 :                 __scrub_blocked_if_needed(fs_info);
    2786             :                 atomic_dec(&fs_info->scrubs_paused);
    2787         113 :                 mutex_unlock(&fs_info->scrub_lock);
    2788         113 :                 wake_up(&fs_info->scrub_pause_wait);
    2789             : 
    2790         113 :                 btrfs_put_block_group(cache);
    2791         113 :                 if (ret)
    2792             :                         break;
    2793         158 :                 if (is_dev_replace &&
    2794             :                     atomic64_read(&dev_replace->num_write_errors) > 0) {
    2795             :                         ret = -EIO;
    2796             :                         break;
    2797             :                 }
    2798         112 :                 if (sctx->stat.malloc_errors > 0) {
    2799             :                         ret = -ENOMEM;
    2800             :                         break;
    2801             :                 }
    2802             : 
    2803         112 :                 dev_replace->cursor_left = dev_replace->cursor_right;
    2804         112 :                 dev_replace->item_needs_writeback = 1;
    2805             : skip:
    2806         112 :                 key.offset = found_key.offset + length;
    2807         112 :                 btrfs_release_path(path);
    2808         112 :         }
    2809             : 
    2810          19 :         btrfs_free_path(path);
    2811             : 
    2812             :         /*
    2813             :          * ret can still be 1 from search_slot or next_leaf,
    2814             :          * that's not an error
    2815             :          */
    2816          19 :         return ret < 0 ? ret : 0;
    2817             : }
    2818             : 
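
A hedged aside on the pattern above: the flush-then-wait sequence (set
flush_all_writes, submit both bio queues, then wait_event() until
bios_in_flight reaches zero) is a counted in-flight barrier. The sketch
below models it in userspace with POSIX threads; struct inflight and its
helpers are illustrative names, not kernel or btrfs API.

        #include <pthread.h>

        struct inflight {
                pthread_mutex_t lock;
                pthread_cond_t  done;
                int             count;          /* bios still in flight */
        };

        #define INFLIGHT_INIT \
                { PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0 }

        static void inflight_inc(struct inflight *f)    /* bio submitted */
        {
                pthread_mutex_lock(&f->lock);
                f->count++;
                pthread_mutex_unlock(&f->lock);
        }

        static void inflight_dec(struct inflight *f)    /* bio completed */
        {
                pthread_mutex_lock(&f->lock);
                if (--f->count == 0)
                        pthread_cond_broadcast(&f->done);
                pthread_mutex_unlock(&f->lock);
        }

        /* analogue of wait_event(..., bios_in_flight == 0) */
        static void inflight_wait(struct inflight *f)
        {
                pthread_mutex_lock(&f->lock);
                while (f->count != 0)
                        pthread_cond_wait(&f->done, &f->lock);
                pthread_mutex_unlock(&f->lock);
        }

The kernel gets the same effect without a lock around the counter by
pairing atomic ops with the memory barriers implied by wait_event() and
wake_up().
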
    2819          11 : static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
    2820             :                                            struct btrfs_device *scrub_dev)
    2821             : {
    2822             :         int     i;
    2823             :         u64     bytenr;
    2824             :         u64     gen;
    2825             :         int     ret;
    2826          11 :         struct btrfs_root *root = sctx->dev_root;
    2827             : 
    2828          22 :         if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
    2829             :                 return -EIO;
    2830             : 
    2831          11 :         gen = root->fs_info->last_trans_committed;
    2832             : 
    2833          33 :         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
    2834             :                 bytenr = btrfs_sb_offset(i);
    2835          33 :                 if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->total_bytes)
    2836             :                         break;
    2837             : 
    2838          22 :                 ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
    2839             :                                   scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
    2840             :                                   NULL, 1, bytenr);
    2841          22 :                 if (ret)
    2842             :                         return ret;
    2843             :         }
    2844          48 :         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
    2845             : 
    2846             :         return 0;
    2847             : }
    2848             : 
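
For context on the loop above: btrfs keeps up to BTRFS_SUPER_MIRROR_MAX
copies of the super block at fixed device offsets (64KiB, 64MiB and
256GiB), and scrub_supers() skips any mirror that would extend past the
end of the device. The standalone program below reproduces the offset
computation performed by btrfs_sb_offset(); the constants are redefined
locally so the sketch is self-contained.

        #include <stdio.h>
        #include <stdint.h>

        #define SUPER_INFO_OFFSET  (64ULL * 1024) /* primary copy at 64KiB */
        #define SUPER_MIRROR_MAX   3
        #define SUPER_MIRROR_SHIFT 12             /* each mirror 4096x further */

        static uint64_t sb_offset(int mirror)
        {
                uint64_t start = 16ULL * 1024;

                if (mirror)
                        return start << (SUPER_MIRROR_SHIFT * mirror);
                return SUPER_INFO_OFFSET;
        }

        int main(void)
        {
                for (int i = 0; i < SUPER_MIRROR_MAX; i++)
                        printf("mirror %d at byte %llu\n", i,
                               (unsigned long long)sb_offset(i));
                return 0;   /* prints 65536, 67108864, 274877906944 */
        }
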
    2849             : /*
    2850             :  * take a reference on fs_info->scrub_workers and start the workers if necessary
    2851             :  */
    2852          19 : static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
    2853             :                                                 int is_dev_replace)
    2854             : {
    2855             :         int ret = 0;
    2856             :         int flags = WQ_FREEZABLE | WQ_UNBOUND;
    2857          19 :         int max_active = fs_info->thread_pool_size;
    2858             : 
    2859          19 :         if (fs_info->scrub_workers_refcnt == 0) {
    2860          16 :                 if (is_dev_replace)
    2861           8 :                         fs_info->scrub_workers =
    2862           8 :                                 btrfs_alloc_workqueue("btrfs-scrub", flags,
    2863             :                                                       1, 4);
    2864             :                 else
    2865           8 :                         fs_info->scrub_workers =
    2866           8 :                                 btrfs_alloc_workqueue("btrfs-scrub", flags,
    2867             :                                                       max_active, 4);
    2868          16 :                 if (!fs_info->scrub_workers) {
    2869             :                         ret = -ENOMEM;
    2870             :                         goto out;
    2871             :                 }
    2872          16 :                 fs_info->scrub_wr_completion_workers =
    2873          16 :                         btrfs_alloc_workqueue("btrfs-scrubwrc", flags,
    2874             :                                               max_active, 2);
    2875          16 :                 if (!fs_info->scrub_wr_completion_workers) {
    2876             :                         ret = -ENOMEM;
    2877             :                         goto out;
    2878             :                 }
    2879          16 :                 fs_info->scrub_nocow_workers =
    2880          16 :                         btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0);
    2881          16 :                 if (!fs_info->scrub_nocow_workers) {
    2882             :                         ret = -ENOMEM;
    2883             :                         goto out;
    2884             :                 }
    2885             :         }
    2886          19 :         ++fs_info->scrub_workers_refcnt;
    2887             : out:
    2888          19 :         return ret;
    2889             : }
    2890             : 
    2891          19 : static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
    2892             : {
    2893          19 :         if (--fs_info->scrub_workers_refcnt == 0) {
    2894          16 :                 btrfs_destroy_workqueue(fs_info->scrub_workers);
    2895          16 :                 btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
    2896          16 :                 btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
    2897             :         }
    2898          19 :         WARN_ON(fs_info->scrub_workers_refcnt < 0);
    2899          19 : }
    2900             : 
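
The get/put pair above implements a lazily created, refcounted worker
pool: the first caller allocates the workqueues, later callers only bump
the refcount, and the last put tears everything down. A minimal sketch
of that lifetime pattern with a hypothetical opaque pool type; as in the
kernel code, the caller is assumed to hold a lock (scrub_lock there)
around both helpers.

        struct pool;                    /* opaque worker pool */
        struct pool *pool_create(void); /* hypothetical constructor */
        void pool_destroy(struct pool *p);

        static struct pool *pool;       /* protected by the caller's lock */
        static int pool_refcnt;

        static int pool_get(void)       /* scrub_workers_get() analogue */
        {
                if (pool_refcnt == 0) {
                        pool = pool_create();
                        if (!pool)
                                return -1;      /* -ENOMEM in the kernel */
                }
                ++pool_refcnt;
                return 0;
        }

        static void pool_put(void)      /* scrub_workers_put() analogue */
        {
                if (--pool_refcnt == 0) {
                        pool_destroy(pool);
                        pool = NULL;
                }
        }
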
    2901          19 : int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
    2902             :                     u64 end, struct btrfs_scrub_progress *progress,
    2903             :                     int readonly, int is_dev_replace)
    2904             : {
    2905             :         struct scrub_ctx *sctx;
    2906             :         int ret;
    2907          19 :         struct btrfs_device *dev;
    2908             :         struct rcu_string *name;
    2909             : 
    2910          19 :         if (btrfs_fs_closing(fs_info))
    2911             :                 return -EINVAL;
    2912             : 
    2913             :         /*
    2914             :          * check some assumptions
    2915             :          */
    2916          19 :         if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) {
    2917           0 :                 btrfs_err(fs_info,
    2918             :                            "scrub: size assumption nodesize == leafsize (%d == %d) fails",
    2919             :                        fs_info->chunk_root->nodesize,
    2920             :                        fs_info->chunk_root->leafsize);
    2921           0 :                 return -EINVAL;
    2922             :         }
    2923             : 
    2924          19 :         if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) {
    2925             :                 /*
    2926             :                  * with the way scrub is implemented, it is unable to
    2927             :                  * calculate the checksum in this case. Do not handle
    2928             :                  * this situation at all because it won't ever happen.
    2929             :                  */
    2930           0 :                 btrfs_err(fs_info,
    2931             :                            "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
    2932             :                        fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN);
    2933           0 :                 return -EINVAL;
    2934             :         }
    2935             : 
    2936          19 :         if (fs_info->chunk_root->sectorsize != PAGE_SIZE) {
    2937             :                 /* not supported for data w/o checksums */
    2938           0 :                 btrfs_err(fs_info,
    2939             :                            "scrub: size assumption sectorsize != PAGE_SIZE "
    2940             :                            "(%d != %lu) fails",
    2941             :                        fs_info->chunk_root->sectorsize, PAGE_SIZE);
    2942           0 :                 return -EINVAL;
    2943             :         }
    2944             : 
    2945          19 :         if (fs_info->chunk_root->nodesize >
    2946          19 :             PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
    2947             :             fs_info->chunk_root->sectorsize >
    2948             :             PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
    2949             :                 /*
    2950             :                  * would exhaust the array bounds of pagev member in
    2951             :                  * struct scrub_block
    2952             :                  */
    2953           0 :                 btrfs_err(fs_info, "scrub: size assumption nodesize and sectorsize "
    2954             :                            "<= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails",
    2955             :                        fs_info->chunk_root->nodesize,
    2956             :                        SCRUB_MAX_PAGES_PER_BLOCK,
    2957             :                        fs_info->chunk_root->sectorsize,
    2958             :                        SCRUB_MAX_PAGES_PER_BLOCK);
    2959           0 :                 return -EINVAL;
    2960             :         }
    2961             : 
    2962             : 
    2963          19 :         mutex_lock(&fs_info->fs_devices->device_list_mutex);
    2964          19 :         dev = btrfs_find_device(fs_info, devid, NULL, NULL);
    2965          19 :         if (!dev || (dev->missing && !is_dev_replace)) {
    2966           0 :                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
    2967           0 :                 return -ENODEV;
    2968             :         }
    2969             : 
    2970          19 :         if (!is_dev_replace && !readonly && !dev->writeable) {
    2971           0 :                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
    2972             :                 rcu_read_lock();
    2973           0 :                 name = rcu_dereference(dev->name);
    2974           0 :                 btrfs_err(fs_info, "scrub: device %s is not writable",
    2975             :                           name->str);
    2976             :                 rcu_read_unlock();
    2977           0 :                 return -EROFS;
    2978             :         }
    2979             : 
    2980          19 :         mutex_lock(&fs_info->scrub_lock);
    2981          19 :         if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
    2982           0 :                 mutex_unlock(&fs_info->scrub_lock);
    2983           0 :                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
    2984           0 :                 return -EIO;
    2985             :         }
    2986             : 
    2987          19 :         btrfs_dev_replace_lock(&fs_info->dev_replace);
    2988          19 :         if (dev->scrub_device ||
    2989          11 :             (!is_dev_replace &&
    2990          11 :              btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
    2991           0 :                 btrfs_dev_replace_unlock(&fs_info->dev_replace);
    2992           0 :                 mutex_unlock(&fs_info->scrub_lock);
    2993           0 :                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
    2994           0 :                 return -EINPROGRESS;
    2995             :         }
    2996          19 :         btrfs_dev_replace_unlock(&fs_info->dev_replace);
    2997             : 
    2998          19 :         ret = scrub_workers_get(fs_info, is_dev_replace);
    2999          19 :         if (ret) {
    3000           0 :                 mutex_unlock(&fs_info->scrub_lock);
    3001           0 :                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
    3002           0 :                 return ret;
    3003             :         }
    3004             : 
    3005          19 :         sctx = scrub_setup_ctx(dev, is_dev_replace);
    3006          19 :         if (IS_ERR(sctx)) {
    3007           0 :                 mutex_unlock(&fs_info->scrub_lock);
    3008           0 :                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
    3009           0 :                 scrub_workers_put(fs_info);
    3010           0 :                 return PTR_ERR(sctx);
    3011             :         }
    3012          19 :         sctx->readonly = readonly;
    3013          19 :         dev->scrub_device = sctx;
    3014          19 :         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
    3015             : 
    3016             :         /*
    3017             :          * by checking @scrub_pause_req here, we can avoid a
    3018             :          * race between transaction commit and scrubbing.
    3019             :          */
    3020          19 :         __scrub_blocked_if_needed(fs_info);
    3021          19 :         atomic_inc(&fs_info->scrubs_running);
    3022          19 :         mutex_unlock(&fs_info->scrub_lock);
    3023             : 
    3024          19 :         if (!is_dev_replace) {
    3025             :                 /*
    3026             :                  * by holding the device list mutex, we avoid racing
    3027             :                  * with super block writes kicked off by log tree sync.
    3028             :                  */
    3029          11 :                 mutex_lock(&fs_info->fs_devices->device_list_mutex);
    3030          11 :                 ret = scrub_supers(sctx, dev);
    3031          11 :                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
    3032             :         }
    3033             : 
    3034          19 :         if (!ret)
    3035          19 :                 ret = scrub_enumerate_chunks(sctx, dev, start, end,
    3036             :                                              is_dev_replace);
    3037             : 
    3038          19 :         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
    3039             :         atomic_dec(&fs_info->scrubs_running);
    3040          19 :         wake_up(&fs_info->scrub_pause_wait);
    3041             : 
    3042          19 :         wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
    3043             : 
    3044          19 :         if (progress)
    3045          19 :                 memcpy(progress, &sctx->stat, sizeof(*progress));
    3046             : 
    3047          19 :         mutex_lock(&fs_info->scrub_lock);
    3048          19 :         dev->scrub_device = NULL;
    3049          19 :         scrub_workers_put(fs_info);
    3050          19 :         mutex_unlock(&fs_info->scrub_lock);
    3051             : 
    3052          19 :         scrub_free_ctx(sctx);
    3053             : 
    3054          19 :         return ret;
    3055             : }
    3056             : 
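
One detail worth noting in btrfs_scrub_dev() above is the lock
discipline: device_list_mutex is taken first, scrub_lock second, and
each early-error path releases whatever it holds in the reverse order of
acquisition before returning. A reduced sketch of that shape, with plain
pthread mutexes standing in for the kernel locks and the individual
checks collapsed to booleans:

        #include <pthread.h>

        static pthread_mutex_t device_list_lock = PTHREAD_MUTEX_INITIALIZER;
        static pthread_mutex_t scrub_lock       = PTHREAD_MUTEX_INITIALIZER;

        static int scrub_dev_sketch(int dev_ok, int idle)
        {
                pthread_mutex_lock(&device_list_lock);
                if (!dev_ok) {                  /* -ENODEV path */
                        pthread_mutex_unlock(&device_list_lock);
                        return -1;
                }
                pthread_mutex_lock(&scrub_lock);
                if (!idle) {                    /* -EINPROGRESS path */
                        pthread_mutex_unlock(&scrub_lock);
                        pthread_mutex_unlock(&device_list_lock);
                        return -2;
                }
                /* ... set up the scrub context, then release the locks */
                pthread_mutex_unlock(&scrub_lock);
                pthread_mutex_unlock(&device_list_lock);
                return 0;
        }
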
    3057        2098 : void btrfs_scrub_pause(struct btrfs_root *root)
    3058             : {
    3059        2098 :         struct btrfs_fs_info *fs_info = root->fs_info;
    3060             : 
    3061        2098 :         mutex_lock(&fs_info->scrub_lock);
    3062        2098 :         atomic_inc(&fs_info->scrub_pause_req);
    3063        2100 :         while (atomic_read(&fs_info->scrubs_paused) !=
    3064             :                atomic_read(&fs_info->scrubs_running)) {
    3065           2 :                 mutex_unlock(&fs_info->scrub_lock);
    3066         113 :                 wait_event(fs_info->scrub_pause_wait,
    3067             :                            atomic_read(&fs_info->scrubs_paused) ==
    3068             :                            atomic_read(&fs_info->scrubs_running));
    3069           2 :                 mutex_lock(&fs_info->scrub_lock);
    3070             :         }
    3071        2098 :         mutex_unlock(&fs_info->scrub_lock);
    3072        2098 : }
    3073             : 
    3074        2098 : void btrfs_scrub_continue(struct btrfs_root *root)
    3075             : {
    3076        2098 :         struct btrfs_fs_info *fs_info = root->fs_info;
    3077             : 
    3078        2098 :         atomic_dec(&fs_info->scrub_pause_req);
    3079        2098 :         wake_up(&fs_info->scrub_pause_wait);
    3080        2098 : }
    3081             : 
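
The pause/continue pair above, together with the pause points inside the
scrub loop, forms a rendezvous: the committer raises scrub_pause_req and
waits until every running scrub has parked itself, and each scrub parks
by bumping scrubs_paused and waiting for the request to drop. A
userspace sketch of that handshake with one mutex and one condition
variable (maintaining the running count where scrubs start and stop is
not shown):

        #include <pthread.h>

        static pthread_mutex_t lk = PTHREAD_MUTEX_INITIALIZER;
        static pthread_cond_t  cv = PTHREAD_COND_INITIALIZER;
        static int pause_req, paused, running;

        static void pause_all(void)     /* btrfs_scrub_pause() analogue */
        {
                pthread_mutex_lock(&lk);
                pause_req++;
                while (paused != running)
                        pthread_cond_wait(&cv, &lk);
                pthread_mutex_unlock(&lk);
        }

        static void continue_all(void)  /* btrfs_scrub_continue() analogue */
        {
                pthread_mutex_lock(&lk);
                pause_req--;
                pthread_cond_broadcast(&cv);
                pthread_mutex_unlock(&lk);
        }

        static void pause_point(void)   /* called from the scrub loop */
        {
                pthread_mutex_lock(&lk);
                while (pause_req) {
                        paused++;
                        pthread_cond_broadcast(&cv);    /* wake committer */
                        while (pause_req)
                                pthread_cond_wait(&cv, &lk);
                        paused--;
                }
                pthread_mutex_unlock(&lk);
        }
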
    3082         223 : int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
    3083             : {
    3084         223 :         mutex_lock(&fs_info->scrub_lock);
    3085         223 :         if (!atomic_read(&fs_info->scrubs_running)) {
    3086         222 :                 mutex_unlock(&fs_info->scrub_lock);
    3087         222 :                 return -ENOTCONN;
    3088             :         }
    3089             : 
    3090           1 :         atomic_inc(&fs_info->scrub_cancel_req);
    3091           2 :         while (atomic_read(&fs_info->scrubs_running)) {
    3092           1 :                 mutex_unlock(&fs_info->scrub_lock);
    3093           5 :                 wait_event(fs_info->scrub_pause_wait,
    3094             :                            atomic_read(&fs_info->scrubs_running) == 0);
    3095           1 :                 mutex_lock(&fs_info->scrub_lock);
    3096             :         }
    3097             :         atomic_dec(&fs_info->scrub_cancel_req);
    3098           1 :         mutex_unlock(&fs_info->scrub_lock);
    3099             : 
    3100           1 :         return 0;
    3101             : }
    3102             : 
    3103           0 : int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
    3104             :                            struct btrfs_device *dev)
    3105             : {
    3106             :         struct scrub_ctx *sctx;
    3107             : 
    3108           0 :         mutex_lock(&fs_info->scrub_lock);
    3109           0 :         sctx = dev->scrub_device;
    3110           0 :         if (!sctx) {
    3111           0 :                 mutex_unlock(&fs_info->scrub_lock);
    3112           0 :                 return -ENOTCONN;
    3113             :         }
    3114           0 :         atomic_inc(&sctx->cancel_req);
    3115           0 :         while (dev->scrub_device) {
    3116           0 :                 mutex_unlock(&fs_info->scrub_lock);
    3117           0 :                 wait_event(fs_info->scrub_pause_wait,
    3118             :                            dev->scrub_device == NULL);
    3119           0 :                 mutex_lock(&fs_info->scrub_lock);
    3120             :         }
    3121           0 :         mutex_unlock(&fs_info->scrub_lock);
    3122             : 
    3123           0 :         return 0;
    3124             : }
    3125             : 
    3126          12 : int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
    3127             :                          struct btrfs_scrub_progress *progress)
    3128             : {
    3129             :         struct btrfs_device *dev;
    3130             :         struct scrub_ctx *sctx = NULL;
    3131             : 
    3132          12 :         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
    3133          12 :         dev = btrfs_find_device(root->fs_info, devid, NULL, NULL);
    3134          12 :         if (dev)
    3135          12 :                 sctx = dev->scrub_device;
    3136          12 :         if (sctx)
    3137           3 :                 memcpy(progress, &sctx->stat, sizeof(*progress));
    3138          12 :         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
    3139             : 
    3140          12 :         return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
    3141             : }
    3142             : 
    3143      102237 : static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
    3144             :                                u64 extent_logical, u64 extent_len,
    3145             :                                u64 *extent_physical,
    3146             :                                struct btrfs_device **extent_dev,
    3147             :                                int *extent_mirror_num)
    3148             : {
    3149             :         u64 mapped_length;
    3150      102237 :         struct btrfs_bio *bbio = NULL;
    3151             :         int ret;
    3152             : 
    3153      102237 :         mapped_length = extent_len;
    3154      102237 :         ret = btrfs_map_block(fs_info, READ, extent_logical,
    3155             :                               &mapped_length, &bbio, 0);
    3156      204474 :         if (ret || !bbio || mapped_length < extent_len ||
    3157      102237 :             !bbio->stripes[0].dev->bdev) {
    3158           0 :                 kfree(bbio);
    3159      102237 :                 return;
    3160             :         }
    3161             : 
    3162      102237 :         *extent_physical = bbio->stripes[0].physical;
    3163      102237 :         *extent_mirror_num = bbio->mirror_num;
    3164      102237 :         *extent_dev = bbio->stripes[0].dev;
    3165      102237 :         kfree(bbio);
    3166             : }
    3167             : 
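
scrub_remap_extent() above asks the chunk layer to map a logical range
and then trusts only the first stripe of the answer, bailing out (and
leaving the outputs untouched) if the mapping fails or comes back
shorter than requested. A sketch of that "map, validate, take stripe 0"
shape with made-up types and a hypothetical mapper:

        struct stripe {
                unsigned long long physical;
                void *dev;
        };

        struct mapping {
                int nr_stripes;
                int mirror_num;
                struct stripe stripes[4];
        };

        /* hypothetical: fills *m, may shorten *len; returns 0 on success */
        int map_block(unsigned long long logical, unsigned long long *len,
                      struct mapping *m);

        static int remap_first_stripe(unsigned long long logical,
                                      unsigned long long len,
                                      unsigned long long *physical,
                                      void **dev, int *mirror)
        {
                struct mapping m;
                unsigned long long mapped = len;

                if (map_block(logical, &mapped, &m) || mapped < len ||
                    m.nr_stripes < 1 || !m.stripes[0].dev)
                        return -1;      /* outputs left untouched, as above */

                *physical = m.stripes[0].physical;
                *dev = m.stripes[0].dev;
                *mirror = m.mirror_num;
                return 0;
        }
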
    3168          19 : static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
    3169             :                               struct scrub_wr_ctx *wr_ctx,
    3170             :                               struct btrfs_fs_info *fs_info,
    3171             :                               struct btrfs_device *dev,
    3172             :                               int is_dev_replace)
    3173             : {
    3174          19 :         WARN_ON(wr_ctx->wr_curr_bio != NULL);
    3175             : 
    3176          19 :         mutex_init(&wr_ctx->wr_lock);
    3177          19 :         wr_ctx->wr_curr_bio = NULL;
    3178          19 :         if (!is_dev_replace)
    3179             :                 return 0;
    3180             : 
    3181           8 :         WARN_ON(!dev->bdev);
    3182           8 :         wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO,
    3183             :                                          bio_get_nr_vecs(dev->bdev));
    3184           8 :         wr_ctx->tgtdev = dev;
    3185             :         atomic_set(&wr_ctx->flush_all_writes, 0);
    3186             :         return 0;
    3187             : }
    3188             : 
    3189          19 : static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx)
    3190             : {
    3191          19 :         mutex_lock(&wr_ctx->wr_lock);
    3192          19 :         kfree(wr_ctx->wr_curr_bio);
    3193          19 :         wr_ctx->wr_curr_bio = NULL;
    3194          19 :         mutex_unlock(&wr_ctx->wr_lock);
    3195          19 : }
    3196             : 
    3197        1104 : static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
    3198             :                             int mirror_num, u64 physical_for_dev_replace)
    3199             : {
    3200             :         struct scrub_copy_nocow_ctx *nocow_ctx;
    3201        1104 :         struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
    3202             : 
    3203        1104 :         nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS);
    3204        1104 :         if (!nocow_ctx) {
    3205             :                 spin_lock(&sctx->stat_lock);
    3206           0 :                 sctx->stat.malloc_errors++;
    3207             :                 spin_unlock(&sctx->stat_lock);
    3208           0 :                 return -ENOMEM;
    3209             :         }
    3210             : 
    3211        1104 :         scrub_pending_trans_workers_inc(sctx);
    3212             : 
    3213        1104 :         nocow_ctx->sctx = sctx;
    3214        1104 :         nocow_ctx->logical = logical;
    3215        1104 :         nocow_ctx->len = len;
    3216        1104 :         nocow_ctx->mirror_num = mirror_num;
    3217        1104 :         nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
    3218        1104 :         btrfs_init_work(&nocow_ctx->work, btrfs_scrubnc_helper,
    3219             :                         copy_nocow_pages_worker, NULL, NULL);
    3220        1104 :         INIT_LIST_HEAD(&nocow_ctx->inodes);
    3221        1104 :         btrfs_queue_work(fs_info->scrub_nocow_workers,
    3222             :                          &nocow_ctx->work);
    3223             : 
    3224        1104 :         return 0;
    3225             : }
    3226             : 
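
copy_nocow_pages() above is a classic deferred-work pattern: the
arguments are packaged into a heap-allocated context with an embedded
work item, and the worker recovers the context with container_of() (as
copy_nocow_pages_worker() does below). A self-contained sketch of that
pattern; struct work and queue_work() are hypothetical stand-ins for an
async executor, not kernel API.

        #include <stddef.h>
        #include <stdlib.h>

        #define container_of(ptr, type, member) \
                ((type *)((char *)(ptr) - offsetof(type, member)))

        struct work {
                void (*fn)(struct work *);
        };

        void queue_work(struct work *w);        /* hypothetical executor */

        struct nocow_ctx {
                struct work work;               /* embedded work item */
                unsigned long long logical, len, physical;
                int mirror_num;
        };

        static void nocow_worker(struct work *w)
        {
                struct nocow_ctx *ctx =
                        container_of(w, struct nocow_ctx, work);

                /* ... copy ctx->len bytes at ctx->logical ... */
                free(ctx);
        }

        static int copy_nocow_sketch(unsigned long long logical,
                                     unsigned long long len, int mirror,
                                     unsigned long long physical)
        {
                struct nocow_ctx *ctx = calloc(1, sizeof(*ctx));

                if (!ctx)
                        return -1;              /* -ENOMEM in the kernel */
                ctx->work.fn = nocow_worker;
                ctx->logical = logical;
                ctx->len = len;
                ctx->mirror_num = mirror;
                ctx->physical = physical;
                queue_work(&ctx->work);
                return 0;
        }
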
    3227        1011 : static int record_inode_for_nocow(u64 inum, u64 offset, u64 root, void *ctx)
    3228             : {
    3229             :         struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
    3230             :         struct scrub_nocow_inode *nocow_inode;
    3231             : 
    3232        1011 :         nocow_inode = kzalloc(sizeof(*nocow_inode), GFP_NOFS);
    3233        1011 :         if (!nocow_inode)
    3234             :                 return -ENOMEM;
    3235        1011 :         nocow_inode->inum = inum;
    3236        1011 :         nocow_inode->offset = offset;
    3237        1011 :         nocow_inode->root = root;
    3238        1011 :         list_add_tail(&nocow_inode->list, &nocow_ctx->inodes);
    3239        1011 :         return 0;
    3240             : }
    3241             : 
    3242             : #define COPY_COMPLETE 1
    3243             : 
    3244        1104 : static void copy_nocow_pages_worker(struct btrfs_work *work)
    3245             : {
    3246        1104 :         struct scrub_copy_nocow_ctx *nocow_ctx =
    3247             :                 container_of(work, struct scrub_copy_nocow_ctx, work);
    3248        1104 :         struct scrub_ctx *sctx = nocow_ctx->sctx;
    3249        1104 :         u64 logical = nocow_ctx->logical;
    3250        1104 :         u64 len = nocow_ctx->len;
    3251        1104 :         int mirror_num = nocow_ctx->mirror_num;
    3252        1104 :         u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
    3253             :         int ret;
    3254             :         struct btrfs_trans_handle *trans = NULL;
    3255             :         struct btrfs_fs_info *fs_info;
    3256             :         struct btrfs_path *path;
    3257             :         struct btrfs_root *root;
    3258             :         int not_written = 0;
    3259             : 
    3260        1104 :         fs_info = sctx->dev_root->fs_info;
    3261        1104 :         root = fs_info->extent_root;
    3262             : 
    3263        1104 :         path = btrfs_alloc_path();
    3264        1104 :         if (!path) {
    3265             :                 spin_lock(&sctx->stat_lock);
    3266           0 :                 sctx->stat.malloc_errors++;
    3267             :                 spin_unlock(&sctx->stat_lock);
    3268             :                 not_written = 1;
    3269           0 :                 goto out;
    3270             :         }
    3271             : 
    3272        1104 :         trans = btrfs_join_transaction(root);
    3273        1104 :         if (IS_ERR(trans)) {
    3274             :                 not_written = 1;
    3275             :                 goto out;
    3276             :         }
    3277             : 
    3278        1104 :         ret = iterate_inodes_from_logical(logical, fs_info, path,
    3279             :                                           record_inode_for_nocow, nocow_ctx);
    3280        1104 :         if (ret != 0 && ret != -ENOENT) {
    3281           0 :                 btrfs_warn(fs_info, "iterate_inodes_from_logical() failed: log %llu, "
    3282             :                         "phys %llu, len %llu, mir %u, ret %d",
    3283             :                         logical, physical_for_dev_replace, len, mirror_num,
    3284             :                         ret);
    3285             :                 not_written = 1;
    3286           0 :                 goto out;
    3287             :         }
    3288             : 
    3289        1104 :         btrfs_end_transaction(trans, root);
    3290             :         trans = NULL;
    3291        3312 :         while (!list_empty(&nocow_ctx->inodes)) {
    3292             :                 struct scrub_nocow_inode *entry;
    3293        1011 :                 entry = list_first_entry(&nocow_ctx->inodes,
    3294             :                                          struct scrub_nocow_inode,
    3295             :                                          list);
    3296        1011 :                 list_del_init(&entry->list);
    3297        1011 :                 ret = copy_nocow_pages_for_inode(entry->inum, entry->offset,
    3298             :                                                  entry->root, nocow_ctx);
    3299        1011 :                 kfree(entry);
    3300        1011 :                 if (ret == COPY_COMPLETE) {
    3301             :                         ret = 0;
    3302             :                         break;
    3303           0 :                 } else if (ret) {
    3304             :                         break;
    3305             :                 }
    3306             :         }
    3307             : out:
    3308        2208 :         while (!list_empty(&nocow_ctx->inodes)) {
    3309             :                 struct scrub_nocow_inode *entry;
    3310           0 :                 entry = list_first_entry(&nocow_ctx->inodes,
    3311             :                                          struct scrub_nocow_inode,
    3312             :                                          list);
    3313           0 :                 list_del_init(&entry->list);
    3314           0 :                 kfree(entry);
    3315             :         }
    3316        1104 :         if (trans && !IS_ERR(trans))
    3317           0 :                 btrfs_end_transaction(trans, root);
    3318        1104 :         if (not_written)
    3319           0 :                 btrfs_dev_replace_stats_inc(&fs_info->dev_replace.
    3320             :                                             num_uncorrectable_read_errors);
    3321             : 
    3322        1104 :         btrfs_free_path(path);
    3323        1104 :         kfree(nocow_ctx);
    3324             : 
    3325        1104 :         scrub_pending_trans_workers_dec(sctx);
    3326        1104 : }
    3327             : 
    3328        1011 : static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
    3329             :                                       struct scrub_copy_nocow_ctx *nocow_ctx)
    3330             : {
    3331        1011 :         struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
    3332             :         struct btrfs_key key;
    3333             :         struct inode *inode;
    3334             :         struct page *page;
    3335             :         struct btrfs_root *local_root;
    3336             :         struct btrfs_ordered_extent *ordered;
    3337             :         struct extent_map *em;
    3338        1011 :         struct extent_state *cached_state = NULL;
    3339             :         struct extent_io_tree *io_tree;
    3340             :         u64 physical_for_dev_replace;
    3341        1011 :         u64 len = nocow_ctx->len;
    3342        1011 :         u64 lockstart = offset, lockend = offset + len - 1;
    3343             :         unsigned long index;
    3344             :         int srcu_index;
    3345             :         int ret = 0;
    3346             :         int err = 0;
    3347             : 
    3348        1011 :         key.objectid = root;
    3349        1011 :         key.type = BTRFS_ROOT_ITEM_KEY;
    3350        1011 :         key.offset = (u64)-1;
    3351             : 
    3352        1011 :         srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
    3353             : 
    3354             :         local_root = btrfs_read_fs_root_no_name(fs_info, &key);
    3355        1011 :         if (IS_ERR(local_root)) {
    3356             :                 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
    3357           0 :                 return PTR_ERR(local_root);
    3358             :         }
    3359             : 
    3360        1011 :         key.type = BTRFS_INODE_ITEM_KEY;
    3361        1011 :         key.objectid = inum;
    3362        1011 :         key.offset = 0;
    3363        1011 :         inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
    3364             :         srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
    3365        1011 :         if (IS_ERR(inode))
    3366           0 :                 return PTR_ERR(inode);
    3367             : 
    3368             :         /* Avoid truncate/dio/punch hole... */
    3369        1011 :         mutex_lock(&inode->i_mutex);
    3370        1011 :         inode_dio_wait(inode);
    3371             : 
    3372        1011 :         physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
    3373        1011 :         io_tree = &BTRFS_I(inode)->io_tree;
    3374             : 
    3375        1011 :         lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state);
    3376        1011 :         ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
    3377        1011 :         if (ordered) {
    3378           0 :                 btrfs_put_ordered_extent(ordered);
    3379           0 :                 goto out_unlock;
    3380             :         }
    3381             : 
    3382        1011 :         em = btrfs_get_extent(inode, NULL, 0, lockstart, len, 0);
    3383        1011 :         if (IS_ERR(em)) {
    3384           0 :                 ret = PTR_ERR(em);
    3385           0 :                 goto out_unlock;
    3386             :         }
    3387             : 
    3388             :         /*
    3389             :          * This extent does not actually cover the logical extent anymore;
    3390             :          * move on to the next inode.
    3391             :          */
    3392        2022 :         if (em->block_start > nocow_ctx->logical ||
    3393        1011 :             em->block_start + em->block_len < nocow_ctx->logical + len) {
    3394           0 :                 free_extent_map(em);
    3395           0 :                 goto out_unlock;
    3396             :         }
    3397        1011 :         free_extent_map(em);
    3398             : 
    3399        3033 :         while (len >= PAGE_CACHE_SIZE) {
    3400        1011 :                 index = offset >> PAGE_CACHE_SHIFT;
    3401             : again:
    3402        1011 :                 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
    3403        1011 :                 if (!page) {
    3404           0 :                         btrfs_err(fs_info, "find_or_create_page() failed");
    3405             :                         ret = -ENOMEM;
    3406           0 :                         goto out;
    3407             :                 }
    3408             : 
    3409        1011 :                 if (PageUptodate(page)) {
    3410         947 :                         if (PageDirty(page))
    3411             :                                 goto next_page;
    3412             :                 } else {
    3413             :                         ClearPageError(page);
    3414          64 :                         err = extent_read_full_page_nolock(io_tree, page,
    3415             :                                                            btrfs_get_extent,
    3416             :                                                            nocow_ctx->mirror_num);
    3417          64 :                         if (err) {
    3418             :                                 ret = err;
    3419             :                                 goto next_page;
    3420             :                         }
    3421             : 
    3422          64 :                         lock_page(page);
    3423             :                         /*
    3424             :                          * If the page has been removed from the page cache,
    3425             :                          * the data on it is meaningless, because it may be
    3426             :                          * stale: the new data may have been written into a
    3427             :                          * new page in the page cache.
    3428             :                          */
    3429          64 :                         if (page->mapping != inode->i_mapping) {
    3430           0 :                                 unlock_page(page);
    3431           0 :                                 page_cache_release(page);
    3432           0 :                                 goto again;
    3433             :                         }
    3434          64 :                         if (!PageUptodate(page)) {
    3435             :                                 ret = -EIO;
    3436             :                                 goto next_page;
    3437             :                         }
    3438             :                 }
    3439        1011 :                 err = write_page_nocow(nocow_ctx->sctx,
    3440             :                                        physical_for_dev_replace, page);
    3441        1011 :                 if (err)
    3442             :                         ret = err;
    3443             : next_page:
    3444        1011 :                 unlock_page(page);
    3445        1011 :                 page_cache_release(page);
    3446             : 
    3447        1011 :                 if (ret)
    3448             :                         break;
    3449             : 
    3450        1011 :                 offset += PAGE_CACHE_SIZE;
    3451        1011 :                 physical_for_dev_replace += PAGE_CACHE_SIZE;
    3452        1011 :                 len -= PAGE_CACHE_SIZE;
    3453             :         }
    3454             :         ret = COPY_COMPLETE;
    3455             : out_unlock:
    3456        1011 :         unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
    3457             :                              GFP_NOFS);
    3458             : out:
    3459        1011 :         mutex_unlock(&inode->i_mutex);
    3460        1011 :         iput(inode);
    3461        1011 :         return ret;
    3462             : }
    3463             : 
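
The extent-map check in the function above rejects inodes whose current
extent no longer covers the logical range being copied. Written as a
single containment test, the condition it implements is:

        /* does [em_start, em_start + em_len) contain [logical, logical + len)? */
        static int covers(unsigned long long em_start,
                          unsigned long long em_len,
                          unsigned long long logical,
                          unsigned long long len)
        {
                return em_start <= logical &&
                       logical + len <= em_start + em_len;
        }
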
    3464        1011 : static int write_page_nocow(struct scrub_ctx *sctx,
    3465             :                             u64 physical_for_dev_replace, struct page *page)
    3466             : {
    3467             :         struct bio *bio;
    3468             :         struct btrfs_device *dev;
    3469             :         int ret;
    3470             : 
    3471        1011 :         dev = sctx->wr_ctx.tgtdev;
    3472        1011 :         if (!dev)
    3473             :                 return -EIO;
    3474        1011 :         if (!dev->bdev) {
    3475           0 :                 printk_ratelimited(KERN_WARNING
    3476             :                         "BTRFS: scrub write_page_nocow(bdev == NULL) is unexpected!\n");
    3477             :                 return -EIO;
    3478             :         }
    3479        1011 :         bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
    3480        1011 :         if (!bio) {
    3481             :                 spin_lock(&sctx->stat_lock);
    3482           0 :                 sctx->stat.malloc_errors++;
    3483             :                 spin_unlock(&sctx->stat_lock);
    3484           0 :                 return -ENOMEM;
    3485             :         }
    3486        1011 :         bio->bi_iter.bi_size = 0;
    3487        1011 :         bio->bi_iter.bi_sector = physical_for_dev_replace >> 9;
    3488        1011 :         bio->bi_bdev = dev->bdev;
    3489        1011 :         ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
    3490        1011 :         if (ret != PAGE_CACHE_SIZE) {
    3491             : leave_with_eio:
    3492           0 :                 bio_put(bio);
    3493           0 :                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
    3494           0 :                 return -EIO;
    3495             :         }
    3496             : 
    3497        1011 :         if (btrfsic_submit_bio_wait(WRITE_SYNC, bio))
    3498             :                 goto leave_with_eio;
    3499             : 
    3500        1011 :         bio_put(bio);
    3501        1011 :         return 0;
    3502             : }
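
write_page_nocow() above builds a one-page bio and submits it
synchronously at a raw physical offset on the replacement device. As a
userspace analogue, the same operation on an open block-device file
descriptor is a single pwrite(2); PAGE_SZ and the function name are
illustrative.

        #include <unistd.h>

        #define PAGE_SZ 4096

        static int write_page_at(int fd, unsigned long long physical,
                                 const void *page)
        {
                ssize_t n = pwrite(fd, page, PAGE_SZ, (off_t)physical);

                return n == (ssize_t)PAGE_SZ ? 0 : -1; /* -EIO analogue */
        }
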

Generated by: LCOV version 1.10