LCOV - code coverage report
Current view: top level - fs/btrfs - raid56.c (source / functions) Hit Total Coverage
Test: btrfstest.info Lines: 380 581 65.4 %
Date: 2014-11-28 Functions: 35 44 79.5 %

          Line data    Source code
       1             : /*
       2             :  * Copyright (C) 2012 Fusion-io  All rights reserved.
       3             :  * Copyright (C) 2012 Intel Corp. All rights reserved.
       4             :  *
       5             :  * This program is free software; you can redistribute it and/or
       6             :  * modify it under the terms of the GNU General Public
       7             :  * License v2 as published by the Free Software Foundation.
       8             :  *
       9             :  * This program is distributed in the hope that it will be useful,
      10             :  * but WITHOUT ANY WARRANTY; without even the implied warranty of
      11             :  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      12             :  * General Public License for more details.
      13             :  *
      14             :  * You should have received a copy of the GNU General Public
      15             :  * License along with this program; if not, write to the
      16             :  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
      17             :  * Boston, MA 021110-1307, USA.
      18             :  */
      19             : #include <linux/sched.h>
      20             : #include <linux/wait.h>
      21             : #include <linux/bio.h>
      22             : #include <linux/slab.h>
      23             : #include <linux/buffer_head.h>
      24             : #include <linux/blkdev.h>
      25             : #include <linux/random.h>
      26             : #include <linux/iocontext.h>
      27             : #include <linux/capability.h>
      28             : #include <linux/ratelimit.h>
      29             : #include <linux/kthread.h>
      30             : #include <linux/raid/pq.h>
      31             : #include <linux/hash.h>
      32             : #include <linux/list_sort.h>
      33             : #include <linux/raid/xor.h>
      34             : #include <linux/vmalloc.h>
      35             : #include <asm/div64.h>
      36             : #include "ctree.h"
      37             : #include "extent_map.h"
      38             : #include "disk-io.h"
      39             : #include "transaction.h"
      40             : #include "print-tree.h"
      41             : #include "volumes.h"
      42             : #include "raid56.h"
      43             : #include "async-thread.h"
      44             : #include "check-integrity.h"
      45             : #include "rcu-string.h"
      46             : 
      47             : /* set when additional merges to this rbio are not allowed */
      48             : #define RBIO_RMW_LOCKED_BIT     1
      49             : 
      50             : /*
      51             :  * set when this rbio is sitting in the hash, but it is just a cache
      52             :  * of past RMW
      53             :  */
      54             : #define RBIO_CACHE_BIT          2
      55             : 
      56             : /*
      57             :  * set when it is safe to trust the stripe_pages for caching
      58             :  */
      59             : #define RBIO_CACHE_READY_BIT    3
      60             : 
      61             : 
      62             : #define RBIO_CACHE_SIZE 1024
      63             : 
      64             : struct btrfs_raid_bio {
      65             :         struct btrfs_fs_info *fs_info;
      66             :         struct btrfs_bio *bbio;
      67             : 
      68             :         /*
      69             :          * logical block numbers for the start of each stripe
      70             :          * The last one or two are p/q.  These are sorted,
      71             :          * so raid_map[0] is the start of our full stripe
      72             :          */
      73             :         u64 *raid_map;
      74             : 
      75             :         /* while we're doing rmw on a stripe
      76             :          * we put it into a hash table so we can
      77             :          * lock the stripe and merge more rbios
      78             :          * into it.
      79             :          */
      80             :         struct list_head hash_list;
      81             : 
      82             :         /*
      83             :          * LRU list for the stripe cache
      84             :          */
      85             :         struct list_head stripe_cache;
      86             : 
      87             :         /*
      88             :          * for scheduling work in the helper threads
      89             :          */
      90             :         struct btrfs_work work;
      91             : 
      92             :         /*
      93             :          * bio list and bio_list_lock are used
      94             :          * to add more bios into the stripe
      95             :          * in hopes of avoiding the full rmw
      96             :          */
      97             :         struct bio_list bio_list;
      98             :         spinlock_t bio_list_lock;
      99             : 
     100             :         /* also protected by the bio_list_lock, the
     101             :          * plug list is used by the plugging code
     102             :          * to collect partial bios while plugged.  The
     103             :          * stripe locking code also uses it to hand off
     104             :          * the stripe lock to the next pending IO
     105             :          */
     106             :         struct list_head plug_list;
     107             : 
     108             :         /*
     109             :          * flags that tell us if it is safe to
     110             :          * merge with this bio
     111             :          */
     112             :         unsigned long flags;
     113             : 
     114             :         /* size of each individual stripe on disk */
     115             :         int stripe_len;
     116             : 
     117             :         /* number of data stripes (no p/q) */
     118             :         int nr_data;
     119             : 
     120             :         /*
     121             :          * set if we're doing a parity rebuild
     122             :          * for a read from higher up, which is handled
     123             :          * differently from a parity rebuild as part of
     124             :          * rmw
     125             :          */
     126             :         int read_rebuild;
     127             : 
     128             :         /* first bad stripe */
     129             :         int faila;
     130             : 
     131             :         /* second bad stripe (for raid6 use) */
     132             :         int failb;
     133             : 
     134             :         /*
     135             :          * number of pages needed to represent the full
     136             :          * stripe
     137             :          */
     138             :         int nr_pages;
     139             : 
     140             :         /*
     141             :          * size of all the bios in the bio_list.  This
     142             :          * helps us decide if the rbio maps to a full
     143             :          * stripe or not
     144             :          */
     145             :         int bio_list_bytes;
     146             : 
     147             :         atomic_t refs;
     148             : 
     149             :         /*
     150             :          * these are two arrays of pointers.  We allocate the
     151             :          * rbio big enough to hold them both and setup their
     152             :          * locations when the rbio is allocated
     153             :          */
     154             : 
     155             :         /* pointers to pages that we allocated for
     156             :          * reading/writing stripes directly from the disk (including P/Q)
     157             :          */
     158             :         struct page **stripe_pages;
     159             : 
     160             :         /*
     161             :          * pointers to the pages in the bio_list.  Stored
     162             :          * here for faster lookup
     163             :          */
     164             :         struct page **bio_pages;
     165             : };
     166             : 
     167             : static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
     168             : static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
     169             : static void rmw_work(struct btrfs_work *work);
     170             : static void read_rebuild_work(struct btrfs_work *work);
     171             : static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
     172             : static void async_read_rebuild(struct btrfs_raid_bio *rbio);
     173             : static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
     174             : static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
     175             : static void __free_raid_bio(struct btrfs_raid_bio *rbio);
     176             : static void index_rbio_pages(struct btrfs_raid_bio *rbio);
     177             : static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
     178             : 
     179             : /*
     180             :  * the stripe hash table is used for locking, and to collect
     181             :  * bios in hopes of making a full stripe
     182             :  */
     183         221 : int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
     184             : {
     185             :         struct btrfs_stripe_hash_table *table;
     186             :         struct btrfs_stripe_hash_table *x;
     187             :         struct btrfs_stripe_hash *cur;
     188             :         struct btrfs_stripe_hash *h;
     189             :         int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
     190             :         int i;
     191             :         int table_size;
     192             : 
     193         221 :         if (info->stripe_hash_table)
     194             :                 return 0;
     195             : 
     196             :         /*
     197             :          * The table is large, starting with order 4 and can go as high as
     198             :          * order 7 in case lock debugging is turned on.
     199             :          *
     200             :          * Try harder to allocate and fallback to vmalloc to lower the chance
     201             :          * of a failing mount.
     202             :          */
     203             :         table_size = sizeof(*table) + sizeof(*h) * num_entries;
     204         221 :         table = kzalloc(table_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
     205         221 :         if (!table) {
     206           0 :                 table = vzalloc(table_size);
     207           0 :                 if (!table)
     208             :                         return -ENOMEM;
     209             :         }
     210             : 
     211         221 :         spin_lock_init(&table->cache_lock);
     212         221 :         INIT_LIST_HEAD(&table->stripe_cache);
     213             : 
     214         221 :         h = table->table;
     215             : 
     216      452829 :         for (i = 0; i < num_entries; i++) {
     217      452608 :                 cur = h + i;
     218      452608 :                 INIT_LIST_HEAD(&cur->hash_list);
     219      452608 :                 spin_lock_init(&cur->lock);
     220      452608 :                 init_waitqueue_head(&cur->wait);
     221             :         }
     222             : 
     223         221 :         x = cmpxchg(&info->stripe_hash_table, NULL, table);
     224         221 :         if (x) {
     225           0 :                 if (is_vmalloc_addr(x))
     226           0 :                         vfree(x);
     227             :                 else
     228           0 :                         kfree(x);
     229             :         }
     230             :         return 0;
     231             : }
     232             : 
     233             : /*
     234             :  * caching an rbio means to copy anything from the
     235             :  * bio_pages array into the stripe_pages array.  We
     236             :  * use the page uptodate bit in the stripe cache array
     237             :  * to indicate if it has valid data
     238             :  *
     239             :  * once the caching is done, we set the cache ready
     240             :  * bit.
     241             :  */
     242           6 : static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
     243             : {
     244             :         int i;
     245             :         char *s;
     246             :         char *d;
     247             :         int ret;
     248             : 
     249           6 :         ret = alloc_rbio_pages(rbio);
     250           6 :         if (ret)
     251           6 :                 return;
     252             : 
     253         384 :         for (i = 0; i < rbio->nr_pages; i++) {
     254         384 :                 if (!rbio->bio_pages[i])
     255         328 :                         continue;
     256             : 
     257             :                 s = kmap(rbio->bio_pages[i]);
     258          56 :                 d = kmap(rbio->stripe_pages[i]);
     259             : 
     260          56 :                 memcpy(d, s, PAGE_CACHE_SIZE);
     261             : 
     262             :                 kunmap(rbio->bio_pages[i]);
     263             :                 kunmap(rbio->stripe_pages[i]);
     264          56 :                 SetPageUptodate(rbio->stripe_pages[i]);
     265             :         }
     266             :         set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
     267             : }
     268             : 
     269             : /*
     270             :  * we hash on the first logical address of the stripe
     271             :  */
     272             : static int rbio_bucket(struct btrfs_raid_bio *rbio)
     273             : {
     274          82 :         u64 num = rbio->raid_map[0];
     275             : 
     276             :         /*
     277             :          * we shift down quite a bit.  We're using byte
     278             :          * addressing, and most of the lower bits are zeros.
     279             :          * This tends to upset hash_64, and it consistently
     280             :          * returns just one or two different values.
     281             :          *
     282             :          * shifting off the lower bits fixes things.
     283             :          */
     284         164 :         return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
     285             : }
     286             : 
     287             : /*
     288             :  * stealing an rbio means taking all the uptodate pages from the stripe
     289             :  * array in the source rbio and putting them into the destination rbio
     290             :  */
     291           2 : static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
     292             : {
     293             :         int i;
     294             :         struct page *s;
     295             :         struct page *d;
     296             : 
     297           2 :         if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
     298           2 :                 return;
     299             : 
     300         128 :         for (i = 0; i < dest->nr_pages; i++) {
     301         128 :                 s = src->stripe_pages[i];
     302         256 :                 if (!s || !PageUptodate(s)) {
     303           0 :                         continue;
     304             :                 }
     305             : 
     306         128 :                 d = dest->stripe_pages[i];
     307         128 :                 if (d)
     308           0 :                         __free_page(d);
     309             : 
     310         128 :                 dest->stripe_pages[i] = s;
     311         128 :                 src->stripe_pages[i] = NULL;
     312             :         }
     313             : }
     314             : 
     315             : /*
     316             :  * merging means we take the bio_list from the victim and
     317             :  * splice it into the destination.  The victim should
     318             :  * be discarded afterwards.
     319             :  *
     320             :  * must be called with dest->rbio_list_lock held
     321             :  */
     322             : static void merge_rbio(struct btrfs_raid_bio *dest,
     323             :                        struct btrfs_raid_bio *victim)
     324             : {
     325             :         bio_list_merge(&dest->bio_list, &victim->bio_list);
     326           0 :         dest->bio_list_bytes += victim->bio_list_bytes;
     327             :         bio_list_init(&victim->bio_list);
     328             : }
     329             : 
     330             : /*
     331             :  * used to prune items that are in the cache.  The caller
     332             :  * must hold the hash table lock.
     333             :  */
     334           6 : static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
     335             : {
     336             :         int bucket = rbio_bucket(rbio);
     337             :         struct btrfs_stripe_hash_table *table;
     338             :         struct btrfs_stripe_hash *h;
     339             :         int freeit = 0;
     340             : 
     341             :         /*
     342             :          * check the bit again under the hash table lock.
     343             :          */
     344           6 :         if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
     345           6 :                 return;
     346             : 
     347           6 :         table = rbio->fs_info->stripe_hash_table;
     348           6 :         h = table->table + bucket;
     349             : 
     350             :         /* hold the lock for the bucket because we may be
     351             :          * removing it from the hash table
     352             :          */
     353             :         spin_lock(&h->lock);
     354             : 
     355             :         /*
     356             :          * hold the lock for the bio list because we need
     357             :          * to make sure the bio list is empty
     358             :          */
     359             :         spin_lock(&rbio->bio_list_lock);
     360             : 
     361          12 :         if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
     362           6 :                 list_del_init(&rbio->stripe_cache);
     363           6 :                 table->cache_size -= 1;
     364             :                 freeit = 1;
     365             : 
     366             :                 /* if the bio list isn't empty, this rbio is
     367             :                  * still involved in an IO.  We take it out
     368             :                  * of the cache list, and drop the ref that
     369             :                  * was held for the list.
     370             :                  *
     371             :                  * If the bio_list was empty, we also remove
     372             :                  * the rbio from the hash_table, and drop
     373             :                  * the corresponding ref
     374             :                  */
     375           6 :                 if (bio_list_empty(&rbio->bio_list)) {
     376          12 :                         if (!list_empty(&rbio->hash_list)) {
     377             :                                 list_del_init(&rbio->hash_list);
     378           4 :                                 atomic_dec(&rbio->refs);
     379           8 :                                 BUG_ON(!list_empty(&rbio->plug_list));
     380             :                         }
     381             :                 }
     382             :         }
     383             : 
     384             :         spin_unlock(&rbio->bio_list_lock);
     385             :         spin_unlock(&h->lock);
     386             : 
     387           6 :         if (freeit)
     388           6 :                 __free_raid_bio(rbio);
     389             : }
     390             : 
     391             : /*
     392             :  * prune a given rbio from the cache
     393             :  */
     394          34 : static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
     395             : {
     396             :         struct btrfs_stripe_hash_table *table;
     397             :         unsigned long flags;
     398             : 
     399          34 :         if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
     400          34 :                 return;
     401             : 
     402           2 :         table = rbio->fs_info->stripe_hash_table;
     403             : 
     404           2 :         spin_lock_irqsave(&table->cache_lock, flags);
     405           2 :         __remove_rbio_from_cache(rbio);
     406             :         spin_unlock_irqrestore(&table->cache_lock, flags);
     407             : }
     408             : 
     409             : /*
     410             :  * remove everything in the cache
     411             :  */
     412         221 : static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
     413             : {
     414             :         struct btrfs_stripe_hash_table *table;
     415             :         unsigned long flags;
     416             :         struct btrfs_raid_bio *rbio;
     417             : 
     418         221 :         table = info->stripe_hash_table;
     419             : 
     420         221 :         spin_lock_irqsave(&table->cache_lock, flags);
     421         450 :         while (!list_empty(&table->stripe_cache)) {
     422           4 :                 rbio = list_entry(table->stripe_cache.next,
     423             :                                   struct btrfs_raid_bio,
     424             :                                   stripe_cache);
     425           4 :                 __remove_rbio_from_cache(rbio);
     426             :         }
     427             :         spin_unlock_irqrestore(&table->cache_lock, flags);
     428         221 : }
     429             : 
     430             : /*
     431             :  * remove all cached entries and free the hash table
     432             :  * used by unmount
     433             :  */
     434         221 : void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
     435             : {
     436         221 :         if (!info->stripe_hash_table)
     437         221 :                 return;
     438         221 :         btrfs_clear_rbio_cache(info);
     439         442 :         if (is_vmalloc_addr(info->stripe_hash_table))
     440           0 :                 vfree(info->stripe_hash_table);
     441             :         else
     442         221 :                 kfree(info->stripe_hash_table);
     443         221 :         info->stripe_hash_table = NULL;
     444             : }
     445             : 
     446             : /*
     447             :  * insert an rbio into the stripe cache.  It
     448             :  * must have already been prepared by calling
     449             :  * cache_rbio_pages
     450             :  *
     451             :  * If this rbio was already cached, it gets
     452             :  * moved to the front of the lru.
     453             :  *
     454             :  * If the size of the rbio cache is too big, we
     455             :  * prune an item.
     456             :  */
     457          38 : static void cache_rbio(struct btrfs_raid_bio *rbio)
     458             : {
     459             :         struct btrfs_stripe_hash_table *table;
     460             :         unsigned long flags;
     461             : 
     462          38 :         if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
     463             :                 return;
     464             : 
     465           6 :         table = rbio->fs_info->stripe_hash_table;
     466             : 
     467           6 :         spin_lock_irqsave(&table->cache_lock, flags);
     468             :         spin_lock(&rbio->bio_list_lock);
     469             : 
     470             :         /* bump our ref if we were not in the list before */
     471          12 :         if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
     472           6 :                 atomic_inc(&rbio->refs);
     473             : 
     474          12 :         if (!list_empty(&rbio->stripe_cache)){
     475           0 :                 list_move(&rbio->stripe_cache, &table->stripe_cache);
     476             :         } else {
     477           6 :                 list_add(&rbio->stripe_cache, &table->stripe_cache);
     478           6 :                 table->cache_size += 1;
     479             :         }
     480             : 
     481             :         spin_unlock(&rbio->bio_list_lock);
     482             : 
     483           6 :         if (table->cache_size > RBIO_CACHE_SIZE) {
     484             :                 struct btrfs_raid_bio *found;
     485             : 
     486           0 :                 found = list_entry(table->stripe_cache.prev,
     487             :                                   struct btrfs_raid_bio,
     488             :                                   stripe_cache);
     489             : 
     490           0 :                 if (found != rbio)
     491           0 :                         __remove_rbio_from_cache(found);
     492             :         }
     493             : 
     494             :         spin_unlock_irqrestore(&table->cache_lock, flags);
     495             :         return;
     496             : }
     497             : 
     498             : /*
     499             :  * helper function to run the xor_blocks api.  It is only
     500             :  * able to do MAX_XOR_BLOCKS at a time, so we need to
     501             :  * loop through.
     502             :  */
     503         304 : static void run_xor(void **pages, int src_cnt, ssize_t len)
     504             : {
     505             :         int src_off = 0;
     506             :         int xor_src_cnt = 0;
     507         304 :         void *dest = pages[src_cnt];
     508             : 
     509         912 :         while(src_cnt > 0) {
     510         304 :                 xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
     511         304 :                 xor_blocks(xor_src_cnt, len, dest, pages + src_off);
     512             : 
     513         304 :                 src_cnt -= xor_src_cnt;
     514         304 :                 src_off += xor_src_cnt;
     515             :         }
     516         304 : }
     517             : 
     518             : /*
     519             :  * returns true if the bio list inside this rbio
     520             :  * covers an entire stripe (no rmw required).
     521             :  * Must be called with the bio list lock held, or
     522             :  * at a time when you know it is impossible to add
     523             :  * new bios into the list
     524             :  */
     525             : static int __rbio_is_full(struct btrfs_raid_bio *rbio)
     526             : {
     527          88 :         unsigned long size = rbio->bio_list_bytes;
     528             :         int ret = 1;
     529             : 
     530          88 :         if (size != rbio->nr_data * rbio->stripe_len)
     531             :                 ret = 0;
     532             : 
     533          88 :         BUG_ON(size > rbio->nr_data * rbio->stripe_len);
     534             :         return ret;
     535             : }
     536             : 
     537         176 : static int rbio_is_full(struct btrfs_raid_bio *rbio)
     538             : {
     539             :         unsigned long flags;
     540             :         int ret;
     541             : 
     542          88 :         spin_lock_irqsave(&rbio->bio_list_lock, flags);
     543             :         ret = __rbio_is_full(rbio);
     544             :         spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
     545          88 :         return ret;
     546             : }
     547             : 
     548             : /*
     549             :  * returns 1 if it is safe to merge two rbios together.
     550             :  * The merging is safe if the two rbios correspond to
     551             :  * the same stripe and if they are both going in the same
     552             :  * direction (read vs write), and if neither one is
     553             :  * locked for final IO
     554             :  *
     555             :  * The caller is responsible for locking such that
     556             :  * rmw_locked is safe to test
     557             :  */
     558           2 : static int rbio_can_merge(struct btrfs_raid_bio *last,
     559             :                           struct btrfs_raid_bio *cur)
     560             : {
     561           4 :         if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
     562             :             test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
     563             :                 return 0;
     564             : 
     565             :         /*
     566             :          * we can't merge with cached rbios, since the
     567             :          * idea is that when we merge the destination
     568             :          * rbio is going to run our IO for us.  We can
     569             :          * steal from cached rbio's though, other functions
     570             :          * handle that.
     571             :          */
     572           4 :         if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
     573             :             test_bit(RBIO_CACHE_BIT, &cur->flags))
     574             :                 return 0;
     575             : 
     576           4 :         if (last->raid_map[0] !=
     577           2 :             cur->raid_map[0])
     578             :                 return 0;
     579             : 
     580             :         /* reads can't merge with writes */
     581           0 :         if (last->read_rebuild !=
     582           0 :             cur->read_rebuild) {
     583             :                 return 0;
     584             :         }
     585             : 
     586           0 :         return 1;
     587             : }
     588             : 
     589             : /*
     590             :  * helper to index into the pstripe
     591             :  */
     592             : static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
     593             : {
     594         608 :         index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
     595         608 :         return rbio->stripe_pages[index];
     596             : }
     597             : 
     598             : /*
     599             :  * helper to index into the qstripe, returns null
     600             :  * if there is no qstripe
     601             :  */
     602             : static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
     603             : {
     604         304 :         if (rbio->nr_data + 1 == rbio->bbio->num_stripes)
     605             :                 return NULL;
     606             : 
     607         304 :         index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
     608             :                 PAGE_CACHE_SHIFT;
     609         304 :         return rbio->stripe_pages[index];
     610             : }
     611             : 
     612             : /*
     613             :  * The first stripe in the table for a logical address
     614             :  * has the lock.  rbios are added in one of three ways:
     615             :  *
     616             :  * 1) Nobody has the stripe locked yet.  The rbio is given
     617             :  * the lock and 0 is returned.  The caller must start the IO
     618             :  * themselves.
     619             :  *
     620             :  * 2) Someone has the stripe locked, but we're able to merge
     621             :  * with the lock owner.  The rbio is freed and the IO will
     622             :  * start automatically along with the existing rbio.  1 is returned.
     623             :  *
     624             :  * 3) Someone has the stripe locked, but we're not able to merge.
     625             :  * The rbio is added to the lock owner's plug list, or merged into
     626             :  * an rbio already on the plug list.  When the lock owner unlocks,
     627             :  * the next rbio on the list is run and the IO is started automatically.
     628             :  * 1 is returned
     629             :  *
     630             :  * If we return 0, the caller still owns the rbio and must continue with
     631             :  * IO submission.  If we return 1, the caller must assume the rbio has
     632             :  * already been freed.
     633             :  */
     634          38 : static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
     635             : {
     636             :         int bucket = rbio_bucket(rbio);
     637          38 :         struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
     638             :         struct btrfs_raid_bio *cur;
     639             :         struct btrfs_raid_bio *pending;
     640             :         unsigned long flags;
     641          76 :         DEFINE_WAIT(wait);
     642             :         struct btrfs_raid_bio *freeit = NULL;
     643             :         struct btrfs_raid_bio *cache_drop = NULL;
     644             :         int ret = 0;
     645             :         int walk = 0;
     646             : 
     647          38 :         spin_lock_irqsave(&h->lock, flags);
     648          38 :         list_for_each_entry(cur, &h->hash_list, hash_list) {
     649             :                 walk++;
     650           2 :                 if (cur->raid_map[0] == rbio->raid_map[0]) {
     651             :                         spin_lock(&cur->bio_list_lock);
     652             : 
     653             :                         /* can we steal this cached rbio's pages? */
     654           4 :                         if (bio_list_empty(&cur->bio_list) &&
     655           4 :                             list_empty(&cur->plug_list) &&
     656           2 :                             test_bit(RBIO_CACHE_BIT, &cur->flags) &&
     657             :                             !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
     658             :                                 list_del_init(&cur->hash_list);
     659           2 :                                 atomic_dec(&cur->refs);
     660             : 
     661           2 :                                 steal_rbio(cur, rbio);
     662             :                                 cache_drop = cur;
     663             :                                 spin_unlock(&cur->bio_list_lock);
     664             : 
     665             :                                 goto lockit;
     666             :                         }
     667             : 
     668             :                         /* can we merge into the lock owner? */
     669           0 :                         if (rbio_can_merge(cur, rbio)) {
     670             :                                 merge_rbio(cur, rbio);
     671             :                                 spin_unlock(&cur->bio_list_lock);
     672             :                                 freeit = rbio;
     673             :                                 ret = 1;
     674           0 :                                 goto out;
     675             :                         }
     676             : 
     677             : 
     678             :                         /*
     679             :                          * we couldn't merge with the running
     680             :                          * rbio, see if we can merge with the
     681             :                          * pending ones.  We don't have to
     682             :                          * check for rmw_locked because there
     683             :                          * is no way they are inside finish_rmw
     684             :                          * right now
     685             :                          */
     686           0 :                         list_for_each_entry(pending, &cur->plug_list,
     687             :                                             plug_list) {
     688           0 :                                 if (rbio_can_merge(pending, rbio)) {
     689             :                                         merge_rbio(pending, rbio);
     690             :                                         spin_unlock(&cur->bio_list_lock);
     691             :                                         freeit = rbio;
     692             :                                         ret = 1;
     693           0 :                                         goto out;
     694             :                                 }
     695             :                         }
     696             : 
     697             :                         /* no merging, put us on the tail of the plug list,
     698             :                          * our rbio will be started with the currently
     699             :                          * running rbio unlocks
     700             :                          */
     701           0 :                         list_add_tail(&rbio->plug_list, &cur->plug_list);
     702             :                         spin_unlock(&cur->bio_list_lock);
     703             :                         ret = 1;
     704           0 :                         goto out;
     705             :                 }
     706             :         }
     707             : lockit:
     708          38 :         atomic_inc(&rbio->refs);
     709          38 :         list_add(&rbio->hash_list, &h->hash_list);
     710             : out:
     711             :         spin_unlock_irqrestore(&h->lock, flags);
     712          38 :         if (cache_drop)
     713           2 :                 remove_rbio_from_cache(cache_drop);
     714          38 :         if (freeit)
     715           0 :                 __free_raid_bio(freeit);
     716          38 :         return ret;
     717             : }
     718             : 
     719             : /*
     720             :  * called as rmw or parity rebuild is completed.  If the plug list has more
     721             :  * rbios waiting for this stripe, the next one on the list will be started
     722             :  */
     723          38 : static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
     724             : {
     725             :         int bucket;
     726             :         struct btrfs_stripe_hash *h;
     727             :         unsigned long flags;
     728             :         int keep_cache = 0;
     729             : 
     730             :         bucket = rbio_bucket(rbio);
     731          38 :         h = rbio->fs_info->stripe_hash_table->table + bucket;
     732             : 
     733          76 :         if (list_empty(&rbio->plug_list))
     734          38 :                 cache_rbio(rbio);
     735             : 
     736          38 :         spin_lock_irqsave(&h->lock, flags);
     737             :         spin_lock(&rbio->bio_list_lock);
     738             : 
     739          76 :         if (!list_empty(&rbio->hash_list)) {
     740             :                 /*
     741             :                  * if we're still cached and there is no other IO
     742             :                  * to perform, just leave this rbio here for others
     743             :                  * to steal from later
     744             :                  */
     745          76 :                 if (list_empty(&rbio->plug_list) &&
     746             :                     test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
     747             :                         keep_cache = 1;
     748             :                         clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
     749          12 :                         BUG_ON(!bio_list_empty(&rbio->bio_list));
     750             :                         goto done;
     751             :                 }
     752             : 
     753             :                 list_del_init(&rbio->hash_list);
     754          32 :                 atomic_dec(&rbio->refs);
     755             : 
     756             :                 /*
     757             :                  * we use the plug list to hold all the rbios
     758             :                  * waiting for the chance to lock this stripe.
     759             :                  * hand the lock over to one of them.
     760             :                  */
     761          32 :                 if (!list_empty(&rbio->plug_list)) {
     762             :                         struct btrfs_raid_bio *next;
     763             :                         struct list_head *head = rbio->plug_list.next;
     764             : 
     765           0 :                         next = list_entry(head, struct btrfs_raid_bio,
     766             :                                           plug_list);
     767             : 
     768             :                         list_del_init(&rbio->plug_list);
     769             : 
     770           0 :                         list_add(&next->hash_list, &h->hash_list);
     771           0 :                         atomic_inc(&next->refs);
     772             :                         spin_unlock(&rbio->bio_list_lock);
     773             :                         spin_unlock_irqrestore(&h->lock, flags);
     774             : 
     775           0 :                         if (next->read_rebuild)
     776           0 :                                 async_read_rebuild(next);
     777             :                         else {
     778           0 :                                 steal_rbio(rbio, next);
     779           0 :                                 async_rmw_stripe(next);
     780             :                         }
     781             : 
     782             :                         goto done_nolock;
     783          32 :                 } else  if (waitqueue_active(&h->wait)) {
     784             :                         spin_unlock(&rbio->bio_list_lock);
     785             :                         spin_unlock_irqrestore(&h->lock, flags);
     786           0 :                         wake_up(&h->wait);
     787           0 :                         goto done_nolock;
     788             :                 }
     789             :         }
     790             : done:
     791             :         spin_unlock(&rbio->bio_list_lock);
     792             :         spin_unlock_irqrestore(&h->lock, flags);
     793             : 
     794             : done_nolock:
     795          38 :         if (!keep_cache)
     796          32 :                 remove_rbio_from_cache(rbio);
     797          38 : }
     798             : 
     799          44 : static void __free_raid_bio(struct btrfs_raid_bio *rbio)
     800             : {
     801             :         int i;
     802             : 
     803          44 :         WARN_ON(atomic_read(&rbio->refs) < 0);
     804          88 :         if (!atomic_dec_and_test(&rbio->refs))
     805          44 :                 return;
     806             : 
     807          76 :         WARN_ON(!list_empty(&rbio->stripe_cache));
     808          76 :         WARN_ON(!list_empty(&rbio->hash_list));
     809          76 :         WARN_ON(!bio_list_empty(&rbio->bio_list));
     810             : 
     811        2432 :         for (i = 0; i < rbio->nr_pages; i++) {
     812        2432 :                 if (rbio->stripe_pages[i]) {
     813        1024 :                         __free_page(rbio->stripe_pages[i]);
     814        1024 :                         rbio->stripe_pages[i] = NULL;
     815             :                 }
     816             :         }
     817          38 :         kfree(rbio->raid_map);
     818          38 :         kfree(rbio->bbio);
     819          38 :         kfree(rbio);
     820             : }
     821             : 
     822             : static void free_raid_bio(struct btrfs_raid_bio *rbio)
     823             : {
     824          38 :         unlock_stripe(rbio);
     825          38 :         __free_raid_bio(rbio);
     826             : }
     827             : 
     828             : /*
     829             :  * this frees the rbio and runs through all the bios in the
     830             :  * bio_list and calls end_io on them
     831             :  */
     832          38 : static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
     833             : {
     834             :         struct bio *cur = bio_list_get(&rbio->bio_list);
     835             :         struct bio *next;
     836             :         free_raid_bio(rbio);
     837             : 
     838          76 :         while (cur) {
     839          38 :                 next = cur->bi_next;
     840          38 :                 cur->bi_next = NULL;
     841          38 :                 if (uptodate)
     842             :                         set_bit(BIO_UPTODATE, &cur->bi_flags);
     843          38 :                 bio_endio(cur, err);
     844             :                 cur = next;
     845             :         }
     846          38 : }
     847             : 
     848             : /*
     849             :  * end io function used by finish_rmw.  When we finally
     850             :  * get here, we've written a full stripe
     851             :  */
     852         145 : static void raid_write_end_io(struct bio *bio, int err)
     853             : {
     854         145 :         struct btrfs_raid_bio *rbio = bio->bi_private;
     855             : 
     856         145 :         if (err)
     857           0 :                 fail_bio_stripe(rbio, bio);
     858             : 
     859         145 :         bio_put(bio);
     860             : 
     861         290 :         if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
     862             :                 return;
     863             : 
     864             :         err = 0;
     865             : 
     866             :         /* OK, we have read all the stripes we need to. */
     867          76 :         if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
     868             :                 err = -EIO;
     869             : 
     870          38 :         rbio_orig_end_io(rbio, err, 0);
     871          38 :         return;
     872             : }
     873             : 
     874             : /*
     875             :  * the read/modify/write code wants to use the original bio for
     876             :  * any pages it included, and then use the rbio for everything
     877             :  * else.  This function decides if a given index (stripe number)
     878             :  * and page number in that stripe fall inside the original bio
     879             :  * or the rbio.
     880             :  *
     881             :  * if you set bio_list_only, you'll get a NULL back for any ranges
     882             :  * that are outside the bio_list
     883             :  *
     884             :  * This doesn't take any refs on anything, you get a bare page pointer
     885             :  * and the caller must bump refs as required.
     886             :  *
     887             :  * You must call index_rbio_pages once before you can trust
     888             :  * the answers from this function.
     889             :  */
     890        5712 : static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
     891             :                                  int index, int pagenr, int bio_list_only)
     892             : {
     893             :         int chunk_page;
     894             :         struct page *p = NULL;
     895             : 
     896        5712 :         chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
     897             : 
     898             :         spin_lock_irq(&rbio->bio_list_lock);
     899        5712 :         p = rbio->bio_pages[chunk_page];
     900             :         spin_unlock_irq(&rbio->bio_list_lock);
     901             : 
     902        5712 :         if (p || bio_list_only)
     903             :                 return p;
     904             : 
     905        1280 :         return rbio->stripe_pages[chunk_page];
     906             : }
     907             : 
     908             : /*
     909             :  * number of pages we need for the entire stripe across all the
     910             :  * drives
     911             :  */
     912             : static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
     913             : {
     914          38 :         unsigned long nr = stripe_len * nr_stripes;
     915          38 :         return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
     916             : }
     917             : 
     918             : /*
     919             :  * allocation and initial setup for the btrfs_raid_bio.  Not
     920             :  * this does not allocate any pages for rbio->pages.
     921             :  */
     922          38 : static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
     923             :                           struct btrfs_bio *bbio, u64 *raid_map,
     924             :                           u64 stripe_len)
     925             : {
     926             :         struct btrfs_raid_bio *rbio;
     927             :         int nr_data = 0;
     928          76 :         int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes);
     929             :         void *p;
     930             : 
     931          38 :         rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2,
     932             :                         GFP_NOFS);
     933          38 :         if (!rbio) {
     934           0 :                 kfree(raid_map);
     935           0 :                 kfree(bbio);
     936             :                 return ERR_PTR(-ENOMEM);
     937             :         }
     938             : 
     939             :         bio_list_init(&rbio->bio_list);
     940          38 :         INIT_LIST_HEAD(&rbio->plug_list);
     941          38 :         spin_lock_init(&rbio->bio_list_lock);
     942          38 :         INIT_LIST_HEAD(&rbio->stripe_cache);
     943          38 :         INIT_LIST_HEAD(&rbio->hash_list);
     944          38 :         rbio->bbio = bbio;
     945          38 :         rbio->raid_map = raid_map;
     946          38 :         rbio->fs_info = root->fs_info;
     947          38 :         rbio->stripe_len = stripe_len;
     948          38 :         rbio->nr_pages = num_pages;
     949          38 :         rbio->faila = -1;
     950          38 :         rbio->failb = -1;
     951             :         atomic_set(&rbio->refs, 1);
     952             : 
     953             :         /*
     954             :          * the stripe_pages and bio_pages array point to the extra
     955             :          * memory we allocated past the end of the rbio
     956             :          */
     957          38 :         p = rbio + 1;
     958          38 :         rbio->stripe_pages = p;
     959          38 :         rbio->bio_pages = p + sizeof(struct page *) * num_pages;
     960             : 
     961          38 :         if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
     962          19 :                 nr_data = bbio->num_stripes - 2;
     963             :         else
     964          19 :                 nr_data = bbio->num_stripes - 1;
     965             : 
     966          38 :         rbio->nr_data = nr_data;
     967             :         return rbio;
     968             : }
     969             : 
     970             : /* allocate pages for all the stripes in the bio, including parity */
     971          12 : static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
     972             : {
     973             :         int i;
     974             :         struct page *page;
     975             : 
     976         768 :         for (i = 0; i < rbio->nr_pages; i++) {
     977         768 :                 if (rbio->stripe_pages[i])
     978         512 :                         continue;
     979             :                 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
     980         246 :                 if (!page)
     981             :                         return -ENOMEM;
     982         247 :                 rbio->stripe_pages[i] = page;
     983             :                 ClearPageUptodate(page);
     984             :         }
     985             :         return 0;
     986             : }
     987             : 
     988             : /* allocate pages for just the p/q stripes */
     989          32 : static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
     990             : {
     991             :         int i;
     992             :         struct page *page;
     993             : 
     994          32 :         i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
     995             : 
     996         800 :         for (; i < rbio->nr_pages; i++) {
     997         768 :                 if (rbio->stripe_pages[i])
     998           0 :                         continue;
     999             :                 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
    1000         768 :                 if (!page)
    1001             :                         return -ENOMEM;
    1002         768 :                 rbio->stripe_pages[i] = page;
    1003             :         }
    1004             :         return 0;
    1005             : }
    1006             : 
    1007             : /*
    1008             :  * add a single page from a specific stripe into our list of bios for IO
    1009             :  * this will try to merge into existing bios if possible, and returns
    1010             :  * zero if all went well.
    1011             :  */
    1012        2372 : static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
    1013             :                             struct bio_list *bio_list,
    1014             :                             struct page *page,
    1015             :                             int stripe_nr,
    1016             :                             unsigned long page_index,
    1017             :                             unsigned long bio_max_len)
    1018             : {
    1019        2372 :         struct bio *last = bio_list->tail;
    1020             :         u64 last_end = 0;
    1021             :         int ret;
    1022             :         struct bio *bio;
    1023             :         struct btrfs_bio_stripe *stripe;
    1024             :         u64 disk_start;
    1025             : 
    1026        2372 :         stripe = &rbio->bbio->stripes[stripe_nr];
    1027        2372 :         disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT);
    1028             : 
    1029             :         /* if the device is missing, just fail this stripe */
    1030        2372 :         if (!stripe->dev->bdev)
    1031           0 :                 return fail_rbio_index(rbio, stripe_nr);
    1032             : 
    1033             :         /* see if we can add this page onto our existing bio */
    1034        2372 :         if (last) {
    1035        2330 :                 last_end = (u64)last->bi_iter.bi_sector << 9;
    1036        2330 :                 last_end += last->bi_iter.bi_size;
    1037             : 
    1038             :                 /*
    1039             :                  * we can't merge these if they are from different
    1040             :                  * devices or if they are not contiguous
    1041             :                  */
    1042        4548 :                 if (last_end == disk_start && stripe->dev->bdev &&
    1043        2218 :                     test_bit(BIO_UPTODATE, &last->bi_flags) &&
    1044        2218 :                     last->bi_bdev == stripe->dev->bdev) {
    1045        2217 :                         ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
    1046        2217 :                         if (ret == PAGE_CACHE_SIZE)
    1047             :                                 return 0;
    1048             :                 }
    1049             :         }
    1050             : 
    1051             :         /* put a new bio on the list */
    1052         155 :         bio = btrfs_io_bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1);
    1053         155 :         if (!bio)
    1054             :                 return -ENOMEM;
    1055             : 
    1056         155 :         bio->bi_iter.bi_size = 0;
    1057         155 :         bio->bi_bdev = stripe->dev->bdev;
    1058         155 :         bio->bi_iter.bi_sector = disk_start >> 9;
    1059             :         set_bit(BIO_UPTODATE, &bio->bi_flags);
    1060             : 
    1061         155 :         bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
    1062             :         bio_list_add(bio_list, bio);
    1063         155 :         return 0;
    1064             : }
    1065             : 
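An aside on the arithmetic driving the merge test above: the candidate page's
disk_start is the stripe's physical byte offset plus page_index whole pages,
while the tail bio ends at its starting sector (bi_sector is in 512-byte
units, hence the shift by 9) plus bi_size.  Below is a minimal userspace
sketch of just that arithmetic -- not part of raid56.c; the struct, helper
name and sample offsets are invented for illustration:

    #include <stdint.h>
    #include <stdio.h>

    #define SECTOR_SHIFT 9
    #define SKETCH_PAGE_SHIFT 12                  /* assume 4 KiB pages */

    struct tail_bio {                             /* stand-in for struct bio */
            uint64_t bi_sector;                   /* start, in 512-byte sectors */
            uint32_t bi_size;                     /* bytes already in the bio */
    };

    /* can a page at disk_start be appended to the tail bio? */
    static int can_merge(const struct tail_bio *last, uint64_t disk_start)
    {
            uint64_t last_end = (last->bi_sector << SECTOR_SHIFT) + last->bi_size;
            return last_end == disk_start;        /* real code also compares bi_bdev */
    }

    int main(void)
    {
            uint64_t stripe_physical = 1048576;   /* hypothetical 1 MiB */
            unsigned long page_index = 3;
            uint64_t disk_start = stripe_physical +
                                  ((uint64_t)page_index << SKETCH_PAGE_SHIFT);

            /* a tail bio already holding pages 0-2 of the same stripe */
            struct tail_bio last = {
                    .bi_sector = stripe_physical >> SECTOR_SHIFT,
                    .bi_size   = 3 << SKETCH_PAGE_SHIFT,
            };

            printf("disk_start=%llu merge=%d\n",
                   (unsigned long long)disk_start, can_merge(&last, disk_start));
            return 0;
    }

When the two byte offsets line up (and, in the real function, the block
device matches and the bio still has room), bio_add_page extends the tail
bio instead of allocating a new one.
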
    1066             : /*
    1067             :  * while we're doing the read/modify/write cycle, we could
     1068             :  * have errors in reading pages off the disk.  This checks
     1069             :  * for errors and, if we're not able to read the page, it'll
     1070             :  * trigger parity reconstruction.  The rmw will be finished
     1071             :  * after we've reconstructed the failed stripes.
    1072             :  */
    1073           6 : static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
    1074             : {
    1075           6 :         if (rbio->faila >= 0 || rbio->failb >= 0) {
    1076           0 :                 BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1);
    1077           0 :                 __raid56_parity_recover(rbio);
    1078             :         } else {
    1079           6 :                 finish_rmw(rbio);
    1080             :         }
    1081           6 : }
    1082             : 
    1083             : /*
    1084             :  * these are just the pages from the rbio array, not from anything
    1085             :  * the FS sent down to us
    1086             :  */
    1087             : static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page)
    1088             : {
    1089             :         int index;
    1090        1096 :         index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT);
    1091        1096 :         index += page;
    1092        1096 :         return rbio->stripe_pages[index];
    1093             : }
    1094             : 
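The stripe_pages array behind this helper is laid out as consecutive runs of
one stripe's worth of pages, so the lookup is simply
stripe * pages_per_stripe + page.  A worked example, assuming a 64 KiB
stripe_len and 4 KiB pages (both assumptions, not values read from this
report):

    #include <stdio.h>

    int main(void)
    {
            unsigned long stripe_len = 64 * 1024;             /* assumed */
            int pages_per_stripe = stripe_len >> 12;          /* 16 */
            int stripe = 2, page = 5;

            /* same math as rbio_stripe_page() */
            int index = stripe * pages_per_stripe + page;     /* 37 */
            printf("stripe_pages[%d]\n", index);
            return 0;
    }
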
    1095             : /*
    1096             :  * helper function to walk our bio list and populate the bio_pages array with
    1097             :  * the result.  This seems expensive, but it is faster than constantly
     1098             :  * searching through the bio list as we set up the IO in finish_rmw or stripe
    1099             :  * reconstruction.
    1100             :  *
    1101             :  * This must be called before you trust the answers from page_in_rbio
    1102             :  */
    1103          44 : static void index_rbio_pages(struct btrfs_raid_bio *rbio)
    1104             : {
    1105             :         struct bio *bio;
    1106             :         u64 start;
    1107             :         unsigned long stripe_offset;
    1108             :         unsigned long page_index;
    1109             :         struct page *p;
    1110             :         int i;
    1111             : 
    1112             :         spin_lock_irq(&rbio->bio_list_lock);
    1113          88 :         bio_list_for_each(bio, &rbio->bio_list) {
    1114          44 :                 start = (u64)bio->bi_iter.bi_sector << 9;
    1115          44 :                 stripe_offset = start - rbio->raid_map[0];
    1116          44 :                 page_index = stripe_offset >> PAGE_CACHE_SHIFT;
    1117             : 
    1118        1436 :                 for (i = 0; i < bio->bi_vcnt; i++) {
    1119        1392 :                         p = bio->bi_io_vec[i].bv_page;
    1120        1392 :                         rbio->bio_pages[page_index + i] = p;
    1121             :                 }
    1122             :         }
    1123             :         spin_unlock_irq(&rbio->bio_list_lock);
    1124          44 : }
    1125             : 
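The index computed by this walk is plain offset arithmetic: the bio's
starting byte (bi_sector shifted up by 9) minus the logical address of the
full stripe (raid_map[0]) gives the byte offset into the data area, and
shifting that by the page size gives the first slot in bio_pages.  A short
sketch with made-up numbers, not taken from a real trace:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t raid_map0 = 10485760;        /* hypothetical stripe start */
            uint64_t bi_sector = 20512;           /* bio start, 512-byte sectors */

            uint64_t start = bi_sector << 9;                   /* 10502144 bytes */
            uint64_t stripe_offset = start - raid_map0;        /* 16384 */
            unsigned long page_index = stripe_offset >> 12;    /* 4, for 4 KiB pages */

            /* page i of this bio is filed at bio_pages[page_index + i] */
            printf("first page lands in bio_pages[%lu]\n", page_index);
            return 0;
    }
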
    1126             : /*
     1127             :  * this is called in one of two situations.  We either
    1128             :  * have a full stripe from the higher layers, or we've read all
    1129             :  * the missing bits off disk.
    1130             :  *
    1131             :  * This will calculate the parity and then send down any
    1132             :  * changed blocks.
    1133             :  */
    1134        1558 : static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
    1135             : {
    1136          38 :         struct btrfs_bio *bbio = rbio->bbio;
    1137          38 :         void *pointers[bbio->num_stripes];
    1138          38 :         int stripe_len = rbio->stripe_len;
    1139          38 :         int nr_data = rbio->nr_data;
    1140             :         int stripe;
    1141             :         int pagenr;
    1142             :         int p_stripe = -1;
    1143             :         int q_stripe = -1;
    1144             :         struct bio_list bio_list;
    1145             :         struct bio *bio;
    1146          38 :         int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT;
    1147             :         int ret;
    1148             : 
    1149             :         bio_list_init(&bio_list);
    1150             : 
    1151          38 :         if (bbio->num_stripes - rbio->nr_data == 1) {
    1152             :                 p_stripe = bbio->num_stripes - 1;
    1153          19 :         } else if (bbio->num_stripes - rbio->nr_data == 2) {
    1154             :                 p_stripe = bbio->num_stripes - 2;
    1155          19 :                 q_stripe = bbio->num_stripes - 1;
    1156             :         } else {
    1157           0 :                 BUG();
    1158             :         }
    1159             : 
    1160             :         /* at this point we either have a full stripe,
    1161             :          * or we've read the full stripe from the drive.
    1162             :          * recalculate the parity and write the new results.
    1163             :          *
    1164             :          * We're not allowed to add any new bios to the
    1165             :          * bio list here, anyone else that wants to
    1166             :          * change this stripe needs to do their own rmw.
    1167             :          */
    1168             :         spin_lock_irq(&rbio->bio_list_lock);
    1169             :         set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
    1170             :         spin_unlock_irq(&rbio->bio_list_lock);
    1171             : 
    1172          38 :         atomic_set(&rbio->bbio->error, 0);
    1173             : 
    1174             :         /*
    1175             :          * now that we've set rmw_locked, run through the
    1176             :          * bio list one last time and map the page pointers
    1177             :          *
    1178             :          * We don't cache full rbios because we're assuming
    1179             :          * the higher layers are unlikely to use this area of
    1180             :          * the disk again soon.  If they do use it again,
    1181             :          * hopefully they will send another full bio.
    1182             :          */
    1183          38 :         index_rbio_pages(rbio);
    1184          38 :         if (!rbio_is_full(rbio))
    1185           6 :                 cache_rbio_pages(rbio);
    1186             :         else
    1187             :                 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
    1188             : 
    1189         608 :         for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
    1190             :                 struct page *p;
    1191             :                 /* first collect one page from each data stripe */
    1192        1520 :                 for (stripe = 0; stripe < nr_data; stripe++) {
    1193        1520 :                         p = page_in_rbio(rbio, stripe, pagenr, 0);
    1194        1520 :                         pointers[stripe] = kmap(p);
    1195             :                 }
    1196             : 
    1197             :                 /* then add the parity stripe */
    1198             :                 p = rbio_pstripe_page(rbio, pagenr);
    1199             :                 SetPageUptodate(p);
    1200        1216 :                 pointers[stripe++] = kmap(p);
    1201             : 
    1202         608 :                 if (q_stripe != -1) {
    1203             : 
    1204             :                         /*
    1205             :                          * raid6, add the qstripe and call the
    1206             :                          * library function to fill in our p/q
    1207             :                          */
    1208             :                         p = rbio_qstripe_page(rbio, pagenr);
    1209             :                         SetPageUptodate(p);
    1210         304 :                         pointers[stripe++] = kmap(p);
    1211             : 
    1212         304 :                         raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE,
    1213             :                                                 pointers);
    1214             :                 } else {
    1215             :                         /* raid5 */
    1216         304 :                         memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
    1217         304 :                         run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
    1218             :                 }
    1219             : 
    1220             : 
    1221        2432 :                 for (stripe = 0; stripe < bbio->num_stripes; stripe++)
    1222        2432 :                         kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
    1223             :         }
    1224             : 
    1225             :         /*
    1226             :          * time to start writing.  Make bios for everything from the
    1227             :          * higher layers (the bio_list in our rbio) and our p/q.  Ignore
    1228             :          * everything else.
    1229             :          */
    1230         152 :         for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
    1231        2432 :                 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
    1232             :                         struct page *page;
    1233        2432 :                         if (stripe < rbio->nr_data) {
    1234        1520 :                                 page = page_in_rbio(rbio, stripe, pagenr, 1);
    1235        1520 :                                 if (!page)
    1236         184 :                                         continue;
    1237             :                         } else {
    1238             :                                page = rbio_stripe_page(rbio, stripe, pagenr);
    1239             :                         }
    1240             : 
    1241        2248 :                         ret = rbio_add_io_page(rbio, &bio_list,
    1242        2248 :                                        page, stripe, pagenr, rbio->stripe_len);
    1243        2248 :                         if (ret)
    1244             :                                 goto cleanup;
    1245             :                 }
    1246             :         }
    1247             : 
    1248          76 :         atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list));
    1249          38 :         BUG_ON(atomic_read(&bbio->stripes_pending) == 0);
    1250             : 
    1251             :         while (1) {
    1252             :                 bio = bio_list_pop(&bio_list);
    1253         183 :                 if (!bio)
    1254             :                         break;
    1255             : 
    1256         145 :                 bio->bi_private = rbio;
    1257         145 :                 bio->bi_end_io = raid_write_end_io;
    1258         145 :                 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
    1259         145 :                 submit_bio(WRITE, bio);
    1260         145 :         }
    1261          38 :         return;
    1262             : 
    1263             : cleanup:
    1264           0 :         rbio_orig_end_io(rbio, -EIO, 0);
    1265             : }
    1266             : 
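In the RAID5 branch of finish_rmw, parity is the byte-wise XOR of the data
pages: the P page is seeded with a copy of data page 0 (the memcpy) and the
remaining data pages are XORed in by run_xor; the RAID6 branch instead hands
the whole pointer array to the raid6 library to fill in P and Q.  A minimal
userspace sketch of the RAID5 XOR only, with invented buffer contents:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define SKETCH_PAGE_SIZE 4096
    #define NR_DATA 3                              /* assume 3 data stripes + P */

    /* P = D0 ^ D1 ^ ... ^ Dn-1, one byte at a time */
    static void xor_parity(uint8_t *p, uint8_t *data[], int nr_data)
    {
            memcpy(p, data[0], SKETCH_PAGE_SIZE);
            for (int d = 1; d < nr_data; d++)
                    for (size_t i = 0; i < SKETCH_PAGE_SIZE; i++)
                            p[i] ^= data[d][i];
    }

    int main(void)
    {
            static uint8_t d0[SKETCH_PAGE_SIZE], d1[SKETCH_PAGE_SIZE],
                           d2[SKETCH_PAGE_SIZE], p[SKETCH_PAGE_SIZE];
            uint8_t *data[NR_DATA] = { d0, d1, d2 };

            memset(d0, 0xaa, sizeof(d0));
            memset(d1, 0x0f, sizeof(d1));
            memset(d2, 0x55, sizeof(d2));

            xor_parity(p, data, NR_DATA);
            printf("p[0] = 0x%02x\n", p[0]);       /* 0xaa ^ 0x0f ^ 0x55 = 0xf0 */
            return 0;
    }
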
    1267             : /*
    1268             :  * helper to find the stripe number for a given bio.  Used to figure out which
    1269             :  * stripe has failed.  This expects the bio to correspond to a physical disk,
    1270             :  * so it looks up based on physical sector numbers.
    1271             :  */
    1272             : static int find_bio_stripe(struct btrfs_raid_bio *rbio,
    1273             :                            struct bio *bio)
    1274             : {
    1275             :         u64 physical = bio->bi_iter.bi_sector;
    1276             :         u64 stripe_start;
    1277             :         int i;
    1278             :         struct btrfs_bio_stripe *stripe;
    1279             : 
    1280           0 :         physical <<= 9;
    1281             : 
    1282           0 :         for (i = 0; i < rbio->bbio->num_stripes; i++) {
    1283           0 :                 stripe = &rbio->bbio->stripes[i];
    1284           0 :                 stripe_start = stripe->physical;
    1285           0 :                 if (physical >= stripe_start &&
    1286           0 :                     physical < stripe_start + rbio->stripe_len) {
    1287             :                         return i;
    1288             :                 }
    1289             :         }
    1290             :         return -1;
    1291             : }
    1292             : 
    1293             : /*
    1294             :  * helper to find the stripe number for a given
    1295             :  * bio (before mapping).  Used to figure out which stripe has
    1296             :  * failed.  This looks up based on logical block numbers.
    1297             :  */
    1298             : static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
    1299             :                                    struct bio *bio)
    1300             : {
    1301             :         u64 logical = bio->bi_iter.bi_sector;
    1302             :         u64 stripe_start;
    1303             :         int i;
    1304             : 
    1305           0 :         logical <<= 9;
    1306             : 
    1307           0 :         for (i = 0; i < rbio->nr_data; i++) {
    1308           0 :                 stripe_start = rbio->raid_map[i];
    1309           0 :                 if (logical >= stripe_start &&
    1310           0 :                     logical < stripe_start + rbio->stripe_len) {
    1311             :                         return i;
    1312             :                 }
    1313             :         }
    1314             :         return -1;
    1315             : }
    1316             : 
    1317             : /*
    1318             :  * returns -EIO if we had too many failures
    1319             :  */
    1320           0 : static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
    1321             : {
    1322             :         unsigned long flags;
    1323             :         int ret = 0;
    1324             : 
    1325           0 :         spin_lock_irqsave(&rbio->bio_list_lock, flags);
    1326             : 
    1327             :         /* we already know this stripe is bad, move on */
    1328           0 :         if (rbio->faila == failed || rbio->failb == failed)
    1329             :                 goto out;
    1330             : 
    1331           0 :         if (rbio->faila == -1) {
    1332             :                 /* first failure on this rbio */
    1333           0 :                 rbio->faila = failed;
    1334           0 :                 atomic_inc(&rbio->bbio->error);
    1335           0 :         } else if (rbio->failb == -1) {
    1336             :                 /* second failure on this rbio */
    1337           0 :                 rbio->failb = failed;
    1338           0 :                 atomic_inc(&rbio->bbio->error);
    1339             :         } else {
    1340             :                 ret = -EIO;
    1341             :         }
    1342             : out:
    1343             :         spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
    1344             : 
    1345           0 :         return ret;
    1346             : }
    1347             : 
    1348             : /*
    1349             :  * helper to fail a stripe based on a physical disk
    1350             :  * bio.
    1351             :  */
    1352           0 : static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
    1353           0 :                            struct bio *bio)
    1354             : {
    1355             :         int failed = find_bio_stripe(rbio, bio);
    1356             : 
    1357           0 :         if (failed < 0)
    1358             :                 return -EIO;
    1359             : 
    1360           0 :         return fail_rbio_index(rbio, failed);
    1361             : }
    1362             : 
    1363             : /*
    1364             :  * this sets each page in the bio uptodate.  It should only be used on private
    1365             :  * rbio pages, nothing that comes in from the higher layers
    1366             :  */
    1367             : static void set_bio_pages_uptodate(struct bio *bio)
    1368             : {
    1369             :         int i;
    1370             :         struct page *p;
    1371             : 
    1372         124 :         for (i = 0; i < bio->bi_vcnt; i++) {
    1373         124 :                 p = bio->bi_io_vec[i].bv_page;
    1374             :                 SetPageUptodate(p);
    1375             :         }
    1376             : }
    1377             : 
    1378             : /*
    1379             :  * end io for the read phase of the rmw cycle.  All the bios here are physical
    1380             :  * stripe bios we've read from the disk so we can recalculate the parity of the
    1381             :  * stripe.
    1382             :  *
    1383             :  * This will usually kick off finish_rmw once all the bios are read in, but it
    1384             :  * may trigger parity reconstruction if we had any errors along the way
    1385             :  */
    1386          10 : static void raid_rmw_end_io(struct bio *bio, int err)
    1387             : {
    1388          10 :         struct btrfs_raid_bio *rbio = bio->bi_private;
    1389             : 
    1390          10 :         if (err)
    1391           0 :                 fail_bio_stripe(rbio, bio);
    1392             :         else
    1393             :                 set_bio_pages_uptodate(bio);
    1394             : 
    1395          10 :         bio_put(bio);
    1396             : 
    1397          20 :         if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
    1398             :                 return;
    1399             : 
    1400             :         err = 0;
    1401           8 :         if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
    1402             :                 goto cleanup;
    1403             : 
    1404             :         /*
    1405             :          * this will normally call finish_rmw to start our write
    1406             :          * but if there are any failed stripes we'll reconstruct
    1407             :          * from parity first
    1408             :          */
    1409           4 :         validate_rbio_for_rmw(rbio);
    1410           4 :         return;
    1411             : 
    1412             : cleanup:
    1413             : 
    1414           0 :         rbio_orig_end_io(rbio, -EIO, 0);
    1415             : }
    1416             : 
    1417           6 : static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
    1418             : {
    1419           6 :         btrfs_init_work(&rbio->work, btrfs_rmw_helper,
    1420             :                         rmw_work, NULL, NULL);
    1421             : 
    1422           6 :         btrfs_queue_work(rbio->fs_info->rmw_workers,
    1423             :                          &rbio->work);
    1424           6 : }
    1425             : 
    1426           0 : static void async_read_rebuild(struct btrfs_raid_bio *rbio)
    1427             : {
    1428           0 :         btrfs_init_work(&rbio->work, btrfs_rmw_helper,
    1429             :                         read_rebuild_work, NULL, NULL);
    1430             : 
    1431           0 :         btrfs_queue_work(rbio->fs_info->rmw_workers,
    1432             :                          &rbio->work);
    1433           0 : }
    1434             : 
    1435             : /*
    1436             :  * the stripe must be locked by the caller.  It will
    1437             :  * unlock after all the writes are done
    1438             :  */
    1439         190 : static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
    1440             : {
    1441             :         int bios_to_read = 0;
    1442           6 :         struct btrfs_bio *bbio = rbio->bbio;
    1443             :         struct bio_list bio_list;
    1444             :         int ret;
    1445           6 :         int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
    1446             :         int pagenr;
    1447             :         int stripe;
    1448             :         struct bio *bio;
    1449             : 
    1450             :         bio_list_init(&bio_list);
    1451             : 
    1452           6 :         ret = alloc_rbio_pages(rbio);
    1453           6 :         if (ret)
    1454             :                 goto cleanup;
    1455             : 
    1456           6 :         index_rbio_pages(rbio);
    1457             : 
    1458           6 :         atomic_set(&rbio->bbio->error, 0);
    1459             :         /*
    1460             :          * build a list of bios to read all the missing parts of this
    1461             :          * stripe
    1462             :          */
    1463          21 :         for (stripe = 0; stripe < rbio->nr_data; stripe++) {
    1464         240 :                 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
    1465             :                         struct page *page;
    1466             :                         /*
    1467             :                          * we want to find all the pages missing from
    1468             :                          * the rbio and read them from the disk.  If
    1469             :                          * page_in_rbio finds a page in the bio list
    1470             :                          * we don't need to read it off the stripe.
    1471             :                          */
    1472         240 :                         page = page_in_rbio(rbio, stripe, pagenr, 1);
    1473         240 :                         if (page)
    1474          56 :                                 continue;
    1475             : 
    1476             :                         page = rbio_stripe_page(rbio, stripe, pagenr);
    1477             :                         /*
    1478             :                          * the bio cache may have handed us an uptodate
    1479             :                          * page.  If so, be happy and use it
    1480             :                          */
    1481         184 :                         if (PageUptodate(page))
    1482          60 :                                 continue;
    1483             : 
    1484         124 :                         ret = rbio_add_io_page(rbio, &bio_list, page,
    1485         124 :                                        stripe, pagenr, rbio->stripe_len);
    1486         124 :                         if (ret)
    1487             :                                 goto cleanup;
    1488             :                 }
    1489             :         }
    1490             : 
    1491          12 :         bios_to_read = bio_list_size(&bio_list);
    1492           6 :         if (!bios_to_read) {
    1493             :                 /*
    1494             :                  * this can happen if others have merged with
     1495             :                  * us; it means there is nothing left to read.
    1496             :                  * But if there are missing devices it may not be
    1497             :                  * safe to do the full stripe write yet.
    1498             :                  */
    1499             :                 goto finish;
    1500             :         }
    1501             : 
    1502             :         /*
    1503             :          * the bbio may be freed once we submit the last bio.  Make sure
    1504             :          * not to touch it after that
    1505             :          */
    1506             :         atomic_set(&bbio->stripes_pending, bios_to_read);
    1507             :         while (1) {
    1508             :                 bio = bio_list_pop(&bio_list);
    1509          14 :                 if (!bio)
    1510             :                         break;
    1511             : 
    1512          10 :                 bio->bi_private = rbio;
    1513          10 :                 bio->bi_end_io = raid_rmw_end_io;
    1514             : 
    1515          10 :                 btrfs_bio_wq_end_io(rbio->fs_info, bio,
    1516             :                                     BTRFS_WQ_ENDIO_RAID56);
    1517             : 
    1518          10 :                 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
    1519          10 :                 submit_bio(READ, bio);
    1520          10 :         }
    1521             :         /* the actual write will happen once the reads are done */
    1522             :         return 0;
    1523             : 
    1524             : cleanup:
    1525           0 :         rbio_orig_end_io(rbio, -EIO, 0);
    1526           0 :         return -EIO;
    1527             : 
    1528             : finish:
    1529           2 :         validate_rbio_for_rmw(rbio);
    1530           2 :         return 0;
    1531             : }
    1532             : 
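Per data page, the read phase above has three possible outcomes: the page is
already supplied by the bio list (page_in_rbio), it is already uptodate from
the stripe cache, or it has to be read from disk via rbio_add_io_page.  A
tiny model of that classification; the flag arrays are invented for
illustration and are not the kernel structures:

    #include <stdbool.h>
    #include <stdio.h>

    int main(void)
    {
            /* one flag pair per data page of a small, hypothetical stripe */
            bool in_bio_list[4] = { true, false, false, true };
            bool uptodate[4]    = { false, true, false, false };
            int to_read = 0;

            for (int pagenr = 0; pagenr < 4; pagenr++) {
                    if (in_bio_list[pagenr])
                            continue;      /* caller already sent fresh data */
                    if (uptodate[pagenr])
                            continue;      /* stripe cache satisfied it */
                    to_read++;             /* becomes an rbio_add_io_page call */
            }
            printf("pages to read from disk: %d\n", to_read);
            return 0;
    }
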
    1533             : /*
    1534             :  * if the upper layers pass in a full stripe, we thank them by only allocating
    1535             :  * enough pages to hold the parity, and sending it all down quickly.
    1536             :  */
    1537          32 : static int full_stripe_write(struct btrfs_raid_bio *rbio)
    1538             : {
    1539             :         int ret;
    1540             : 
    1541          32 :         ret = alloc_rbio_parity_pages(rbio);
    1542          32 :         if (ret) {
    1543           0 :                 __free_raid_bio(rbio);
    1544           0 :                 return ret;
    1545             :         }
    1546             : 
    1547          32 :         ret = lock_stripe_add(rbio);
    1548          32 :         if (ret == 0)
    1549          32 :                 finish_rmw(rbio);
    1550             :         return 0;
    1551             : }
    1552             : 
    1553             : /*
    1554             :  * partial stripe writes get handed over to async helpers.
    1555             :  * We're really hoping to merge a few more writes into this
    1556             :  * rbio before calculating new parity
    1557             :  */
    1558           6 : static int partial_stripe_write(struct btrfs_raid_bio *rbio)
    1559             : {
    1560             :         int ret;
    1561             : 
    1562           6 :         ret = lock_stripe_add(rbio);
    1563           6 :         if (ret == 0)
    1564           6 :                 async_rmw_stripe(rbio);
    1565           6 :         return 0;
    1566             : }
    1567             : 
    1568             : /*
     1569             :  * sometimes while we're reading from the drive to
     1570             :  * recalculate parity, enough new bios come in to create
    1571             :  * a full stripe.  So we do a check here to see if we can
    1572             :  * go directly to finish_rmw
    1573             :  */
    1574           6 : static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
    1575             : {
    1576             :         /* head off into rmw land if we don't have a full stripe */
    1577           6 :         if (!rbio_is_full(rbio))
    1578           6 :                 return partial_stripe_write(rbio);
    1579           0 :         return full_stripe_write(rbio);
    1580             : }
    1581             : 
    1582             : /*
     1583             :  * We use plugging callbacks to collect full stripes.
    1584             :  * Any time we get a partial stripe write while plugged
    1585             :  * we collect it into a list.  When the unplug comes down,
    1586             :  * we sort the list by logical block number and merge
    1587             :  * everything we can into the same rbios
    1588             :  */
    1589             : struct btrfs_plug_cb {
    1590             :         struct blk_plug_cb cb;
    1591             :         struct btrfs_fs_info *info;
    1592             :         struct list_head rbio_list;
    1593             :         struct btrfs_work work;
    1594             : };
    1595             : 
    1596             : /*
    1597             :  * rbios on the plug list are sorted for easier merging.
    1598             :  */
    1599           2 : static int plug_cmp(void *priv, struct list_head *a, struct list_head *b)
    1600             : {
    1601             :         struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
    1602             :                                                  plug_list);
    1603             :         struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
    1604             :                                                  plug_list);
    1605           2 :         u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
    1606           2 :         u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
    1607             : 
    1608           2 :         if (a_sector < b_sector)
    1609             :                 return -1;
    1610           0 :         if (a_sector > b_sector)
    1611             :                 return 1;
    1612           0 :         return 0;
    1613             : }
    1614             : 
    1615           4 : static void run_plug(struct btrfs_plug_cb *plug)
    1616             : {
    1617             :         struct btrfs_raid_bio *cur;
    1618             :         struct btrfs_raid_bio *last = NULL;
    1619             : 
    1620             :         /*
    1621             :          * sort our plug list then try to merge
    1622             :          * everything we can in hopes of creating full
    1623             :          * stripes.
    1624             :          */
    1625           4 :         list_sort(NULL, &plug->rbio_list, plug_cmp);
    1626          14 :         while (!list_empty(&plug->rbio_list)) {
    1627           6 :                 cur = list_entry(plug->rbio_list.next,
    1628             :                                  struct btrfs_raid_bio, plug_list);
    1629           6 :                 list_del_init(&cur->plug_list);
    1630             : 
    1631           6 :                 if (rbio_is_full(cur)) {
    1632             :                         /* we have a full stripe, send it down */
    1633           0 :                         full_stripe_write(cur);
    1634           0 :                         continue;
    1635             :                 }
    1636           6 :                 if (last) {
    1637           2 :                         if (rbio_can_merge(last, cur)) {
    1638             :                                 merge_rbio(last, cur);
    1639           0 :                                 __free_raid_bio(cur);
    1640           0 :                                 continue;
    1641             : 
    1642             :                         }
    1643           2 :                         __raid56_parity_write(last);
    1644             :                 }
    1645             :                 last = cur;
    1646             :         }
    1647           4 :         if (last) {
    1648           4 :                 __raid56_parity_write(last);
    1649             :         }
    1650           4 :         kfree(plug);
    1651           4 : }
    1652             : 
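run_plug relies on the sort to put writes belonging to the same full stripe
next to each other, so a single pass can merge neighbours before parity is
calculated.  A userspace model of that idea, assuming a two-data-stripe
layout with 64 KiB stripes and invented byte offsets (the real code sorts by
bi_sector and merges via rbio_can_merge):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define FULL_STRIPE_BYTES (2 * 64 * 1024)     /* assumed layout */

    static int cmp_start(const void *a, const void *b)
    {
            uint64_t sa = *(const uint64_t *)a, sb = *(const uint64_t *)b;
            return (sa > sb) - (sa < sb);
    }

    int main(void)
    {
            /* starting byte offsets of some queued partial writes */
            uint64_t start[] = { 393216, 131072, 196608, 262144 };
            size_t n = sizeof(start) / sizeof(start[0]);

            qsort(start, n, sizeof(start[0]), cmp_start);

            /* neighbours in the same full stripe are merge candidates */
            for (size_t i = 1; i < n; i++) {
                    int same = start[i] / FULL_STRIPE_BYTES ==
                               start[i - 1] / FULL_STRIPE_BYTES;
                    printf("%llu and %llu: %s\n",
                           (unsigned long long)start[i - 1],
                           (unsigned long long)start[i],
                           same ? "merge candidates" : "separate rbios");
            }
            return 0;
    }
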
    1653             : /*
    1654             :  * if the unplug comes from schedule, we have to push the
    1655             :  * work off to a helper thread
    1656             :  */
    1657           0 : static void unplug_work(struct btrfs_work *work)
    1658             : {
    1659             :         struct btrfs_plug_cb *plug;
    1660           0 :         plug = container_of(work, struct btrfs_plug_cb, work);
    1661           0 :         run_plug(plug);
    1662           0 : }
    1663             : 
    1664           4 : static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
    1665             : {
    1666             :         struct btrfs_plug_cb *plug;
    1667             :         plug = container_of(cb, struct btrfs_plug_cb, cb);
    1668             : 
    1669           4 :         if (from_schedule) {
    1670           0 :                 btrfs_init_work(&plug->work, btrfs_rmw_helper,
    1671             :                                 unplug_work, NULL, NULL);
    1672           0 :                 btrfs_queue_work(plug->info->rmw_workers,
    1673             :                                  &plug->work);
    1674           4 :                 return;
    1675             :         }
    1676           4 :         run_plug(plug);
    1677             : }
    1678             : 
    1679             : /*
    1680             :  * our main entry point for writes from the rest of the FS.
    1681             :  */
    1682          38 : int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
    1683             :                         struct btrfs_bio *bbio, u64 *raid_map,
    1684             :                         u64 stripe_len)
    1685             : {
    1686             :         struct btrfs_raid_bio *rbio;
    1687             :         struct btrfs_plug_cb *plug = NULL;
    1688             :         struct blk_plug_cb *cb;
    1689             : 
    1690          38 :         rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
    1691          38 :         if (IS_ERR(rbio))
    1692           0 :                 return PTR_ERR(rbio);
    1693             :         bio_list_add(&rbio->bio_list, bio);
    1694          38 :         rbio->bio_list_bytes = bio->bi_iter.bi_size;
    1695             : 
    1696             :         /*
    1697             :          * don't plug on full rbios, just get them out the door
    1698             :          * as quickly as we can
    1699             :          */
    1700          38 :         if (rbio_is_full(rbio))
    1701          32 :                 return full_stripe_write(rbio);
    1702             : 
    1703           6 :         cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info,
    1704             :                                sizeof(*plug));
    1705           6 :         if (cb) {
    1706             :                 plug = container_of(cb, struct btrfs_plug_cb, cb);
    1707           6 :                 if (!plug->info) {
    1708           4 :                         plug->info = root->fs_info;
    1709           4 :                         INIT_LIST_HEAD(&plug->rbio_list);
    1710             :                 }
    1711           6 :                 list_add_tail(&rbio->plug_list, &plug->rbio_list);
    1712             :         } else {
    1713           0 :                 return __raid56_parity_write(rbio);
    1714             :         }
    1715           6 :         return 0;
    1716             : }
    1717             : 
    1718             : /*
    1719             :  * all parity reconstruction happens here.  We've read in everything
    1720             :  * we can find from the drives and this does the heavy lifting of
    1721             :  * sorting the good from the bad.
    1722             :  */
    1723           0 : static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
    1724             : {
    1725             :         int pagenr, stripe;
    1726             :         void **pointers;
    1727             :         int faila = -1, failb = -1;
    1728           0 :         int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
    1729             :         struct page *page;
    1730             :         int err;
    1731             :         int i;
    1732             : 
    1733           0 :         pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *),
    1734             :                            GFP_NOFS);
    1735           0 :         if (!pointers) {
    1736             :                 err = -ENOMEM;
    1737             :                 goto cleanup_io;
    1738             :         }
    1739             : 
    1740           0 :         faila = rbio->faila;
    1741           0 :         failb = rbio->failb;
    1742             : 
    1743           0 :         if (rbio->read_rebuild) {
    1744             :                 spin_lock_irq(&rbio->bio_list_lock);
    1745             :                 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
    1746             :                 spin_unlock_irq(&rbio->bio_list_lock);
    1747             :         }
    1748             : 
    1749           0 :         index_rbio_pages(rbio);
    1750             : 
    1751           0 :         for (pagenr = 0; pagenr < nr_pages; pagenr++) {
    1752             :                 /* setup our array of pointers with pages
    1753             :                  * from each stripe
    1754             :                  */
    1755           0 :                 for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
    1756             :                         /*
    1757             :                          * if we're rebuilding a read, we have to use
    1758             :                          * pages from the bio list
    1759             :                          */
    1760           0 :                         if (rbio->read_rebuild &&
    1761           0 :                             (stripe == faila || stripe == failb)) {
    1762           0 :                                 page = page_in_rbio(rbio, stripe, pagenr, 0);
    1763             :                         } else {
    1764             :                                 page = rbio_stripe_page(rbio, stripe, pagenr);
    1765             :                         }
    1766           0 :                         pointers[stripe] = kmap(page);
    1767             :                 }
    1768             : 
    1769             :                 /* all raid6 handling here */
    1770           0 :                 if (rbio->raid_map[rbio->bbio->num_stripes - 1] ==
    1771             :                     RAID6_Q_STRIPE) {
    1772             : 
    1773             :                         /*
    1774             :                          * single failure, rebuild from parity raid5
    1775             :                          * style
    1776             :                          */
    1777           0 :                         if (failb < 0) {
    1778           0 :                                 if (faila == rbio->nr_data) {
    1779             :                                         /*
    1780             :                                          * Just the P stripe has failed, without
    1781             :                                          * a bad data or Q stripe.
    1782             :                                          * TODO, we should redo the xor here.
    1783             :                                          */
    1784             :                                         err = -EIO;
    1785             :                                         goto cleanup;
    1786             :                                 }
    1787             :                                 /*
    1788             :                                  * a single failure in raid6 is rebuilt
    1789             :                                  * in the pstripe code below
    1790             :                                  */
    1791             :                                 goto pstripe;
    1792             :                         }
    1793             : 
    1794             :                         /* make sure our ps and qs are in order */
    1795           0 :                         if (faila > failb) {
    1796             :                                 int tmp = failb;
    1797             :                                 failb = faila;
    1798             :                                 faila = tmp;
    1799             :                         }
    1800             : 
     1801             :                         /* if the q stripe has failed, do a pstripe reconstruction
    1802             :                          * from the xors.
     1803             :                          * If both the q stripe and the P stripe have failed, we're
    1804             :                          * here due to a crc mismatch and we can't give them the
    1805             :                          * data they want
    1806             :                          */
    1807           0 :                         if (rbio->raid_map[failb] == RAID6_Q_STRIPE) {
    1808           0 :                                 if (rbio->raid_map[faila] == RAID5_P_STRIPE) {
    1809             :                                         err = -EIO;
    1810             :                                         goto cleanup;
    1811             :                                 }
    1812             :                                 /*
    1813             :                                  * otherwise we have one bad data stripe and
    1814             :                                  * a good P stripe.  raid5!
    1815             :                                  */
    1816             :                                 goto pstripe;
    1817             :                         }
    1818             : 
    1819           0 :                         if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
    1820           0 :                                 raid6_datap_recov(rbio->bbio->num_stripes,
    1821             :                                                   PAGE_SIZE, faila, pointers);
    1822             :                         } else {
    1823           0 :                                 raid6_2data_recov(rbio->bbio->num_stripes,
    1824             :                                                   PAGE_SIZE, faila, failb,
    1825             :                                                   pointers);
    1826             :                         }
    1827             :                 } else {
    1828             :                         void *p;
    1829             : 
    1830             :                         /* rebuild from P stripe here (raid5 or raid6) */
    1831           0 :                         BUG_ON(failb != -1);
    1832             : pstripe:
    1833             :                         /* Copy parity block into failed block to start with */
    1834           0 :                         memcpy(pointers[faila],
    1835           0 :                                pointers[rbio->nr_data],
    1836             :                                PAGE_CACHE_SIZE);
    1837             : 
    1838             :                         /* rearrange the pointer array */
    1839           0 :                         p = pointers[faila];
    1840           0 :                         for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
    1841           0 :                                 pointers[stripe] = pointers[stripe + 1];
    1842           0 :                         pointers[rbio->nr_data - 1] = p;
    1843             : 
    1844             :                         /* xor in the rest */
    1845           0 :                         run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE);
    1846             :                 }
    1847             :                 /* if we're doing this rebuild as part of an rmw, go through
    1848             :                  * and set all of our private rbio pages in the
    1849             :                  * failed stripes as uptodate.  This way finish_rmw will
    1850             :                  * know they can be trusted.  If this was a read reconstruction,
    1851             :                  * other endio functions will fiddle the uptodate bits
    1852             :                  */
    1853           0 :                 if (!rbio->read_rebuild) {
    1854           0 :                         for (i = 0;  i < nr_pages; i++) {
    1855           0 :                                 if (faila != -1) {
    1856             :                                         page = rbio_stripe_page(rbio, faila, i);
    1857             :                                         SetPageUptodate(page);
    1858             :                                 }
    1859           0 :                                 if (failb != -1) {
    1860             :                                         page = rbio_stripe_page(rbio, failb, i);
    1861             :                                         SetPageUptodate(page);
    1862             :                                 }
    1863             :                         }
    1864             :                 }
    1865           0 :                 for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
    1866             :                         /*
    1867             :                          * if we're rebuilding a read, we have to use
    1868             :                          * pages from the bio list
    1869             :                          */
    1870           0 :                         if (rbio->read_rebuild &&
    1871           0 :                             (stripe == faila || stripe == failb)) {
    1872           0 :                                 page = page_in_rbio(rbio, stripe, pagenr, 0);
    1873             :                         } else {
    1874             :                                 page = rbio_stripe_page(rbio, stripe, pagenr);
    1875             :                         }
    1876             :                         kunmap(page);
    1877             :                 }
    1878             :         }
    1879             : 
    1880             :         err = 0;
    1881             : cleanup:
    1882           0 :         kfree(pointers);
    1883             : 
    1884             : cleanup_io:
    1885             : 
    1886           0 :         if (rbio->read_rebuild) {
    1887           0 :                 if (err == 0)
    1888           0 :                         cache_rbio_pages(rbio);
    1889             :                 else
    1890             :                         clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
    1891             : 
    1892           0 :                 rbio_orig_end_io(rbio, err, err == 0);
    1893           0 :         } else if (err == 0) {
    1894           0 :                 rbio->faila = -1;
    1895           0 :                 rbio->failb = -1;
    1896           0 :                 finish_rmw(rbio);
    1897             :         } else {
    1898           0 :                 rbio_orig_end_io(rbio, err, 0);
    1899             :         }
    1900           0 : }
    1901             : 
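The dispatch in __raid_recover_end_io boils down to: a lost data stripe with
P still available is rebuilt on the pstripe path by XOR (this covers RAID5
and the RAID6 data-only or data-plus-Q cases); two lost data stripes use
raid6_2data_recov; a lost data stripe plus a lost P stripe uses
raid6_datap_recov; a lost P alone, or P plus Q, ends in -EIO.  A minimal
sketch of the pstripe case with hypothetical one-byte "pages", showing that
XORing parity with the survivors reproduces the missing data:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            /* hypothetical 1-byte "pages": three data stripes and their parity */
            uint8_t d[3] = { 0xaa, 0x0f, 0x55 };
            uint8_t p = d[0] ^ d[1] ^ d[2];         /* 0xf0 */

            int faila = 1;                          /* pretend d[1] was lost */

            /* pstripe rebuild: start from parity, xor the survivors back in */
            uint8_t rebuilt = p;
            for (int i = 0; i < 3; i++)
                    if (i != faila)
                            rebuilt ^= d[i];

            printf("rebuilt=0x%02x expected=0x%02x\n", rebuilt, d[faila]);
            return 0;
    }
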
    1902             : /*
    1903             :  * This is called only for stripes we've read from disk to
    1904             :  * reconstruct the parity.
    1905             :  */
    1906           0 : static void raid_recover_end_io(struct bio *bio, int err)
    1907             : {
    1908           0 :         struct btrfs_raid_bio *rbio = bio->bi_private;
    1909             : 
    1910             :         /*
    1911             :          * we only read stripe pages off the disk, set them
    1912             :          * up to date if there were no errors
    1913             :          */
    1914           0 :         if (err)
    1915           0 :                 fail_bio_stripe(rbio, bio);
    1916             :         else
    1917             :                 set_bio_pages_uptodate(bio);
    1918           0 :         bio_put(bio);
    1919             : 
    1920           0 :         if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
    1921           0 :                 return;
    1922             : 
    1923           0 :         if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
    1924           0 :                 rbio_orig_end_io(rbio, -EIO, 0);
    1925             :         else
    1926           0 :                 __raid_recover_end_io(rbio);
    1927             : }
    1928             : 
    1929             : /*
    1930             :  * reads everything we need off the disk to reconstruct
    1931             :  * the parity. endio handlers trigger final reconstruction
    1932             :  * when the IO is done.
    1933             :  *
    1934             :  * This is used both for reads from the higher layers and for
     1935             :  * parity construction required to finish an rmw cycle.
    1936             :  */
    1937           0 : static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
    1938             : {
    1939             :         int bios_to_read = 0;
    1940           0 :         struct btrfs_bio *bbio = rbio->bbio;
    1941             :         struct bio_list bio_list;
    1942             :         int ret;
    1943           0 :         int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
    1944             :         int pagenr;
    1945             :         int stripe;
    1946             :         struct bio *bio;
    1947             : 
    1948             :         bio_list_init(&bio_list);
    1949             : 
    1950           0 :         ret = alloc_rbio_pages(rbio);
    1951           0 :         if (ret)
    1952             :                 goto cleanup;
    1953             : 
    1954           0 :         atomic_set(&rbio->bbio->error, 0);
    1955             : 
    1956             :         /*
    1957             :          * read everything that hasn't failed.  Thanks to the
    1958             :          * stripe cache, it is possible that some or all of these
    1959             :          * pages are going to be uptodate.
    1960             :          */
    1961           0 :         for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
    1962           0 :                 if (rbio->faila == stripe || rbio->failb == stripe) {
    1963           0 :                         atomic_inc(&rbio->bbio->error);
    1964           0 :                         continue;
    1965             :                 }
    1966             : 
    1967           0 :                 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
    1968             :                         struct page *p;
    1969             : 
    1970             :                         /*
    1971             :                          * the rmw code may have already read this
    1972             :                          * page in
    1973             :                          */
    1974             :                         p = rbio_stripe_page(rbio, stripe, pagenr);
    1975           0 :                         if (PageUptodate(p))
    1976           0 :                                 continue;
    1977             : 
    1978           0 :                         ret = rbio_add_io_page(rbio, &bio_list,
    1979             :                                        rbio_stripe_page(rbio, stripe, pagenr),
    1980           0 :                                        stripe, pagenr, rbio->stripe_len);
    1981           0 :                         if (ret < 0)
    1982             :                                 goto cleanup;
    1983             :                 }
    1984             :         }
    1985             : 
    1986           0 :         bios_to_read = bio_list_size(&bio_list);
    1987           0 :         if (!bios_to_read) {
    1988             :                 /*
    1989             :                  * we might have no bios to read just because the pages
    1990             :                  * were up to date, or we might have no bios to read because
    1991             :                  * the devices were gone.
    1992             :                  */
    1993           0 :                 if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) {
    1994           0 :                         __raid_recover_end_io(rbio);
    1995           0 :                         goto out;
    1996             :                 } else {
    1997             :                         goto cleanup;
    1998             :                 }
    1999             :         }
    2000             : 
    2001             :         /*
    2002             :          * the bbio may be freed once we submit the last bio.  Make sure
    2003             :          * not to touch it after that
    2004             :          */
    2005             :         atomic_set(&bbio->stripes_pending, bios_to_read);
    2006             :         while (1) {
    2007             :                 bio = bio_list_pop(&bio_list);
    2008           0 :                 if (!bio)
    2009             :                         break;
    2010             : 
    2011           0 :                 bio->bi_private = rbio;
    2012           0 :                 bio->bi_end_io = raid_recover_end_io;
    2013             : 
    2014           0 :                 btrfs_bio_wq_end_io(rbio->fs_info, bio,
    2015             :                                     BTRFS_WQ_ENDIO_RAID56);
    2016             : 
    2017           0 :                 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
    2018           0 :                 submit_bio(READ, bio);
    2019           0 :         }
    2020             : out:
    2021             :         return 0;
    2022             : 
    2023             : cleanup:
    2024           0 :         if (rbio->read_rebuild)
    2025           0 :                 rbio_orig_end_io(rbio, -EIO, 0);
    2026             :         return -EIO;
    2027             : }
    2028             : 
    2029             : /*
    2030             :  * the main entry point for reads from the higher layers.  This
    2031             :  * is really only called when the normal read path had a failure,
    2032             :  * so we assume the bio they send down corresponds to a failed part
    2033             :  * of the drive.
    2034             :  */
    2035           0 : int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
    2036             :                           struct btrfs_bio *bbio, u64 *raid_map,
    2037             :                           u64 stripe_len, int mirror_num)
    2038             : {
    2039           0 :         struct btrfs_raid_bio *rbio;
    2040             :         int ret;
    2041             : 
    2042           0 :         rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
    2043           0 :         if (IS_ERR(rbio))
    2044           0 :                 return PTR_ERR(rbio);
    2045             : 
    2046           0 :         rbio->read_rebuild = 1;
    2047             :         bio_list_add(&rbio->bio_list, bio);
    2048           0 :         rbio->bio_list_bytes = bio->bi_iter.bi_size;
    2049             : 
    2050           0 :         rbio->faila = find_logical_bio_stripe(rbio, bio);
    2051           0 :         if (rbio->faila == -1) {
    2052           0 :                 BUG();
    2053             :                 kfree(raid_map);
    2054             :                 kfree(bbio);
    2055             :                 kfree(rbio);
    2056             :                 return -EIO;
    2057             :         }
    2058             : 
    2059             :         /*
    2060             :          * reconstruct from the q stripe if they are
    2061             :          * asking for mirror 3
    2062             :          */
    2063           0 :         if (mirror_num == 3)
    2064           0 :                 rbio->failb = bbio->num_stripes - 2;
    2065             : 
    2066           0 :         ret = lock_stripe_add(rbio);
    2067             : 
    2068             :         /*
    2069             :          * __raid56_parity_recover will end the bio with
    2070             :          * any errors it hits.  We don't want to return
    2071             :          * its error value up the stack because our caller
    2072             :          * will end up calling bio_endio with any nonzero
    2073             :          * return
    2074             :          */
    2075           0 :         if (ret == 0)
    2076           0 :                 __raid56_parity_recover(rbio);
    2077             :         /*
    2078             :          * our rbio has been added to the list of
    2079             :          * rbios that will be handled after the
    2080             :          * currently lock owner is done
    2081             :          */
     2082             :          * current lock owner is done
    2083             : 
    2084             : }
    2085             : 
    2086           6 : static void rmw_work(struct btrfs_work *work)
    2087             : {
    2088             :         struct btrfs_raid_bio *rbio;
    2089             : 
    2090           6 :         rbio = container_of(work, struct btrfs_raid_bio, work);
    2091           6 :         raid56_rmw_stripe(rbio);
    2092           6 : }
    2093             : 
    2094           0 : static void read_rebuild_work(struct btrfs_work *work)
    2095             : {
    2096             :         struct btrfs_raid_bio *rbio;
    2097             : 
    2098           0 :         rbio = container_of(work, struct btrfs_raid_bio, work);
    2099           0 :         __raid56_parity_recover(rbio);
    2100           0 : }

Generated by: LCOV version 1.10