Line data Source code
1 : /*
2 : * Copyright (C) 2012 Fusion-io All rights reserved.
3 : * Copyright (C) 2012 Intel Corp. All rights reserved.
4 : *
5 : * This program is free software; you can redistribute it and/or
6 : * modify it under the terms of the GNU General Public
7 : * License v2 as published by the Free Software Foundation.
8 : *
9 : * This program is distributed in the hope that it will be useful,
10 : * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 : * General Public License for more details.
13 : *
14 : * You should have received a copy of the GNU General Public
15 : * License along with this program; if not, write to the
16 : * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
17 : * Boston, MA 02111-1307, USA.
18 : */
19 : #include <linux/sched.h>
20 : #include <linux/wait.h>
21 : #include <linux/bio.h>
22 : #include <linux/slab.h>
23 : #include <linux/buffer_head.h>
24 : #include <linux/blkdev.h>
25 : #include <linux/random.h>
26 : #include <linux/iocontext.h>
27 : #include <linux/capability.h>
28 : #include <linux/ratelimit.h>
29 : #include <linux/kthread.h>
30 : #include <linux/raid/pq.h>
31 : #include <linux/hash.h>
32 : #include <linux/list_sort.h>
33 : #include <linux/raid/xor.h>
34 : #include <linux/vmalloc.h>
35 : #include <asm/div64.h>
36 : #include "ctree.h"
37 : #include "extent_map.h"
38 : #include "disk-io.h"
39 : #include "transaction.h"
40 : #include "print-tree.h"
41 : #include "volumes.h"
42 : #include "raid56.h"
43 : #include "async-thread.h"
44 : #include "check-integrity.h"
45 : #include "rcu-string.h"
46 :
47 : /* set when additional merges to this rbio are not allowed */
48 : #define RBIO_RMW_LOCKED_BIT 1
49 :
50 : /*
51 : * set when this rbio is sitting in the hash, but it is just a cache
52 : * of past RMW
53 : */
54 : #define RBIO_CACHE_BIT 2
55 :
56 : /*
57 : * set when it is safe to trust the stripe_pages for caching
58 : */
59 : #define RBIO_CACHE_READY_BIT 3
60 :
61 :
62 : #define RBIO_CACHE_SIZE 1024
63 :
64 : struct btrfs_raid_bio {
65 : struct btrfs_fs_info *fs_info;
66 : struct btrfs_bio *bbio;
67 :
68 : /*
69 : * logical block numbers for the start of each stripe
70 : * The last one or two are p/q. These are sorted,
71 : * so raid_map[0] is the start of our full stripe
72 : */
73 : u64 *raid_map;
74 :
75 : /* while we're doing rmw on a stripe
76 : * we put it into a hash table so we can
77 : * lock the stripe and merge more rbios
78 : * into it.
79 : */
80 : struct list_head hash_list;
81 :
82 : /*
83 : * LRU list for the stripe cache
84 : */
85 : struct list_head stripe_cache;
86 :
87 : /*
88 : * for scheduling work in the helper threads
89 : */
90 : struct btrfs_work work;
91 :
92 : /*
93 : * bio list and bio_list_lock are used
94 : * to add more bios into the stripe
95 : * in hopes of avoiding the full rmw
96 : */
97 : struct bio_list bio_list;
98 : spinlock_t bio_list_lock;
99 :
100 : /* also protected by the bio_list_lock, the
101 : * plug list is used by the plugging code
102 : * to collect partial bios while plugged. The
103 : * stripe locking code also uses it to hand off
104 : * the stripe lock to the next pending IO
105 : */
106 : struct list_head plug_list;
107 :
108 : /*
109 : * flags that tell us if it is safe to
110 : * merge with this bio
111 : */
112 : unsigned long flags;
113 :
114 : /* size of each individual stripe on disk */
115 : int stripe_len;
116 :
117 : /* number of data stripes (no p/q) */
118 : int nr_data;
119 :
120 : /*
121 : * set if we're doing a parity rebuild
122 : * for a read from higher up, which is handled
123 : * differently from a parity rebuild as part of
124 : * rmw
125 : */
126 : int read_rebuild;
127 :
128 : /* first bad stripe */
129 : int faila;
130 :
131 : /* second bad stripe (for raid6 use) */
132 : int failb;
133 :
134 : /*
135 : * number of pages needed to represent the full
136 : * stripe
137 : */
138 : int nr_pages;
139 :
140 : /*
141 : * size of all the bios in the bio_list. This
142 : * helps us decide if the rbio maps to a full
143 : * stripe or not
144 : */
145 : int bio_list_bytes;
146 :
147 : atomic_t refs;
148 :
149 : /*
150 : * these are two arrays of pointers. We allocate the
151 : * rbio big enough to hold them both and set up their
152 : * locations when the rbio is allocated
153 : */
154 :
155 : /* pointers to pages that we allocated for
156 : * reading/writing stripes directly from the disk (including P/Q)
157 : */
158 : struct page **stripe_pages;
159 :
160 : /*
161 : * pointers to the pages in the bio_list. Stored
162 : * here for faster lookup
163 : */
164 : struct page **bio_pages;
165 : };
166 :
167 : static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
168 : static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
169 : static void rmw_work(struct btrfs_work *work);
170 : static void read_rebuild_work(struct btrfs_work *work);
171 : static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
172 : static void async_read_rebuild(struct btrfs_raid_bio *rbio);
173 : static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
174 : static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
175 : static void __free_raid_bio(struct btrfs_raid_bio *rbio);
176 : static void index_rbio_pages(struct btrfs_raid_bio *rbio);
177 : static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
178 :
179 : /*
180 : * the stripe hash table is used for locking, and to collect
181 : * bios in hopes of making a full stripe
182 : */
183 221 : int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
184 : {
185 : struct btrfs_stripe_hash_table *table;
186 : struct btrfs_stripe_hash_table *x;
187 : struct btrfs_stripe_hash *cur;
188 : struct btrfs_stripe_hash *h;
189 : int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
190 : int i;
191 : int table_size;
192 :
193 221 : if (info->stripe_hash_table)
194 : return 0;
195 :
196 : /*
197 : * The table is large; it starts at order 4 and can go as high as
198 : * order 7 when lock debugging is turned on.
199 : *
200 : * Try harder to allocate and fall back to vmalloc to lower the chance
201 : * of a failing mount.
202 : */
203 : table_size = sizeof(*table) + sizeof(*h) * num_entries;
204 221 : table = kzalloc(table_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
205 221 : if (!table) {
206 0 : table = vzalloc(table_size);
207 0 : if (!table)
208 : return -ENOMEM;
209 : }
210 :
211 221 : spin_lock_init(&table->cache_lock);
212 221 : INIT_LIST_HEAD(&table->stripe_cache);
213 :
214 221 : h = table->table;
215 :
216 452829 : for (i = 0; i < num_entries; i++) {
217 452608 : cur = h + i;
218 452608 : INIT_LIST_HEAD(&cur->hash_list);
219 452608 : spin_lock_init(&cur->lock);
220 452608 : init_waitqueue_head(&cur->wait);
221 : }
222 :
223 221 : x = cmpxchg(&info->stripe_hash_table, NULL, table);
224 221 : if (x) {
225 0 : if (is_vmalloc_addr(x))
226 0 : vfree(x);
227 : else
228 0 : kfree(x);
229 : }
230 : return 0;
231 : }
232 :
233 : /*
234 : * caching an rbio means to copy anything from the
235 : * bio_pages array into the stripe_pages array. We
236 : * use the page uptodate bit in the stripe cache array
237 : * to indicate if it has valid data
238 : *
239 : * once the caching is done, we set the cache ready
240 : * bit.
241 : */
242 6 : static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
243 : {
244 : int i;
245 : char *s;
246 : char *d;
247 : int ret;
248 :
249 6 : ret = alloc_rbio_pages(rbio);
250 6 : if (ret)
251 6 : return;
252 :
253 384 : for (i = 0; i < rbio->nr_pages; i++) {
254 384 : if (!rbio->bio_pages[i])
255 328 : continue;
256 :
257 : s = kmap(rbio->bio_pages[i]);
258 56 : d = kmap(rbio->stripe_pages[i]);
259 :
260 56 : memcpy(d, s, PAGE_CACHE_SIZE);
261 :
262 : kunmap(rbio->bio_pages[i]);
263 : kunmap(rbio->stripe_pages[i]);
264 56 : SetPageUptodate(rbio->stripe_pages[i]);
265 : }
266 : set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
267 : }
268 :
269 : /*
270 : * we hash on the first logical address of the stripe
271 : */
272 : static int rbio_bucket(struct btrfs_raid_bio *rbio)
273 : {
274 82 : u64 num = rbio->raid_map[0];
275 :
276 : /*
277 : * we shift down quite a bit. We're using byte
278 : * addressing, and most of the lower bits are zeros.
279 : * This tends to upset hash_64, and it consistently
280 : * returns just one or two different values.
281 : *
282 : * shifting off the lower bits fixes things.
283 : */
284 164 : return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
285 : }
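 :
 : /*
 : * Illustrative example (editor's note): with
 : * BTRFS_STRIPE_HASH_TABLE_BITS == 11 the table has 2048 buckets,
 : * which matches the 2048 iterations per call of the allocation
 : * loop in btrfs_alloc_stripe_hash_table above. A full stripe
 : * starting at logical byte 0x40000000 hashes as
 : * hash_64(0x40000000 >> 16, 11) == hash_64(0x4000, 11), and two
 : * full stripes 64K apart feed consecutive values to hash_64, so
 : * they spread across buckets instead of colliding on zero bits.
 : */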
286 :
287 : /*
288 : * stealing an rbio means taking all the uptodate pages from the stripe
289 : * array in the source rbio and putting them into the destination rbio
290 : */
291 2 : static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
292 : {
293 : int i;
294 : struct page *s;
295 : struct page *d;
296 :
297 2 : if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
298 2 : return;
299 :
300 128 : for (i = 0; i < dest->nr_pages; i++) {
301 128 : s = src->stripe_pages[i];
302 256 : if (!s || !PageUptodate(s)) {
303 0 : continue;
304 : }
305 :
306 128 : d = dest->stripe_pages[i];
307 128 : if (d)
308 0 : __free_page(d);
309 :
310 128 : dest->stripe_pages[i] = s;
311 128 : src->stripe_pages[i] = NULL;
312 : }
313 : }
314 :
315 : /*
316 : * merging means we take the bio_list from the victim and
317 : * splice it into the destination. The victim should
318 : * be discarded afterwards.
319 : *
320 : * must be called with dest->bio_list_lock held
321 : */
322 : static void merge_rbio(struct btrfs_raid_bio *dest,
323 : struct btrfs_raid_bio *victim)
324 : {
325 : bio_list_merge(&dest->bio_list, &victim->bio_list);
326 0 : dest->bio_list_bytes += victim->bio_list_bytes;
327 : bio_list_init(&victim->bio_list);
328 : }
329 :
330 : /*
331 : * used to prune items that are in the cache. The caller
332 : * must hold the hash table lock.
333 : */
334 6 : static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
335 : {
336 : int bucket = rbio_bucket(rbio);
337 : struct btrfs_stripe_hash_table *table;
338 : struct btrfs_stripe_hash *h;
339 : int freeit = 0;
340 :
341 : /*
342 : * check the bit again under the hash table lock.
343 : */
344 6 : if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
345 6 : return;
346 :
347 6 : table = rbio->fs_info->stripe_hash_table;
348 6 : h = table->table + bucket;
349 :
350 : /* hold the lock for the bucket because we may be
351 : * removing it from the hash table
352 : */
353 : spin_lock(&h->lock);
354 :
355 : /*
356 : * hold the lock for the bio list because we need
357 : * to make sure the bio list is empty
358 : */
359 : spin_lock(&rbio->bio_list_lock);
360 :
361 12 : if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
362 6 : list_del_init(&rbio->stripe_cache);
363 6 : table->cache_size -= 1;
364 : freeit = 1;
365 :
366 : /* if the bio list isn't empty, this rbio is
367 : * still involved in an IO. We take it out
368 : * of the cache list, and drop the ref that
369 : * was held for the list.
370 : *
371 : * If the bio_list was empty, we also remove
372 : * the rbio from the hash_table, and drop
373 : * the corresponding ref
374 : */
375 6 : if (bio_list_empty(&rbio->bio_list)) {
376 12 : if (!list_empty(&rbio->hash_list)) {
377 : list_del_init(&rbio->hash_list);
378 4 : atomic_dec(&rbio->refs);
379 8 : BUG_ON(!list_empty(&rbio->plug_list));
380 : }
381 : }
382 : }
383 :
384 : spin_unlock(&rbio->bio_list_lock);
385 : spin_unlock(&h->lock);
386 :
387 6 : if (freeit)
388 6 : __free_raid_bio(rbio);
389 : }
390 :
391 : /*
392 : * prune a given rbio from the cache
393 : */
394 34 : static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
395 : {
396 : struct btrfs_stripe_hash_table *table;
397 : unsigned long flags;
398 :
399 34 : if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
400 34 : return;
401 :
402 2 : table = rbio->fs_info->stripe_hash_table;
403 :
404 2 : spin_lock_irqsave(&table->cache_lock, flags);
405 2 : __remove_rbio_from_cache(rbio);
406 : spin_unlock_irqrestore(&table->cache_lock, flags);
407 : }
408 :
409 : /*
410 : * remove everything in the cache
411 : */
412 221 : static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
413 : {
414 : struct btrfs_stripe_hash_table *table;
415 : unsigned long flags;
416 : struct btrfs_raid_bio *rbio;
417 :
418 221 : table = info->stripe_hash_table;
419 :
420 221 : spin_lock_irqsave(&table->cache_lock, flags);
421 450 : while (!list_empty(&table->stripe_cache)) {
422 4 : rbio = list_entry(table->stripe_cache.next,
423 : struct btrfs_raid_bio,
424 : stripe_cache);
425 4 : __remove_rbio_from_cache(rbio);
426 : }
427 : spin_unlock_irqrestore(&table->cache_lock, flags);
428 221 : }
429 :
430 : /*
431 : * remove all cached entries and free the hash table
432 : * used by unmount
433 : */
434 221 : void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
435 : {
436 221 : if (!info->stripe_hash_table)
437 221 : return;
438 221 : btrfs_clear_rbio_cache(info);
439 442 : if (is_vmalloc_addr(info->stripe_hash_table))
440 0 : vfree(info->stripe_hash_table);
441 : else
442 221 : kfree(info->stripe_hash_table);
443 221 : info->stripe_hash_table = NULL;
444 : }
445 :
446 : /*
447 : * insert an rbio into the stripe cache. It
448 : * must have already been prepared by calling
449 : * cache_rbio_pages
450 : *
451 : * If this rbio was already cached, it gets
452 : * moved to the front of the lru.
453 : *
454 : * If the size of the rbio cache is too big, we
455 : * prune an item.
456 : */
457 38 : static void cache_rbio(struct btrfs_raid_bio *rbio)
458 : {
459 : struct btrfs_stripe_hash_table *table;
460 : unsigned long flags;
461 :
462 38 : if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
463 : return;
464 :
465 6 : table = rbio->fs_info->stripe_hash_table;
466 :
467 6 : spin_lock_irqsave(&table->cache_lock, flags);
468 : spin_lock(&rbio->bio_list_lock);
469 :
470 : /* bump our ref if we were not in the list before */
471 12 : if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
472 6 : atomic_inc(&rbio->refs);
473 :
474 12 : if (!list_empty(&rbio->stripe_cache)) {
475 0 : list_move(&rbio->stripe_cache, &table->stripe_cache);
476 : } else {
477 6 : list_add(&rbio->stripe_cache, &table->stripe_cache);
478 6 : table->cache_size += 1;
479 : }
480 :
481 : spin_unlock(&rbio->bio_list_lock);
482 :
483 6 : if (table->cache_size > RBIO_CACHE_SIZE) {
484 : struct btrfs_raid_bio *found;
485 :
486 0 : found = list_entry(table->stripe_cache.prev,
487 : struct btrfs_raid_bio,
488 : stripe_cache);
489 :
490 0 : if (found != rbio)
491 0 : __remove_rbio_from_cache(found);
492 : }
493 :
494 : spin_unlock_irqrestore(&table->cache_lock, flags);
495 : return;
496 : }
497 :
498 : /*
499 : * helper function to run the xor_blocks API. It is only
500 : * able to do MAX_XOR_BLOCKS at a time, so we need to
501 : * loop through.
502 : */
503 304 : static void run_xor(void **pages, int src_cnt, ssize_t len)
504 : {
505 : int src_off = 0;
506 : int xor_src_cnt = 0;
507 304 : void *dest = pages[src_cnt];
508 :
509 912 : while (src_cnt > 0) {
510 304 : xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
511 304 : xor_blocks(xor_src_cnt, len, dest, pages + src_off);
512 :
513 304 : src_cnt -= xor_src_cnt;
514 304 : src_off += xor_src_cnt;
515 : }
516 304 : }
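 :
 : /*
 : * Example (illustrative, assuming MAX_XOR_BLOCKS == 4 as defined
 : * in linux/raid/xor.h): run_xor(pages, 5, PAGE_SIZE) first calls
 : * xor_blocks(4, len, dest, pages) for pages[0..3], then
 : * xor_blocks(1, len, dest, pages + 4) for the remaining source,
 : * with dest == pages[5] accumulating the xor of all five.
 : */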
517 :
518 : /*
519 : * returns true if the bio list inside this rbio
520 : * covers an entire stripe (no rmw required).
521 : * Must be called with the bio list lock held, or
522 : * at a time when you know it is impossible to add
523 : * new bios into the list
524 : */
525 : static int __rbio_is_full(struct btrfs_raid_bio *rbio)
526 : {
527 88 : unsigned long size = rbio->bio_list_bytes;
528 : int ret = 1;
529 :
530 88 : if (size != rbio->nr_data * rbio->stripe_len)
531 : ret = 0;
532 :
533 88 : BUG_ON(size > rbio->nr_data * rbio->stripe_len);
534 : return ret;
535 : }
536 :
537 176 : static int rbio_is_full(struct btrfs_raid_bio *rbio)
538 : {
539 : unsigned long flags;
540 : int ret;
541 :
542 88 : spin_lock_irqsave(&rbio->bio_list_lock, flags);
543 : ret = __rbio_is_full(rbio);
544 : spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
545 88 : return ret;
546 : }
547 :
548 : /*
549 : * returns 1 if it is safe to merge two rbios together.
550 : * The merging is safe if the two rbios correspond to
551 : * the same stripe and if they are both going in the same
552 : * direction (read vs write), and if neither one is
553 : * locked for final IO
554 : *
555 : * The caller is responsible for locking such that
556 : * rmw_locked is safe to test
557 : */
558 2 : static int rbio_can_merge(struct btrfs_raid_bio *last,
559 : struct btrfs_raid_bio *cur)
560 : {
561 4 : if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
562 : test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
563 : return 0;
564 :
565 : /*
566 : * we can't merge with cached rbios, since the
567 : * idea is that when we merge the destination
568 : * rbio is going to run our IO for us. We can
569 : * steal from cached rbio's though, other functions
570 : * handle that.
571 : */
572 4 : if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
573 : test_bit(RBIO_CACHE_BIT, &cur->flags))
574 : return 0;
575 :
576 4 : if (last->raid_map[0] !=
577 2 : cur->raid_map[0])
578 : return 0;
579 :
580 : /* reads can't merge with writes */
581 0 : if (last->read_rebuild !=
582 0 : cur->read_rebuild) {
583 : return 0;
584 : }
585 :
586 0 : return 1;
587 : }
588 :
589 : /*
590 : * helper to index into the pstripe
591 : */
592 : static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
593 : {
594 608 : index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
595 608 : return rbio->stripe_pages[index];
596 : }
597 :
598 : /*
599 : * helper to index into the qstripe, returns null
600 : * if there is no qstripe
601 : */
602 : static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
603 : {
604 304 : if (rbio->nr_data + 1 == rbio->bbio->num_stripes)
605 : return NULL;
606 :
607 304 : index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
608 : PAGE_CACHE_SHIFT;
609 304 : return rbio->stripe_pages[index];
610 : }
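 :
 : /*
 : * Index math example (illustrative, 64K stripe_len and 4K pages):
 : * each stripe spans 16 pages, so with nr_data == 2 the
 : * stripe_pages array is laid out as [0..15] data0, [16..31] data1,
 : * [32..47] P, [48..63] Q, and rbio_qstripe_page(rbio, 3) returns
 : * stripe_pages[3 + 48] == stripe_pages[51].
 : */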
611 :
612 : /*
613 : * The first stripe in the table for a logical address
614 : * has the lock. rbios are added in one of three ways:
615 : *
616 : * 1) Nobody has the stripe locked yet. The rbio is given
617 : * the lock and 0 is returned. The caller must start the IO
618 : * themselves.
619 : *
620 : * 2) Someone has the stripe locked, but we're able to merge
621 : * with the lock owner. The rbio is freed and the IO will
622 : * start automatically along with the existing rbio. 1 is returned.
623 : *
624 : * 3) Someone has the stripe locked, but we're not able to merge.
625 : * The rbio is added to the lock owner's plug list, or merged into
626 : * an rbio already on the plug list. When the lock owner unlocks,
627 : * the next rbio on the list is run and the IO is started automatically.
628 : * 1 is returned
629 : *
630 : * If we return 0, the caller still owns the rbio and must continue with
631 : * IO submission. If we return 1, the caller must assume the rbio has
632 : * already been freed.
633 : */
634 38 : static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
635 : {
636 : int bucket = rbio_bucket(rbio);
637 38 : struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
638 : struct btrfs_raid_bio *cur;
639 : struct btrfs_raid_bio *pending;
640 : unsigned long flags;
641 76 : DEFINE_WAIT(wait);
642 : struct btrfs_raid_bio *freeit = NULL;
643 : struct btrfs_raid_bio *cache_drop = NULL;
644 : int ret = 0;
645 : int walk = 0;
646 :
647 38 : spin_lock_irqsave(&h->lock, flags);
648 38 : list_for_each_entry(cur, &h->hash_list, hash_list) {
649 : walk++;
650 2 : if (cur->raid_map[0] == rbio->raid_map[0]) {
651 : spin_lock(&cur->bio_list_lock);
652 :
653 : /* can we steal this cached rbio's pages? */
654 4 : if (bio_list_empty(&cur->bio_list) &&
655 4 : list_empty(&cur->plug_list) &&
656 2 : test_bit(RBIO_CACHE_BIT, &cur->flags) &&
657 : !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
658 : list_del_init(&cur->hash_list);
659 2 : atomic_dec(&cur->refs);
660 :
661 2 : steal_rbio(cur, rbio);
662 : cache_drop = cur;
663 : spin_unlock(&cur->bio_list_lock);
664 :
665 : goto lockit;
666 : }
667 :
668 : /* can we merge into the lock owner? */
669 0 : if (rbio_can_merge(cur, rbio)) {
670 : merge_rbio(cur, rbio);
671 : spin_unlock(&cur->bio_list_lock);
672 : freeit = rbio;
673 : ret = 1;
674 0 : goto out;
675 : }
676 :
677 :
678 : /*
679 : * we couldn't merge with the running
680 : * rbio, see if we can merge with the
681 : * pending ones. We don't have to
682 : * check for rmw_locked because there
683 : * is no way they are inside finish_rmw
684 : * right now
685 : */
686 0 : list_for_each_entry(pending, &cur->plug_list,
687 : plug_list) {
688 0 : if (rbio_can_merge(pending, rbio)) {
689 : merge_rbio(pending, rbio);
690 : spin_unlock(&cur->bio_list_lock);
691 : freeit = rbio;
692 : ret = 1;
693 0 : goto out;
694 : }
695 : }
696 :
697 : /* no merging, put us on the tail of the plug list,
698 : * our rbio will be started when the currently
699 : * running rbio unlocks
700 : */
701 0 : list_add_tail(&rbio->plug_list, &cur->plug_list);
702 : spin_unlock(&cur->bio_list_lock);
703 : ret = 1;
704 0 : goto out;
705 : }
706 : }
707 : lockit:
708 38 : atomic_inc(&rbio->refs);
709 38 : list_add(&rbio->hash_list, &h->hash_list);
710 : out:
711 : spin_unlock_irqrestore(&h->lock, flags);
712 38 : if (cache_drop)
713 2 : remove_rbio_from_cache(cache_drop);
714 38 : if (freeit)
715 0 : __free_raid_bio(freeit);
716 38 : return ret;
717 : }
718 :
719 : /*
720 : * called as rmw or parity rebuild is completed. If the plug list has more
721 : * rbios waiting for this stripe, the next one on the list will be started
722 : */
723 38 : static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
724 : {
725 : int bucket;
726 : struct btrfs_stripe_hash *h;
727 : unsigned long flags;
728 : int keep_cache = 0;
729 :
730 : bucket = rbio_bucket(rbio);
731 38 : h = rbio->fs_info->stripe_hash_table->table + bucket;
732 :
733 76 : if (list_empty(&rbio->plug_list))
734 38 : cache_rbio(rbio);
735 :
736 38 : spin_lock_irqsave(&h->lock, flags);
737 : spin_lock(&rbio->bio_list_lock);
738 :
739 76 : if (!list_empty(&rbio->hash_list)) {
740 : /*
741 : * if we're still cached and there is no other IO
742 : * to perform, just leave this rbio here for others
743 : * to steal from later
744 : */
745 76 : if (list_empty(&rbio->plug_list) &&
746 : test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
747 : keep_cache = 1;
748 : clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
749 12 : BUG_ON(!bio_list_empty(&rbio->bio_list));
750 : goto done;
751 : }
752 :
753 : list_del_init(&rbio->hash_list);
754 32 : atomic_dec(&rbio->refs);
755 :
756 : /*
757 : * we use the plug list to hold all the rbios
758 : * waiting for the chance to lock this stripe.
759 : * hand the lock over to one of them.
760 : */
761 32 : if (!list_empty(&rbio->plug_list)) {
762 : struct btrfs_raid_bio *next;
763 : struct list_head *head = rbio->plug_list.next;
764 :
765 0 : next = list_entry(head, struct btrfs_raid_bio,
766 : plug_list);
767 :
768 : list_del_init(&rbio->plug_list);
769 :
770 0 : list_add(&next->hash_list, &h->hash_list);
771 0 : atomic_inc(&next->refs);
772 : spin_unlock(&rbio->bio_list_lock);
773 : spin_unlock_irqrestore(&h->lock, flags);
774 :
775 0 : if (next->read_rebuild)
776 0 : async_read_rebuild(next);
777 : else {
778 0 : steal_rbio(rbio, next);
779 0 : async_rmw_stripe(next);
780 : }
781 :
782 : goto done_nolock;
783 32 : } else if (waitqueue_active(&h->wait)) {
784 : spin_unlock(&rbio->bio_list_lock);
785 : spin_unlock_irqrestore(&h->lock, flags);
786 0 : wake_up(&h->wait);
787 0 : goto done_nolock;
788 : }
789 : }
790 : done:
791 : spin_unlock(&rbio->bio_list_lock);
792 : spin_unlock_irqrestore(&h->lock, flags);
793 :
794 : done_nolock:
795 38 : if (!keep_cache)
796 32 : remove_rbio_from_cache(rbio);
797 38 : }
798 :
799 44 : static void __free_raid_bio(struct btrfs_raid_bio *rbio)
800 : {
801 : int i;
802 :
803 44 : WARN_ON(atomic_read(&rbio->refs) < 0);
804 88 : if (!atomic_dec_and_test(&rbio->refs))
805 44 : return;
806 :
807 76 : WARN_ON(!list_empty(&rbio->stripe_cache));
808 76 : WARN_ON(!list_empty(&rbio->hash_list));
809 76 : WARN_ON(!bio_list_empty(&rbio->bio_list));
810 :
811 2432 : for (i = 0; i < rbio->nr_pages; i++) {
812 2432 : if (rbio->stripe_pages[i]) {
813 1024 : __free_page(rbio->stripe_pages[i]);
814 1024 : rbio->stripe_pages[i] = NULL;
815 : }
816 : }
817 38 : kfree(rbio->raid_map);
818 38 : kfree(rbio->bbio);
819 38 : kfree(rbio);
820 : }
821 :
822 : static void free_raid_bio(struct btrfs_raid_bio *rbio)
823 : {
824 38 : unlock_stripe(rbio);
825 38 : __free_raid_bio(rbio);
826 : }
827 :
828 : /*
829 : * this frees the rbio and runs through all the bios in the
830 : * bio_list and calls end_io on them
831 : */
832 38 : static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
833 : {
834 : struct bio *cur = bio_list_get(&rbio->bio_list);
835 : struct bio *next;
836 : free_raid_bio(rbio);
837 :
838 76 : while (cur) {
839 38 : next = cur->bi_next;
840 38 : cur->bi_next = NULL;
841 38 : if (uptodate)
842 : set_bit(BIO_UPTODATE, &cur->bi_flags);
843 38 : bio_endio(cur, err);
844 : cur = next;
845 : }
846 38 : }
847 :
848 : /*
849 : * end io function used by finish_rmw. When we finally
850 : * get here, we've written a full stripe
851 : */
852 145 : static void raid_write_end_io(struct bio *bio, int err)
853 : {
854 145 : struct btrfs_raid_bio *rbio = bio->bi_private;
855 :
856 145 : if (err)
857 0 : fail_bio_stripe(rbio, bio);
858 :
859 145 : bio_put(bio);
860 :
861 290 : if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
862 : return;
863 :
864 : err = 0;
865 :
866 : /* OK, we have written all the stripes we need to. */
867 76 : if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
868 : err = -EIO;
869 :
870 38 : rbio_orig_end_io(rbio, err, 0);
871 38 : return;
872 : }
873 :
874 : /*
875 : * the read/modify/write code wants to use the original bio for
876 : * any pages it included, and then use the rbio for everything
877 : * else. This function decides if a given index (stripe number)
878 : * and page number in that stripe fall inside the original bio
879 : * or the rbio.
880 : *
881 : * if you set bio_list_only, you'll get a NULL back for any ranges
882 : * that are outside the bio_list
883 : *
884 : * This doesn't take any refs on anything; you get a bare page pointer
885 : * and the caller must bump refs as required.
886 : *
887 : * You must call index_rbio_pages once before you can trust
888 : * the answers from this function.
889 : */
890 5712 : static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
891 : int index, int pagenr, int bio_list_only)
892 : {
893 : int chunk_page;
894 : struct page *p = NULL;
895 :
896 5712 : chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
897 :
898 : spin_lock_irq(&rbio->bio_list_lock);
899 5712 : p = rbio->bio_pages[chunk_page];
900 : spin_unlock_irq(&rbio->bio_list_lock);
901 :
902 5712 : if (p || bio_list_only)
903 : return p;
904 :
905 1280 : return rbio->stripe_pages[chunk_page];
906 : }
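 :
 : /*
 : * Example (illustrative): with 16 pages per stripe, stripe index 1
 : * and page number 3 give chunk_page == 1 * 16 + 3 == 19. If the
 : * bio list covered that page, bio_pages[19] is returned; otherwise
 : * (unless bio_list_only was set) we fall back to stripe_pages[19].
 : */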
907 :
908 : /*
909 : * number of pages we need for the entire stripe across all the
910 : * drives
911 : */
912 : static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
913 : {
914 38 : unsigned long nr = stripe_len * nr_stripes;
915 38 : return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
916 : }
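 :
 : /*
 : * Example (illustrative): a 4-drive full stripe (2 data + P + Q)
 : * with a 64K stripe_len needs 4 * 64K / 4K == 64 pages, which
 : * matches the 64 iterations per rbio of the nr_pages loops
 : * elsewhere in this file.
 : */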
917 :
918 : /*
919 : * allocation and initial setup for the btrfs_raid_bio. Note that
920 : * this does not allocate any pages for rbio->stripe_pages.
921 : */
922 38 : static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
923 : struct btrfs_bio *bbio, u64 *raid_map,
924 : u64 stripe_len)
925 : {
926 : struct btrfs_raid_bio *rbio;
927 : int nr_data = 0;
928 76 : int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes);
929 : void *p;
930 :
931 38 : rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2,
932 : GFP_NOFS);
933 38 : if (!rbio) {
934 0 : kfree(raid_map);
935 0 : kfree(bbio);
936 : return ERR_PTR(-ENOMEM);
937 : }
938 :
939 : bio_list_init(&rbio->bio_list);
940 38 : INIT_LIST_HEAD(&rbio->plug_list);
941 38 : spin_lock_init(&rbio->bio_list_lock);
942 38 : INIT_LIST_HEAD(&rbio->stripe_cache);
943 38 : INIT_LIST_HEAD(&rbio->hash_list);
944 38 : rbio->bbio = bbio;
945 38 : rbio->raid_map = raid_map;
946 38 : rbio->fs_info = root->fs_info;
947 38 : rbio->stripe_len = stripe_len;
948 38 : rbio->nr_pages = num_pages;
949 38 : rbio->faila = -1;
950 38 : rbio->failb = -1;
951 : atomic_set(&rbio->refs, 1);
952 :
953 : /*
954 : * the stripe_pages and bio_pages array point to the extra
955 : * memory we allocated past the end of the rbio
956 : */
957 38 : p = rbio + 1;
958 38 : rbio->stripe_pages = p;
959 38 : rbio->bio_pages = p + sizeof(struct page *) * num_pages;
960 :
961 38 : if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
962 19 : nr_data = bbio->num_stripes - 2;
963 : else
964 19 : nr_data = bbio->num_stripes - 1;
965 :
966 38 : rbio->nr_data = nr_data;
967 : return rbio;
968 : }
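 :
 : /*
 : * Layout of the single kzalloc above (editor's sketch):
 : *
 : *   [ struct btrfs_raid_bio | stripe_pages[num_pages] | bio_pages[num_pages] ]
 : *
 : * p = rbio + 1 points just past the struct, so both page-pointer
 : * arrays live in the same allocation and are freed along with the
 : * rbio itself in __free_raid_bio.
 : */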
969 :
970 : /* allocate pages for all the stripes in the bio, including parity */
971 12 : static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
972 : {
973 : int i;
974 : struct page *page;
975 :
976 768 : for (i = 0; i < rbio->nr_pages; i++) {
977 768 : if (rbio->stripe_pages[i])
978 512 : continue;
979 : page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
980 246 : if (!page)
981 : return -ENOMEM;
982 247 : rbio->stripe_pages[i] = page;
983 : ClearPageUptodate(page);
984 : }
985 : return 0;
986 : }
987 :
988 : /* allocate pages for just the p/q stripes */
989 32 : static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
990 : {
991 : int i;
992 : struct page *page;
993 :
994 32 : i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
995 :
996 800 : for (; i < rbio->nr_pages; i++) {
997 768 : if (rbio->stripe_pages[i])
998 0 : continue;
999 : page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
1000 768 : if (!page)
1001 : return -ENOMEM;
1002 768 : rbio->stripe_pages[i] = page;
1003 : }
1004 : return 0;
1005 : }
1006 :
1007 : /*
1008 : * add a single page from a specific stripe into our list of bios for IO.
1009 : * This will try to merge into existing bios if possible, and returns
1010 : * zero if all went well.
1011 : */
1012 2372 : static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
1013 : struct bio_list *bio_list,
1014 : struct page *page,
1015 : int stripe_nr,
1016 : unsigned long page_index,
1017 : unsigned long bio_max_len)
1018 : {
1019 2372 : struct bio *last = bio_list->tail;
1020 : u64 last_end = 0;
1021 : int ret;
1022 : struct bio *bio;
1023 : struct btrfs_bio_stripe *stripe;
1024 : u64 disk_start;
1025 :
1026 2372 : stripe = &rbio->bbio->stripes[stripe_nr];
1027 2372 : disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT);
1028 :
1029 : /* if the device is missing, just fail this stripe */
1030 2372 : if (!stripe->dev->bdev)
1031 0 : return fail_rbio_index(rbio, stripe_nr);
1032 :
1033 : /* see if we can add this page onto our existing bio */
1034 2372 : if (last) {
1035 2330 : last_end = (u64)last->bi_iter.bi_sector << 9;
1036 2330 : last_end += last->bi_iter.bi_size;
1037 :
1038 : /*
1039 : * we can't merge these if they are from different
1040 : * devices or if they are not contiguous
1041 : */
1042 4548 : if (last_end == disk_start && stripe->dev->bdev &&
1043 2218 : test_bit(BIO_UPTODATE, &last->bi_flags) &&
1044 2218 : last->bi_bdev == stripe->dev->bdev) {
1045 2217 : ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
1046 2217 : if (ret == PAGE_CACHE_SIZE)
1047 : return 0;
1048 : }
1049 : }
1050 :
1051 : /* put a new bio on the list */
1052 155 : bio = btrfs_io_bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT ?: 1);
1053 155 : if (!bio)
1054 : return -ENOMEM;
1055 :
1056 155 : bio->bi_iter.bi_size = 0;
1057 155 : bio->bi_bdev = stripe->dev->bdev;
1058 155 : bio->bi_iter.bi_sector = disk_start >> 9;
1059 : set_bit(BIO_UPTODATE, &bio->bi_flags);
1060 :
1061 155 : bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
1062 : bio_list_add(bio_list, bio);
1063 155 : return 0;
1064 : }
1065 :
1066 : /*
1067 : * while we're doing the read/modify/write cycle, we could
1068 : * have errors in reading pages off the disk. This checks
1069 : * for errors and if we're not able to read the page it'll
1070 : * trigger parity reconstruction. The rmw will be finished
1071 : * after we've reconstructed the failed stripes
1072 : */
1073 6 : static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
1074 : {
1075 6 : if (rbio->faila >= 0 || rbio->failb >= 0) {
1076 0 : BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1);
1077 0 : __raid56_parity_recover(rbio);
1078 : } else {
1079 6 : finish_rmw(rbio);
1080 : }
1081 6 : }
1082 :
1083 : /*
1084 : * these are just the pages from the rbio array, not from anything
1085 : * the FS sent down to us
1086 : */
1087 : static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page)
1088 : {
1089 : int index;
1090 1096 : index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT);
1091 1096 : index += page;
1092 1096 : return rbio->stripe_pages[index];
1093 : }
1094 :
1095 : /*
1096 : * helper function to walk our bio list and populate the bio_pages array with
1097 : * the result. This seems expensive, but it is faster than constantly
1098 : * searching through the bio list as we set up the IO in finish_rmw or stripe
1099 : * reconstruction.
1100 : *
1101 : * This must be called before you trust the answers from page_in_rbio
1102 : */
1103 44 : static void index_rbio_pages(struct btrfs_raid_bio *rbio)
1104 : {
1105 : struct bio *bio;
1106 : u64 start;
1107 : unsigned long stripe_offset;
1108 : unsigned long page_index;
1109 : struct page *p;
1110 : int i;
1111 :
1112 : spin_lock_irq(&rbio->bio_list_lock);
1113 88 : bio_list_for_each(bio, &rbio->bio_list) {
1114 44 : start = (u64)bio->bi_iter.bi_sector << 9;
1115 44 : stripe_offset = start - rbio->raid_map[0];
1116 44 : page_index = stripe_offset >> PAGE_CACHE_SHIFT;
1117 :
1118 1436 : for (i = 0; i < bio->bi_vcnt; i++) {
1119 1392 : p = bio->bi_io_vec[i].bv_page;
1120 1392 : rbio->bio_pages[page_index + i] = p;
1121 : }
1122 : }
1123 : spin_unlock_irq(&rbio->bio_list_lock);
1124 44 : }
1125 :
1126 : /*
1127 : * this is called from one of two situations. We either
1128 : * have a full stripe from the higher layers, or we've read all
1129 : * the missing bits off disk.
1130 : *
1131 : * This will calculate the parity and then send down any
1132 : * changed blocks.
1133 : */
1134 1558 : static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1135 : {
1136 38 : struct btrfs_bio *bbio = rbio->bbio;
1137 38 : void *pointers[bbio->num_stripes];
1138 38 : int stripe_len = rbio->stripe_len;
1139 38 : int nr_data = rbio->nr_data;
1140 : int stripe;
1141 : int pagenr;
1142 : int p_stripe = -1;
1143 : int q_stripe = -1;
1144 : struct bio_list bio_list;
1145 : struct bio *bio;
1146 38 : int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT;
1147 : int ret;
1148 :
1149 : bio_list_init(&bio_list);
1150 :
1151 38 : if (bbio->num_stripes - rbio->nr_data == 1) {
1152 : p_stripe = bbio->num_stripes - 1;
1153 19 : } else if (bbio->num_stripes - rbio->nr_data == 2) {
1154 : p_stripe = bbio->num_stripes - 2;
1155 19 : q_stripe = bbio->num_stripes - 1;
1156 : } else {
1157 0 : BUG();
1158 : }
1159 :
1160 : /* at this point we either have a full stripe,
1161 : * or we've read the full stripe from the drive.
1162 : * recalculate the parity and write the new results.
1163 : *
1164 : * We're not allowed to add any new bios to the
1165 : * bio list here, anyone else that wants to
1166 : * change this stripe needs to do their own rmw.
1167 : */
1168 : spin_lock_irq(&rbio->bio_list_lock);
1169 : set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1170 : spin_unlock_irq(&rbio->bio_list_lock);
1171 :
1172 38 : atomic_set(&rbio->bbio->error, 0);
1173 :
1174 : /*
1175 : * now that we've set rmw_locked, run through the
1176 : * bio list one last time and map the page pointers
1177 : *
1178 : * We don't cache full rbios because we're assuming
1179 : * the higher layers are unlikely to use this area of
1180 : * the disk again soon. If they do use it again,
1181 : * hopefully they will send another full bio.
1182 : */
1183 38 : index_rbio_pages(rbio);
1184 38 : if (!rbio_is_full(rbio))
1185 6 : cache_rbio_pages(rbio);
1186 : else
1187 : clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
1188 :
1189 608 : for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
1190 : struct page *p;
1191 : /* first collect one page from each data stripe */
1192 1520 : for (stripe = 0; stripe < nr_data; stripe++) {
1193 1520 : p = page_in_rbio(rbio, stripe, pagenr, 0);
1194 1520 : pointers[stripe] = kmap(p);
1195 : }
1196 :
1197 : /* then add the parity stripe */
1198 : p = rbio_pstripe_page(rbio, pagenr);
1199 : SetPageUptodate(p);
1200 1216 : pointers[stripe++] = kmap(p);
1201 :
1202 608 : if (q_stripe != -1) {
1203 :
1204 : /*
1205 : * raid6, add the qstripe and call the
1206 : * library function to fill in our p/q
1207 : */
1208 : p = rbio_qstripe_page(rbio, pagenr);
1209 : SetPageUptodate(p);
1210 304 : pointers[stripe++] = kmap(p);
1211 :
1212 304 : raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE,
1213 : pointers);
1214 : } else {
1215 : /* raid5 */
1216 304 : memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
1217 304 : run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
1218 : }
1219 :
1220 :
1221 2432 : for (stripe = 0; stripe < bbio->num_stripes; stripe++)
1222 2432 : kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
1223 : }
1224 :
1225 : /*
1226 : * time to start writing. Make bios for everything from the
1227 : * higher layers (the bio_list in our rbio) and our p/q. Ignore
1228 : * everything else.
1229 : */
1230 152 : for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
1231 2432 : for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
1232 : struct page *page;
1233 2432 : if (stripe < rbio->nr_data) {
1234 1520 : page = page_in_rbio(rbio, stripe, pagenr, 1);
1235 1520 : if (!page)
1236 184 : continue;
1237 : } else {
1238 : page = rbio_stripe_page(rbio, stripe, pagenr);
1239 : }
1240 :
1241 2248 : ret = rbio_add_io_page(rbio, &bio_list,
1242 2248 : page, stripe, pagenr, rbio->stripe_len);
1243 2248 : if (ret)
1244 : goto cleanup;
1245 : }
1246 : }
1247 :
1248 76 : atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list));
1249 38 : BUG_ON(atomic_read(&bbio->stripes_pending) == 0);
1250 :
1251 : while (1) {
1252 : bio = bio_list_pop(&bio_list);
1253 183 : if (!bio)
1254 : break;
1255 :
1256 145 : bio->bi_private = rbio;
1257 145 : bio->bi_end_io = raid_write_end_io;
1258 145 : BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
1259 145 : submit_bio(WRITE, bio);
1260 145 : }
1261 38 : return;
1262 :
1263 : cleanup:
1264 0 : rbio_orig_end_io(rbio, -EIO, 0);
1265 : }
1266 :
1267 : /*
1268 : * helper to find the stripe number for a given bio. Used to figure out which
1269 : * stripe has failed. This expects the bio to correspond to a physical disk,
1270 : * so it looks up based on physical sector numbers.
1271 : */
1272 : static int find_bio_stripe(struct btrfs_raid_bio *rbio,
1273 : struct bio *bio)
1274 : {
1275 : u64 physical = bio->bi_iter.bi_sector;
1276 : u64 stripe_start;
1277 : int i;
1278 : struct btrfs_bio_stripe *stripe;
1279 :
1280 0 : physical <<= 9;
1281 :
1282 0 : for (i = 0; i < rbio->bbio->num_stripes; i++) {
1283 0 : stripe = &rbio->bbio->stripes[i];
1284 0 : stripe_start = stripe->physical;
1285 0 : if (physical >= stripe_start &&
1286 0 : physical < stripe_start + rbio->stripe_len) {
1287 : return i;
1288 : }
1289 : }
1290 : return -1;
1291 : }
1292 :
1293 : /*
1294 : * helper to find the stripe number for a given
1295 : * bio (before mapping). Used to figure out which stripe has
1296 : * failed. This looks up based on logical block numbers.
1297 : */
1298 : static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
1299 : struct bio *bio)
1300 : {
1301 : u64 logical = bio->bi_iter.bi_sector;
1302 : u64 stripe_start;
1303 : int i;
1304 :
1305 0 : logical <<= 9;
1306 :
1307 0 : for (i = 0; i < rbio->nr_data; i++) {
1308 0 : stripe_start = rbio->raid_map[i];
1309 0 : if (logical >= stripe_start &&
1310 0 : logical < stripe_start + rbio->stripe_len) {
1311 : return i;
1312 : }
1313 : }
1314 : return -1;
1315 : }
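 :
 : /*
 : * Example (illustrative): if raid_map[] is { X, X + 64K, P, Q }
 : * for a 64K stripe_len, a bio whose logical start falls inside
 : * [X + 64K, X + 128K) maps to data stripe index 1. Parity stripes
 : * are never returned here because only the first nr_data entries
 : * are scanned.
 : */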
1316 :
1317 : /*
1318 : * returns -EIO if we had too many failures
1319 : */
1320 0 : static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
1321 : {
1322 : unsigned long flags;
1323 : int ret = 0;
1324 :
1325 0 : spin_lock_irqsave(&rbio->bio_list_lock, flags);
1326 :
1327 : /* we already know this stripe is bad, move on */
1328 0 : if (rbio->faila == failed || rbio->failb == failed)
1329 : goto out;
1330 :
1331 0 : if (rbio->faila == -1) {
1332 : /* first failure on this rbio */
1333 0 : rbio->faila = failed;
1334 0 : atomic_inc(&rbio->bbio->error);
1335 0 : } else if (rbio->failb == -1) {
1336 : /* second failure on this rbio */
1337 0 : rbio->failb = failed;
1338 0 : atomic_inc(&rbio->bbio->error);
1339 : } else {
1340 : ret = -EIO;
1341 : }
1342 : out:
1343 : spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
1344 :
1345 0 : return ret;
1346 : }
1347 :
1348 : /*
1349 : * helper to fail a stripe based on a physical disk
1350 : * bio.
1351 : */
1352 0 : static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
1353 0 : struct bio *bio)
1354 : {
1355 : int failed = find_bio_stripe(rbio, bio);
1356 :
1357 0 : if (failed < 0)
1358 : return -EIO;
1359 :
1360 0 : return fail_rbio_index(rbio, failed);
1361 : }
1362 :
1363 : /*
1364 : * this sets each page in the bio uptodate. It should only be used on private
1365 : * rbio pages, nothing that comes in from the higher layers
1366 : */
1367 : static void set_bio_pages_uptodate(struct bio *bio)
1368 : {
1369 : int i;
1370 : struct page *p;
1371 :
1372 124 : for (i = 0; i < bio->bi_vcnt; i++) {
1373 124 : p = bio->bi_io_vec[i].bv_page;
1374 : SetPageUptodate(p);
1375 : }
1376 : }
1377 :
1378 : /*
1379 : * end io for the read phase of the rmw cycle. All the bios here are physical
1380 : * stripe bios we've read from the disk so we can recalculate the parity of the
1381 : * stripe.
1382 : *
1383 : * This will usually kick off finish_rmw once all the bios are read in, but it
1384 : * may trigger parity reconstruction if we had any errors along the way
1385 : */
1386 10 : static void raid_rmw_end_io(struct bio *bio, int err)
1387 : {
1388 10 : struct btrfs_raid_bio *rbio = bio->bi_private;
1389 :
1390 10 : if (err)
1391 0 : fail_bio_stripe(rbio, bio);
1392 : else
1393 : set_bio_pages_uptodate(bio);
1394 :
1395 10 : bio_put(bio);
1396 :
1397 20 : if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
1398 : return;
1399 :
1400 : err = 0;
1401 8 : if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
1402 : goto cleanup;
1403 :
1404 : /*
1405 : * this will normally call finish_rmw to start our write
1406 : * but if there are any failed stripes we'll reconstruct
1407 : * from parity first
1408 : */
1409 4 : validate_rbio_for_rmw(rbio);
1410 4 : return;
1411 :
1412 : cleanup:
1413 :
1414 0 : rbio_orig_end_io(rbio, -EIO, 0);
1415 : }
1416 :
1417 6 : static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
1418 : {
1419 6 : btrfs_init_work(&rbio->work, btrfs_rmw_helper,
1420 : rmw_work, NULL, NULL);
1421 :
1422 6 : btrfs_queue_work(rbio->fs_info->rmw_workers,
1423 : &rbio->work);
1424 6 : }
1425 :
1426 0 : static void async_read_rebuild(struct btrfs_raid_bio *rbio)
1427 : {
1428 0 : btrfs_init_work(&rbio->work, btrfs_rmw_helper,
1429 : read_rebuild_work, NULL, NULL);
1430 :
1431 0 : btrfs_queue_work(rbio->fs_info->rmw_workers,
1432 : &rbio->work);
1433 0 : }
1434 :
1435 : /*
1436 : * the stripe must be locked by the caller. It will
1437 : * unlock after all the writes are done
1438 : */
1439 190 : static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1440 : {
1441 : int bios_to_read = 0;
1442 6 : struct btrfs_bio *bbio = rbio->bbio;
1443 : struct bio_list bio_list;
1444 : int ret;
1445 6 : int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1446 : int pagenr;
1447 : int stripe;
1448 : struct bio *bio;
1449 :
1450 : bio_list_init(&bio_list);
1451 :
1452 6 : ret = alloc_rbio_pages(rbio);
1453 6 : if (ret)
1454 : goto cleanup;
1455 :
1456 6 : index_rbio_pages(rbio);
1457 :
1458 6 : atomic_set(&rbio->bbio->error, 0);
1459 : /*
1460 : * build a list of bios to read all the missing parts of this
1461 : * stripe
1462 : */
1463 21 : for (stripe = 0; stripe < rbio->nr_data; stripe++) {
1464 240 : for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1465 : struct page *page;
1466 : /*
1467 : * we want to find all the pages missing from
1468 : * the rbio and read them from the disk. If
1469 : * page_in_rbio finds a page in the bio list
1470 : * we don't need to read it off the stripe.
1471 : */
1472 240 : page = page_in_rbio(rbio, stripe, pagenr, 1);
1473 240 : if (page)
1474 56 : continue;
1475 :
1476 : page = rbio_stripe_page(rbio, stripe, pagenr);
1477 : /*
1478 : * the bio cache may have handed us an uptodate
1479 : * page. If so, be happy and use it
1480 : */
1481 184 : if (PageUptodate(page))
1482 60 : continue;
1483 :
1484 124 : ret = rbio_add_io_page(rbio, &bio_list, page,
1485 124 : stripe, pagenr, rbio->stripe_len);
1486 124 : if (ret)
1487 : goto cleanup;
1488 : }
1489 : }
1490 :
1491 12 : bios_to_read = bio_list_size(&bio_list);
1492 6 : if (!bios_to_read) {
1493 : /*
1494 : * this can happen if others have merged with
1495 : * us, it means there is nothing left to read.
1496 : * But if there are missing devices it may not be
1497 : * safe to do the full stripe write yet.
1498 : */
1499 : goto finish;
1500 : }
1501 :
1502 : /*
1503 : * the bbio may be freed once we submit the last bio. Make sure
1504 : * not to touch it after that
1505 : */
1506 : atomic_set(&bbio->stripes_pending, bios_to_read);
1507 : while (1) {
1508 : bio = bio_list_pop(&bio_list);
1509 14 : if (!bio)
1510 : break;
1511 :
1512 10 : bio->bi_private = rbio;
1513 10 : bio->bi_end_io = raid_rmw_end_io;
1514 :
1515 10 : btrfs_bio_wq_end_io(rbio->fs_info, bio,
1516 : BTRFS_WQ_ENDIO_RAID56);
1517 :
1518 10 : BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
1519 10 : submit_bio(READ, bio);
1520 10 : }
1521 : /* the actual write will happen once the reads are done */
1522 : return 0;
1523 :
1524 : cleanup:
1525 0 : rbio_orig_end_io(rbio, -EIO, 0);
1526 0 : return -EIO;
1527 :
1528 : finish:
1529 2 : validate_rbio_for_rmw(rbio);
1530 2 : return 0;
1531 : }
1532 :
1533 : /*
1534 : * if the upper layers pass in a full stripe, we thank them by only allocating
1535 : * enough pages to hold the parity, and sending it all down quickly.
1536 : */
1537 32 : static int full_stripe_write(struct btrfs_raid_bio *rbio)
1538 : {
1539 : int ret;
1540 :
1541 32 : ret = alloc_rbio_parity_pages(rbio);
1542 32 : if (ret) {
1543 0 : __free_raid_bio(rbio);
1544 0 : return ret;
1545 : }
1546 :
1547 32 : ret = lock_stripe_add(rbio);
1548 32 : if (ret == 0)
1549 32 : finish_rmw(rbio);
1550 : return 0;
1551 : }
1552 :
1553 : /*
1554 : * partial stripe writes get handed over to async helpers.
1555 : * We're really hoping to merge a few more writes into this
1556 : * rbio before calculating new parity
1557 : */
1558 6 : static int partial_stripe_write(struct btrfs_raid_bio *rbio)
1559 : {
1560 : int ret;
1561 :
1562 6 : ret = lock_stripe_add(rbio);
1563 6 : if (ret == 0)
1564 6 : async_rmw_stripe(rbio);
1565 6 : return 0;
1566 : }
1567 :
1568 : /*
1569 : * sometimes while we were reading from the drive to
1570 : * recalculate parity, enough new bios come in to create
1571 : * a full stripe. So we do a check here to see if we can
1572 : * go directly to finish_rmw
1573 : */
1574 6 : static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
1575 : {
1576 : /* head off into rmw land if we don't have a full stripe */
1577 6 : if (!rbio_is_full(rbio))
1578 6 : return partial_stripe_write(rbio);
1579 0 : return full_stripe_write(rbio);
1580 : }
1581 :
1582 : /*
1583 : * We use plugging callbacks to collect full stripes.
1584 : * Any time we get a partial stripe write while plugged
1585 : * we collect it into a list. When the unplug comes down,
1586 : * we sort the list by logical block number and merge
1587 : * everything we can into the same rbios
1588 : */
1589 : struct btrfs_plug_cb {
1590 : struct blk_plug_cb cb;
1591 : struct btrfs_fs_info *info;
1592 : struct list_head rbio_list;
1593 : struct btrfs_work work;
1594 : };
1595 :
1596 : /*
1597 : * rbios on the plug list are sorted for easier merging.
1598 : */
1599 2 : static int plug_cmp(void *priv, struct list_head *a, struct list_head *b)
1600 : {
1601 : struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
1602 : plug_list);
1603 : struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
1604 : plug_list);
1605 2 : u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
1606 2 : u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
1607 :
1608 2 : if (a_sector < b_sector)
1609 : return -1;
1610 0 : if (a_sector > b_sector)
1611 : return 1;
1612 0 : return 0;
1613 : }
1614 :
1615 4 : static void run_plug(struct btrfs_plug_cb *plug)
1616 : {
1617 : struct btrfs_raid_bio *cur;
1618 : struct btrfs_raid_bio *last = NULL;
1619 :
1620 : /*
1621 : * sort our plug list then try to merge
1622 : * everything we can in hopes of creating full
1623 : * stripes.
1624 : */
1625 4 : list_sort(NULL, &plug->rbio_list, plug_cmp);
1626 14 : while (!list_empty(&plug->rbio_list)) {
1627 6 : cur = list_entry(plug->rbio_list.next,
1628 : struct btrfs_raid_bio, plug_list);
1629 6 : list_del_init(&cur->plug_list);
1630 :
1631 6 : if (rbio_is_full(cur)) {
1632 : /* we have a full stripe, send it down */
1633 0 : full_stripe_write(cur);
1634 0 : continue;
1635 : }
1636 6 : if (last) {
1637 2 : if (rbio_can_merge(last, cur)) {
1638 : merge_rbio(last, cur);
1639 0 : __free_raid_bio(cur);
1640 0 : continue;
1641 :
1642 : }
1643 2 : __raid56_parity_write(last);
1644 : }
1645 : last = cur;
1646 : }
1647 4 : if (last) {
1648 4 : __raid56_parity_write(last);
1649 : }
1650 4 : kfree(plug);
1651 4 : }
1652 :
1653 : /*
1654 : * if the unplug comes from schedule, we have to push the
1655 : * work off to a helper thread
1656 : */
1657 0 : static void unplug_work(struct btrfs_work *work)
1658 : {
1659 : struct btrfs_plug_cb *plug;
1660 0 : plug = container_of(work, struct btrfs_plug_cb, work);
1661 0 : run_plug(plug);
1662 0 : }
1663 :
1664 4 : static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1665 : {
1666 : struct btrfs_plug_cb *plug;
1667 : plug = container_of(cb, struct btrfs_plug_cb, cb);
1668 :
1669 4 : if (from_schedule) {
1670 0 : btrfs_init_work(&plug->work, btrfs_rmw_helper,
1671 : unplug_work, NULL, NULL);
1672 0 : btrfs_queue_work(plug->info->rmw_workers,
1673 : &plug->work);
1674 4 : return;
1675 : }
1676 4 : run_plug(plug);
1677 : }
1678 :
1679 : /*
1680 : * our main entry point for writes from the rest of the FS.
1681 : */
1682 38 : int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
1683 : struct btrfs_bio *bbio, u64 *raid_map,
1684 : u64 stripe_len)
1685 : {
1686 : struct btrfs_raid_bio *rbio;
1687 : struct btrfs_plug_cb *plug = NULL;
1688 : struct blk_plug_cb *cb;
1689 :
1690 38 : rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
1691 38 : if (IS_ERR(rbio))
1692 0 : return PTR_ERR(rbio);
1693 : bio_list_add(&rbio->bio_list, bio);
1694 38 : rbio->bio_list_bytes = bio->bi_iter.bi_size;
1695 :
1696 : /*
1697 : * don't plug on full rbios, just get them out the door
1698 : * as quickly as we can
1699 : */
1700 38 : if (rbio_is_full(rbio))
1701 32 : return full_stripe_write(rbio);
1702 :
1703 6 : cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info,
1704 : sizeof(*plug));
1705 6 : if (cb) {
1706 : plug = container_of(cb, struct btrfs_plug_cb, cb);
1707 6 : if (!plug->info) {
1708 4 : plug->info = root->fs_info;
1709 4 : INIT_LIST_HEAD(&plug->rbio_list);
1710 : }
1711 6 : list_add_tail(&rbio->plug_list, &plug->rbio_list);
1712 : } else {
1713 0 : return __raid56_parity_write(rbio);
1714 : }
1715 6 : return 0;
1716 : }
1717 :
1718 : /*
1719 : * all parity reconstruction happens here. We've read in everything
1720 : * we can find from the drives and this does the heavy lifting of
1721 : * sorting the good from the bad.
1722 : */
1723 0 : static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1724 : {
1725 : int pagenr, stripe;
1726 : void **pointers;
1727 : int faila = -1, failb = -1;
1728 0 : int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1729 : struct page *page;
1730 : int err;
1731 : int i;
1732 :
1733 0 : pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *),
1734 : GFP_NOFS);
1735 0 : if (!pointers) {
1736 : err = -ENOMEM;
1737 : goto cleanup_io;
1738 : }
1739 :
1740 0 : faila = rbio->faila;
1741 0 : failb = rbio->failb;
1742 :
1743 0 : if (rbio->read_rebuild) {
1744 : spin_lock_irq(&rbio->bio_list_lock);
1745 : set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1746 : spin_unlock_irq(&rbio->bio_list_lock);
1747 : }
1748 :
1749 0 : index_rbio_pages(rbio);
1750 :
1751 0 : for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1752 : /* set up our array of pointers with pages
1753 : * from each stripe
1754 : */
1755 0 : for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
1756 : /*
1757 : * if we're rebuilding a read, we have to use
1758 : * pages from the bio list
1759 : */
1760 0 : if (rbio->read_rebuild &&
1761 0 : (stripe == faila || stripe == failb)) {
1762 0 : page = page_in_rbio(rbio, stripe, pagenr, 0);
1763 : } else {
1764 : page = rbio_stripe_page(rbio, stripe, pagenr);
1765 : }
1766 0 : pointers[stripe] = kmap(page);
1767 : }
1768 :
1769 : /* all raid6 handling here */
1770 0 : if (rbio->raid_map[rbio->bbio->num_stripes - 1] ==
1771 : RAID6_Q_STRIPE) {
1772 :
1773 : /*
1774 : * single failure, rebuild from parity raid5
1775 : * style
1776 : */
1777 0 : if (failb < 0) {
1778 0 : if (faila == rbio->nr_data) {
1779 : /*
1780 : * Just the P stripe has failed, without
1781 : * a bad data or Q stripe.
1782 : * TODO, we should redo the xor here.
1783 : */
1784 : err = -EIO;
1785 : goto cleanup;
1786 : }
1787 : /*
1788 : * a single failure in raid6 is rebuilt
1789 : * in the pstripe code below
1790 : */
1791 : goto pstripe;
1792 : }
1793 :
1794 : /* make sure our ps and qs are in order */
1795 0 : if (faila > failb) {
1796 : int tmp = failb;
1797 : failb = faila;
1798 : faila = tmp;
1799 : }
1800 :
1801 : /* if the q stripe has failed, do a pstripe reconstruction
1802 : * from the xors.
1803 : * If both the q stripe and the P stripe have failed, we're
1804 : * here due to a crc mismatch and we can't give them the
1805 : * data they want
1806 : */
1807 0 : if (rbio->raid_map[failb] == RAID6_Q_STRIPE) {
1808 0 : if (rbio->raid_map[faila] == RAID5_P_STRIPE) {
1809 : err = -EIO;
1810 : goto cleanup;
1811 : }
1812 : /*
1813 : * otherwise we have one bad data stripe and
1814 : * a good P stripe. raid5!
1815 : */
1816 : goto pstripe;
1817 : }
1818 :
1819 0 : if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
1820 0 : raid6_datap_recov(rbio->bbio->num_stripes,
1821 : PAGE_SIZE, faila, pointers);
1822 : } else {
1823 0 : raid6_2data_recov(rbio->bbio->num_stripes,
1824 : PAGE_SIZE, faila, failb,
1825 : pointers);
1826 : }
1827 : } else {
1828 : void *p;
1829 :
1830 : /* rebuild from P stripe here (raid5 or raid6) */
1831 0 : BUG_ON(failb != -1);
1832 : pstripe:
1833 : /* Copy parity block into failed block to start with */
1834 0 : memcpy(pointers[faila],
1835 0 : pointers[rbio->nr_data],
1836 : PAGE_CACHE_SIZE);
1837 :
1838 : /* rearrange the pointer array */
1839 0 : p = pointers[faila];
1840 0 : for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
1841 0 : pointers[stripe] = pointers[stripe + 1];
1842 0 : pointers[rbio->nr_data - 1] = p;
1843 :
1844 : /* xor in the rest */
1845 0 : run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE);
1846 : }
1847 : /* if we're doing this rebuild as part of an rmw, go through
1848 : * and set all of our private rbio pages in the
1849 : * failed stripes as uptodate. This way finish_rmw will
1850 : * know they can be trusted. If this was a read reconstruction,
1851 : * other endio functions will fiddle the uptodate bits
1852 : */
1853 0 : if (!rbio->read_rebuild) {
1854 0 : for (i = 0; i < nr_pages; i++) {
1855 0 : if (faila != -1) {
1856 : page = rbio_stripe_page(rbio, faila, i);
1857 : SetPageUptodate(page);
1858 : }
1859 0 : if (failb != -1) {
1860 : page = rbio_stripe_page(rbio, failb, i);
1861 : SetPageUptodate(page);
1862 : }
1863 : }
1864 : }
1865 0 : for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
1866 : /*
1867 : * if we're rebuilding a read, we have to use
1868 : * pages from the bio list
1869 : */
1870 0 : if (rbio->read_rebuild &&
1871 0 : (stripe == faila || stripe == failb)) {
1872 0 : page = page_in_rbio(rbio, stripe, pagenr, 0);
1873 : } else {
1874 : page = rbio_stripe_page(rbio, stripe, pagenr);
1875 : }
1876 : kunmap(page);
1877 : }
1878 : }
1879 :
1880 : err = 0;
1881 : cleanup:
1882 0 : kfree(pointers);
1883 :
1884 : cleanup_io:
1885 :
1886 0 : if (rbio->read_rebuild) {
1887 0 : if (err == 0)
1888 0 : cache_rbio_pages(rbio);
1889 : else
1890 : clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
1891 :
1892 0 : rbio_orig_end_io(rbio, err, err == 0);
1893 0 : } else if (err == 0) {
1894 0 : rbio->faila = -1;
1895 0 : rbio->failb = -1;
1896 0 : finish_rmw(rbio);
1897 : } else {
1898 0 : rbio_orig_end_io(rbio, err, 0);
1899 : }
1900 0 : }
1901 :
1902 : /*
1903 : * This is called only for stripes we've read from disk to
1904 : * reconstruct the parity.
1905 : */
1906 0 : static void raid_recover_end_io(struct bio *bio, int err)
1907 : {
1908 0 : struct btrfs_raid_bio *rbio = bio->bi_private;
1909 :
1910 : /*
1911 : * we only read stripe pages off the disk, set them
1912 : * up to date if there were no errors
1913 : */
1914 0 : if (err)
1915 0 : fail_bio_stripe(rbio, bio);
1916 : else
1917 : set_bio_pages_uptodate(bio);
1918 0 : bio_put(bio);
1919 :
1920 0 : if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
1921 0 : return;
1922 :
1923 0 : if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
1924 0 : rbio_orig_end_io(rbio, -EIO, 0);
1925 : else
1926 0 : __raid_recover_end_io(rbio);
1927 : }
1928 :
1929 : /*
1930 : * reads everything we need off the disk to reconstruct
1931 : * the parity. endio handlers trigger final reconstruction
1932 : * when the IO is done.
1933 : *
1934 : * This is used both for reads from the higher layers and for
1935 : * parity construction required to finish an rmw cycle.
1936 : */
1937 0 : static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
1938 : {
1939 : int bios_to_read = 0;
1940 0 : struct btrfs_bio *bbio = rbio->bbio;
1941 : struct bio_list bio_list;
1942 : int ret;
1943 0 : int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1944 : int pagenr;
1945 : int stripe;
1946 : struct bio *bio;
1947 :
1948 : bio_list_init(&bio_list);
1949 :
1950 0 : ret = alloc_rbio_pages(rbio);
1951 0 : if (ret)
1952 : goto cleanup;
1953 :
1954 0 : atomic_set(&rbio->bbio->error, 0);
1955 :
1956 : /*
1957 : * read everything that hasn't failed. Thanks to the
1958 : * stripe cache, it is possible that some or all of these
1959 : * pages are going to be uptodate.
1960 : */
1961 0 : for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
1962 0 : if (rbio->faila == stripe || rbio->failb == stripe) {
1963 0 : atomic_inc(&rbio->bbio->error);
1964 0 : continue;
1965 : }
1966 :
1967 0 : for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1968 : struct page *p;
1969 :
1970 : /*
1971 : * the rmw code may have already read this
1972 : * page in
1973 : */
1974 : p = rbio_stripe_page(rbio, stripe, pagenr);
1975 0 : if (PageUptodate(p))
1976 0 : continue;
1977 :
1978 0 : ret = rbio_add_io_page(rbio, &bio_list,
1979 : rbio_stripe_page(rbio, stripe, pagenr),
1980 0 : stripe, pagenr, rbio->stripe_len);
1981 0 : if (ret < 0)
1982 : goto cleanup;
1983 : }
1984 : }
1985 :
1986 0 : bios_to_read = bio_list_size(&bio_list);
1987 0 : if (!bios_to_read) {
1988 : /*
1989 : * we might have no bios to read just because the pages
1990 : * were up to date, or we might have no bios to read because
1991 : * the devices were gone.
1992 : */
1993 0 : if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) {
1994 0 : __raid_recover_end_io(rbio);
1995 0 : goto out;
1996 : } else {
1997 : goto cleanup;
1998 : }
1999 : }
2000 :
2001 : /*
2002 : * the bbio may be freed once we submit the last bio. Make sure
2003 : * not to touch it after that
2004 : */
2005 : atomic_set(&bbio->stripes_pending, bios_to_read);
2006 : while (1) {
2007 : bio = bio_list_pop(&bio_list);
2008 0 : if (!bio)
2009 : break;
2010 :
2011 0 : bio->bi_private = rbio;
2012 0 : bio->bi_end_io = raid_recover_end_io;
2013 :
2014 0 : btrfs_bio_wq_end_io(rbio->fs_info, bio,
2015 : BTRFS_WQ_ENDIO_RAID56);
2016 :
2017 0 : BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
2018 0 : submit_bio(READ, bio);
2019 0 : }
2020 : out:
2021 : return 0;
2022 :
2023 : cleanup:
2024 0 : if (rbio->read_rebuild)
2025 0 : rbio_orig_end_io(rbio, -EIO, 0);
2026 : return -EIO;
2027 : }
2028 :
2029 : /*
2030 : * the main entry point for reads from the higher layers. This
2031 : * is really only called when the normal read path had a failure,
2032 : * so we assume the bio they send down corresponds to a failed part
2033 : * of the drive.
2034 : */
2035 0 : int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
2036 : struct btrfs_bio *bbio, u64 *raid_map,
2037 : u64 stripe_len, int mirror_num)
2038 : {
2039 0 : struct btrfs_raid_bio *rbio;
2040 : int ret;
2041 :
2042 0 : rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
2043 0 : if (IS_ERR(rbio))
2044 0 : return PTR_ERR(rbio);
2045 :
2046 0 : rbio->read_rebuild = 1;
2047 : bio_list_add(&rbio->bio_list, bio);
2048 0 : rbio->bio_list_bytes = bio->bi_iter.bi_size;
2049 :
2050 0 : rbio->faila = find_logical_bio_stripe(rbio, bio);
2051 0 : if (rbio->faila == -1) {
2052 0 : BUG();
2053 : kfree(raid_map);
2054 : kfree(bbio);
2055 : kfree(rbio);
2056 : return -EIO;
2057 : }
2058 :
2059 : /*
2060 : * reconstruct from the q stripe if they are
2061 : * asking for mirror 3
2062 : */
2063 0 : if (mirror_num == 3)
2064 0 : rbio->failb = bbio->num_stripes - 2;
2065 :
2066 0 : ret = lock_stripe_add(rbio);
2067 :
2068 : /*
2069 : * __raid56_parity_recover will end the bio with
2070 : * any errors it hits. We don't want to return
2071 : * its error value up the stack because our caller
2072 : * will end up calling bio_endio with any nonzero
2073 : * return
2074 : */
2075 0 : if (ret == 0)
2076 0 : __raid56_parity_recover(rbio);
2077 : /*
2078 : * our rbio has been added to the list of
2079 : * rbios that will be handled after the
2080 : * currently lock owner is done
2081 : */
2082 : return 0;
2083 :
2084 : }
2085 :
2086 6 : static void rmw_work(struct btrfs_work *work)
2087 : {
2088 : struct btrfs_raid_bio *rbio;
2089 :
2090 6 : rbio = container_of(work, struct btrfs_raid_bio, work);
2091 6 : raid56_rmw_stripe(rbio);
2092 6 : }
2093 :
2094 0 : static void read_rebuild_work(struct btrfs_work *work)
2095 : {
2096 : struct btrfs_raid_bio *rbio;
2097 :
2098 0 : rbio = container_of(work, struct btrfs_raid_bio, work);
2099 0 : __raid56_parity_recover(rbio);
2100 0 : }
|